# Model building / selection

We will now look at building/selecting our models, as we have a classifcation task (predicting 0 or 1) we will compare a verity of classifcation algorithms like
* Random Forest
* Gradient boosting machines
* Adaboost 
* Logistic regression

and so on.

As we are interested in roc_auc scores we will predict probabilities from our models.

In [58]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, accuracy_score
import xgboost as xgb
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
import warnings 
warnings.filterwarnings("ignore")

DATA_DIR = "../Data/"

In [59]:
train_data = pd.read_csv(DATA_DIR + "processed_train.csv")
test_data = pd.read_csv(DATA_DIR + "processed_test.csv")

We will now make a function for easily testing different models.

In [21]:
def train_and_test_model(train_data : pd.DataFrame,  
                         model, 
                         print : bool= True,
                         **model_paramaters):
    '''
    Function for fitting data to a model using stratified k fold with 10 folds.
    Args:
    train_data (pd.DataFrame)
    test_data (pd.DataFrame)
    model (sklearn style model)
    model_paramaters (dictionary)
    '''
    X_train = train_data.drop("loan_status", axis = 1)
   
    y_train = train_data["loan_status"]


    model.set_params(**model_paramaters)

    strat_k_fold = StratifiedKFold(10, shuffle= True)
    train_auc_scores = []
    val_auc_scores = []

    for fold , (train_ind, val_inx) in enumerate(strat_k_fold.split(X_train, y_train), 1):
        x_train_fold, x_val_fold = X_train.iloc[train_ind], X_train.iloc[val_inx]
        y_train_fold, y_val_fold = y_train.iloc[train_ind], y_train.iloc[val_inx]

        model.fit(x_train_fold, y_train_fold)

        y_train_pred = model.predict_proba(x_train_fold)[:, 1]
        train_auc = roc_auc_score(y_train_fold, y_train_pred)
        train_auc_scores.append(train_auc)

        y_val_pred = model.predict_proba(x_val_fold)[:, 1]
        val_auc = roc_auc_score(y_val_fold, y_val_pred)
        val_auc_scores.append(val_auc)
        if print:
            print(f"Fold {fold}: Train AUC = {train_auc:.4f}, Validation AUC = {val_auc:.4f}")
    mean_train_auc = np.mean(train_auc_scores)
    mean_val_auc = np.mean(val_auc_scores)
    if print:
        print(f"\nMean Train AUC: {mean_train_auc:.4f}")
        print(f"Mean Validation AUC: {mean_val_auc:.4f}")
    return mean_val_auc

We will train our model using 10 stratified Kfolds on our training data, using the mean validation ROC AUC as our model comparison criteria. First we will just test our models with no hyperparamaters tuning to see which one does the best oob, and hyperparamater tune that model from there.

In [40]:
models = [xgb.XGBClassifier(),
          RandomForestClassifier(),
          KNeighborsClassifier(5),
          GradientBoostingClassifier(),
          AdaBoostClassifier(),
          LogisticRegression(),
          BaggingClassifier()
 ]




model_names = []
model_val_roc = []


for model in models:
    val_roc_auc = train_and_test_model(train_data, model, print = False)

    model_names.append(type(model).__name__)
    model_val_roc.append(val_roc_auc)

In [41]:
model_pred = pd.DataFrame({"Model" : model_names, "VAL_ROC" :model_val_roc} )
model_pred.sort_values(by="VAL_ROC", ascending= False)

Unnamed: 0,Model,VAL_ROC
0,XGBClassifier,0.952658
3,GradientBoostingClassifier,0.939588
1,RandomForestClassifier,0.934766
4,AdaBoostClassifier,0.921986
6,BaggingClassifier,0.911697
5,LogisticRegression,0.840676
2,KNeighborsClassifier,0.732367


We see that our XGBclassifer does significantly better OOB then the others do. So we will now tune XGBclassifier from here to achieve the highest auc roc possible.

To do so we will make use of optuna optimisation.

In [45]:
import optuna 

def objective(trial):

    xgboost_params = {
        "objective" : "binary:logistic",
        "eval_metric" : "auc",
        "device" : "cuda",
        "eta" : trial.suggest_loguniform("eta", 1e-4, 1 ),
        "alpha" : trial.suggest_loguniform("alpha", 1e-3, 10),
        "num_leaves" : trial.suggest_categorical("num_leaves", [10,20,30,40,50,70, 100]),
        "learning_rate" : trial.suggest_loguniform("learning_rate", 1e-4, 1),
        "max_depth" : trial.suggest_categorical("max_depth", [5,7,9,11,13]),
        "n_estimators" : trial.suggest_categorical("n_estimators" , [10, 20, 50, 100, 500, 1000])
    }
    model = xgb.XGBClassifier()
    auc = train_and_test_model(train_data, model, print = False, model_paramaters= xgboost_params)
    return auc

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials = 30)




[I 2024-10-15 11:31:43,106] A new study created in memory with name: no-name-7bafef55-570d-457b-8f2b-9fbd5ee361c5
[I 2024-10-15 11:31:45,766] Trial 0 finished with value: 0.9522063704801743 and parameters: {'eta': 0.0004618504768450964, 'alpha': 1.4392104489452895, 'num_leaves': 20, 'learning_rate': 0.006901748808690155, 'max_depth': 11, 'n_estimators': 500}. Best is trial 0 with value: 0.9522063704801743.
[I 2024-10-15 11:31:48,352] Trial 1 finished with value: 0.9521088975805899 and parameters: {'eta': 0.00039400641054660596, 'alpha': 0.07242828146600491, 'num_leaves': 70, 'learning_rate': 0.0004364648913994702, 'max_depth': 7, 'n_estimators': 500}. Best is trial 0 with value: 0.9522063704801743.
[I 2024-10-15 11:31:51,026] Trial 2 finished with value: 0.9524314523578153 and parameters: {'eta': 0.20955332879722616, 'alpha': 0.3572700235694673, 'num_leaves': 40, 'learning_rate': 0.9304422534888797, 'max_depth': 7, 'n_estimators': 100}. Best is trial 2 with value: 0.9524314523578153.
[

We will now train our model with the best paramaters on the whole training data before getting our submission file ready.

In [61]:
study.best_value


best_model = xgb.XGBClassifier(**study.best_params)
best_model.fit(X = train_data.drop("loan_status", axis = 1), y = train_data["loan_status"])
test_predictions = best_model.predict_proba(test_data.drop("id", axis = 1))[:, 1]
test_predictions_df = pd.DataFrame({"id": test_data["id"], "loan_status" : test_predictions})
test_predictions_df.to_csv("../Data/sample_submission.csv", index = False)

The above submission on kaggle got a public score of 0.93248, we can definatly do better then that.

In [71]:
import optuna 
def objective_2(trial):

    xgboost_params = {
        "objective" : "binary:logistic",
        "eval_metric" : "auc",
        "device" : "cuda",
        "eta" : trial.suggest_loguniform("eta", 1e-4, 1 ),
        "alpha" : trial.suggest_loguniform("alpha", 1e-3, 10),
        "num_leaves" : trial.suggest_categorical("num_leaves", [10,20,30,40,50,70, 100]),
        "learning_rate" : trial.suggest_loguniform("learning_rate", 1e-4, 1),
        "max_depth" : trial.suggest_categorical("max_depth", [5,7,9,11,13]),
        "n_estimators" : trial.suggest_categorical("n_estimators" , [10, 20, 50, 100, 500, 1000]),
        "weights" : trial.suggest_uniform("weights", 0, 100)
    }
    model = xgb.XGBClassifier()
    auc = train_and_test_model(train_data, model, print = False, model_paramaters= xgboost_params)
    return auc

study2 = optuna.create_study(direction="maximize")
study2.optimize(objective_2, n_trials = 30)




[I 2024-10-15 11:53:14,659] A new study created in memory with name: no-name-a2a7ddd8-42ec-4d1c-9ba5-eece0959107c
[I 2024-10-15 11:53:17,455] Trial 0 finished with value: 0.9523225184597303 and parameters: {'eta': 0.28877283516086444, 'alpha': 9.867500606929504, 'num_leaves': 70, 'learning_rate': 0.0012986351281651229, 'max_depth': 5, 'n_estimators': 1000, 'weights': 68.06732897247275}. Best is trial 0 with value: 0.9523225184597303.
[I 2024-10-15 11:53:20,244] Trial 1 finished with value: 0.9525493411910044 and parameters: {'eta': 0.0005893223338327801, 'alpha': 6.430164222042314, 'num_leaves': 10, 'learning_rate': 0.5890891879706068, 'max_depth': 7, 'n_estimators': 20, 'weights': 24.13973555072063}. Best is trial 1 with value: 0.9525493411910044.
[I 2024-10-15 11:53:23,189] Trial 2 finished with value: 0.9524872896614406 and parameters: {'eta': 0.001452760636179762, 'alpha': 0.9767012298076453, 'num_leaves': 100, 'learning_rate': 0.0009857325847804957, 'max_depth': 13, 'n_estimators'

In [72]:
study2.best_value


best_model = xgb.XGBClassifier(**study2.best_params)
best_model.fit(X = train_data.drop("loan_status", axis = 1), y = train_data["loan_status"])
test_predictions = best_model.predict_proba(test_data.drop("id", axis = 1))[:, 1]
test_predictions_df = pd.DataFrame({"id": test_data["id"], "loan_status" : test_predictions})
test_predictions_df.to_csv("../Data/sample_submission.csv", index = False)

HOLY SMOKES!!! - Adding weights increased test score to 0.94227 a big jump of 0.01!

In [78]:
def objective_3(trial):

    xgboost_params = {
        "objective" : "binary:logistic",
        "eval_metric" : "auc",
        "device" : "cuda",
        "eta" : trial.suggest_loguniform("eta", 1e-4, 1 ),
        "alpha" : trial.suggest_loguniform("alpha", 1e-3, 10),
        "num_leaves" : trial.suggest_categorical("num_leaves", [50,70, 85, 100]),
        "learning_rate" : trial.suggest_loguniform("learning_rate", 1e-4, 1),
        "max_depth" : trial.suggest_int("max_depth", 10,30),
        "n_estimators" : trial.suggest_categorical("n_estimators" , [ 500, 1000]),
        "weights" : trial.suggest_uniform("weights", 0, 1000),
        "colsample_bytree" : trial.suggest_loguniform("colsample_bytree", 1e-4, 1),
        "scale_pos_weight" : trial.suggest_uniform("scale_pos_weight", 1e-4, 1),
        "lambda" : trial.suggest_loguniform("lambda", 1e-4, 1),
        "jobs" : -1
    }
    model = xgb.XGBClassifier()
    auc = train_and_test_model(train_data, model, print = False, model_paramaters= xgboost_params)
    return auc

study3 = optuna.create_study(direction="maximize")
study3.optimize(objective_3, n_trials = 50)




[I 2024-10-15 12:08:07,072] A new study created in memory with name: no-name-37ab9c27-ba23-4960-b92f-81084cad7650
[I 2024-10-15 12:08:10,672] Trial 0 finished with value: 0.9516286720388608 and parameters: {'eta': 0.007676713687949517, 'alpha': 0.018971192741137115, 'num_leaves': 50, 'learning_rate': 0.0002712515480180054, 'max_depth': 27.645601380919704, 'n_estimators': 1000, 'weights': 24.337160833883043, 'colsample_bytree': 0.0268140109201112, 'scale_pos_weight': 0.04670977514623896, 'lambda': 0.004151514560678005}. Best is trial 0 with value: 0.9516286720388608.
[I 2024-10-15 12:08:13,670] Trial 1 finished with value: 0.9531538575908544 and parameters: {'eta': 0.023038533251847015, 'alpha': 0.36512212524242865, 'num_leaves': 85, 'learning_rate': 0.0029813400105230792, 'max_depth': 11.862879835210059, 'n_estimators': 1000, 'weights': 371.5947515340989, 'colsample_bytree': 0.507303729175897, 'scale_pos_weight': 0.032902059887331706, 'lambda': 0.000174689915234299}. Best is trial 1 wi

In [79]:
study3.best_value


best_model = xgb.XGBClassifier(**study3.best_params)
best_model.fit(X = train_data.drop("loan_status", axis = 1), y = train_data["loan_status"])
test_predictions = best_model.predict_proba(test_data.drop("id", axis = 1))[:, 1]
test_predictions_df = pd.DataFrame({"id": test_data["id"], "loan_status" : test_predictions})
test_predictions_df.to_csv("../Data/sample_submission.csv", index = False)

XGBoostError: Invalid Parameter format for max_depth expect int but value='21.34293461300789'