# Load data

In [2]:
# load train.csv and test.csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

df = pd.read_csv("data/train.csv")
X = df.drop(['loan_status', 'id'], axis=1)
y = df['loan_status']

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


categorical_features = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file', 'loan_grade']
numerical_features = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income','cb_person_cred_hist_length' ]
def create_preprocessing_pipeline(categorical_features, numerical_features):
    # Numerical pipeline
    num_pipeline = Pipeline([
        ('scaler', StandardScaler())
    ])

    # Categorical pipeline
    cat_pipeline = Pipeline([
        ('onehot', OneHotEncoder(drop='first', sparse_output=False))
    ])

    # Combine pipelines
    preprocessor = ColumnTransformer([
        ('num', num_pipeline, numerical_features),
        ('cat', cat_pipeline, categorical_features)
    ])

    return preprocessor

preprocessor = create_preprocessing_pipeline(categorical_features, numerical_features)

# Tune individual classifiers

In [11]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score, make_scorer

def finetune(model_name, ModelClass, params_dict, X_train, y_train, n_iter=10, cv=10, n_jobs=-1):
    """
    Fine-tune a model using RandomizedSearchCV with ROC AUC scoring.
    
    Args:
    model_name (str): Name of the model for logging purposes.
    model_class: The class of the model to be fine-tuned.
    params_dict (dict): Dictionary mapping parameter names to their possible values.
    X_train: Training features.
    y_train: Training labels.
    n_iter (int): Number of parameter settings sampled. Default is 10.
    cv (int): Number of cross-validation folds. Default is 5.
    n_jobs (int): Number of jobs to run in parallel. Default is -1 (use all processors).
    
    Returns:
    best_model: The best model found by RandomizedSearchCV.
    best_params (dict): The best parameters found.
    best_score (float): The best ROC AUC score achieved.
    """
    
    # Create the model instance
    model = ModelClass()
    pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])
    
    # Create ROC AUC scorer
    roc_auc_scorer = make_scorer(roc_auc_score)
    
    # Set up RandomizedSearchCV
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=params_dict,
        n_iter=n_iter,
        cv=cv,
        scoring=roc_auc_scorer,
        n_jobs=n_jobs,
        random_state=42,
        verbose=1
    )
    
    # Fit RandomizedSearchCV
    random_search.fit(X_train, y_train)
    
    # Get the best model, parameters, and score
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_
    best_score = random_search.best_score_
    
    print(f"Best parameters for {model_name}:")
    for param, value in best_params.items():
        print(f"{param}: {value}")
    print(f"Best ROC AUC score: {best_score}")
    
    return best_model, best_params, best_score

In [25]:
y.value_counts()

loan_status
0    50295
1     8350
Name: count, dtype: int64

In [31]:
from sklearn.ensemble import RandomForestClassifier

cls_random_forest, cls_random_forest_params, cls_random_forest_score = finetune(
    model_name='Random Forest',
    ModelClass=RandomForestClassifier,
    params_dict={
        'model__n_estimators': [100, 200, 300, 500, 1000],
        'model__max_depth': [None, 10, 20, 30, 40, 50],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
        'model__class_weight': [None, 'balanced', 'balanced_subsample'],
        # 'model__max_features': ['sqrt', 'log2', None],
        # 'model__bootstrap': [True, False],
        # 'model__criterion': ['gini', 'entropy'],
        # 'model__max_leaf_nodes': [None, 50, 100, 200]
    },
    X_train=X_train,
    y_train=y_train,
)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Best parameters for Random Forest:
model__n_estimators: 300
model__min_samples_split: 5
model__min_samples_leaf: 2
model__max_depth: 50
model__class_weight: None
Best ROC AUC score: 0.9498465800267258


Best score: 0.9498

Best params:
- model__n_estimators: 300
- model__min_samples_split: 5
- model__min_samples_leaf: 2
- model__max_depth: 50
- model__class_weight: None

In [32]:
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint, uniform

cls_decision_tree, cls_decision_tree_params, cls_decision_tree_score = finetune(
    model_name='Decision Tree',
    ModelClass=DecisionTreeClassifier,
    params_dict={
        'model__criterion': ['gini', 'entropy'],
        'model__splitter': ['best', 'random'],
        'model__max_depth': [None, 10, 20, 30, 40, 50],
        'model__min_samples_split': randint(2, 11),
        'model__min_samples_leaf': randint(1, 5),
        'model__class_weight': [None, 'balanced'],
        # 'model__max_features': ['sqrt', 'log2', None],
        # 'model__max_leaf_nodes': [None, 50, 100, 200],
        # 'model__min_impurity_decrease': uniform(0, 0.1),
        # 'model__ccp_alpha': uniform(0, 0.05)
    },
    X_train=X_train,
    y_train=y_train,
)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best parameters for Decision Tree:
model__class_weight: None
model__criterion: gini
model__max_depth: 10
model__min_samples_leaf: 3
model__min_samples_split: 8
model__splitter: best
Best ROC AUC score: 0.9455623472638015


Best score: 0.9455

Best params:
```
model__class_weight: None
model__criterion: gini
model__max_depth: 10
model__min_samples_leaf: 3
model__min_samples_split: 8
model__splitter: best
```

In [33]:
from sklearn.ensemble import GradientBoostingClassifier

cls_gradient_boosting, cls_gradient_boosting_params, cls_gradient_boosting_score = finetune(
    model_name='Gradient Boosting',
    ModelClass=GradientBoostingClassifier,
    params_dict={
        'model__n_estimators': randint(100, 1000),
        'model__learning_rate': uniform(0.01, 0.3),
        'model__max_depth': randint(3, 10),
        'model__min_samples_split': randint(2, 20),
        'model__min_samples_leaf': randint(1, 10),
        'model__subsample': uniform(0.5, 0.5),  # This results in a range from 0.5 to 1.0
        # 'model__max_features': ['sqrt', 'log2', None],
        # 'model__loss': ['deviance', 'exponential'],
        # 'model__criterion': ['friedman_mse', 'mse', 'mae'],
        # 'model__n_iter_no_change': [None, 5, 10, 20],
        # 'model__validation_fraction': uniform(0.1, 0.2),  # This results in a range from 0.1 to 0.3
        # 'model__tol': uniform(1e-5, 1e-3)
    },
    X_train=X_train,
    y_train=y_train,
)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best parameters for Gradient Boosting:
model__learning_rate: 0.027425083650459835
model__max_depth: 7
model__min_samples_leaf: 4
model__min_samples_split: 9
model__n_estimators: 763
model__subsample: 0.8254442364744264
Best ROC AUC score: 0.9519993619313833


```
Best parameters for Gradient Boosting:
model__learning_rate: 0.027425083650459835
model__max_depth: 7
model__min_samples_leaf: 4
model__min_samples_split: 9
model__n_estimators: 763
model__subsample: 0.8254442364744264
Best ROC AUC score: 0.951999361931383
```

In [34]:
from xgboost import XGBClassifier

cls_xgboost, cls_xgboost_params, cls_xgboost_score = finetune(
    model_name='XGBoost',
    ModelClass=XGBClassifier,
    params_dict={
        'model__n_estimators': randint(100, 1000),
        'model__max_depth': randint(3, 10),
        'model__learning_rate': uniform(0.01, 0.3),
        'model__subsample': uniform(0.5, 0.5),  # This results in a range from 0.5 to 1.0
        'model__colsample_bytree': uniform(0.5, 0.5),
        'model__gamma': uniform(0, 5),
        # 'model__min_child_weight': randint(1, 10),
        # 'model__reg_alpha': [0, 0.1, 1, 10, 100],
        # 'model__reg_lambda': [0, 0.1, 1, 10, 100],
        # 'model__scale_pos_weight': uniform(0.5, 10.5),  # Adjust based on class imbalance
        # 'model__max_delta_step': randint(0, 10),
        # 'model__tree_method': ['auto', 'exact', 'approx', 'hist'],
        # 'model__booster': ['gbtree', 'gblinear', 'dart'],
        # 'model__objective': ['binary:logistic'],
        # 'model__eval_metric': ['auc', 'logloss']
    },
    X_train=X_train,
    y_train=y_train,
)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best parameters for XGBoost:
model__colsample_bytree: 0.6872700594236812
model__gamma: 4.75357153204958
model__learning_rate: 0.22959818254342154
model__max_depth: 7
model__n_estimators: 120
model__subsample: 0.5780093202212182
Best ROC AUC score: 0.9517436210857417


```
model__colsample_bytree: 0.6872700594236812
model__gamma: 4.75357153204958
model__learning_rate: 0.22959818254342154
model__max_depth: 7
model__n_estimators: 120
model__subsample: 0.5780093202212182
Best ROC AUC score: 0.9517436210857417
```

In [35]:
from lightgbm import LGBMClassifier

cls_lightgbm, cls_lightgbm_params, cls_lightgbm_score = finetune(
    model_name='LightGBM',
    ModelClass=LGBMClassifier,
    params_dict={
        'model__num_leaves': randint(20, 3000),
        'model__n_estimators': randint(100, 1000),
        'model__learning_rate': uniform(0.01, 0.3),
        'model__max_depth': randint(-1, 20),  # -1 means no limit
        'model__min_child_samples': randint(1, 50),
        'model__subsample': uniform(0.5, 0.5),  # This results in a range from 0.5 to 1.0
        'model__colsample_bytree': uniform(0.5, 0.5),
        # 'model__reg_alpha': uniform(0, 2),
        # 'model__reg_lambda': uniform(0, 2),
        # 'model__min_split_gain': uniform(0, 1),
        # 'model__subsample_freq': randint(0, 10),
        # 'model__boosting_type': ['gbdt', 'dart', 'goss'],
        # 'model__objective': ['binary'],
        # 'model__metric': ['auc', 'binary_logloss'],
        # 'model__feature_fraction': uniform(0.5, 0.5),
        # 'model__bagging_fraction': uniform(0.5, 0.5),
        # 'model__bagging_freq': randint(0, 10),
        # 'model__max_bin': randint(200, 300)
    },
    X_train=X_train,
    y_train=y_train,
)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[LightGBM] [Info] Number of positive: 6038, number of negative: 36187
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002782 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 909
[LightGBM] [Info] Number of data points in the train set: 42225, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142996 -> initscore=-1.790627
[LightGBM] [Info] Start training from score -1.790627
[LightGBM] [Info] Number of positive: 6037, number of negative: 36187
[LightGBM] [Info] Number of positive: 6037, number of negative: 36187
[LightGBM] [Info] Number of positive: 6037, number of negative: 36187
[LightGBM] [Info] Number of positive: 6037, number of negative: 36188
[LightGBM] [Info] Number of positive: 6037, number of negative: 36187
[LightGBM] [Info] Auto-choosing

```
model__colsample_bytree: 0.569746930326021
model__learning_rate: 0.09764339456056544
model__max_depth: 14
model__min_child_samples: 15
model__n_estimators: 289
model__num_leaves: 2754
model__subsample: 0.8091930046665436
Best ROC AUC score: 0.9501450102252722
```

In [36]:
from catboost import CatBoostClassifier

cls_catboost, cls_catboost_params, cls_catboost_score = finetune(
    model_name='CatBoost',
    ModelClass=CatBoostClassifier,
    params_dict={
        'model__iterations': randint(100, 1000),
        'model__depth': randint(4, 10),
        'model__learning_rate': uniform(0.01, 0.3),
        'model__l2_leaf_reg': uniform(1, 10),
        'model__border_count': randint(32, 255),
        'model__bagging_temperature': uniform(0, 1),
        # 'model__random_strength': uniform(0, 1),
        # 'model__grow_policy': ['SymmetricTree', 'Depthwise', 'Lossguide'],
        # 'model__min_data_in_leaf': randint(1, 20),
        # 'model__max_leaves': randint(10, 64),
        # 'model__rsm': uniform(0.1, 0.9),
        # 'model__leaf_estimation_method': ['Newton', 'Gradient'],
        # 'model__boosting_type': ['Ordered', 'Plain'],
        # 'model__bootstrap_type': ['Bayesian', 'Bernoulli', 'MVS'],
        # 'model__subsample': uniform(0.5, 0.5),  # This results in a range from 0.5 to 1.0
        # 'model__scale_pos_weight': uniform(0.1, 1),  # Adjust based on class imbalance
        # 'model__eval_metric': ['AUC', 'Logloss'],
        # 'model__od_type': ['IncToDec', 'Iter'],
        # 'model__od_wait': randint(10, 50)
    },
    X_train=X_train,
    y_train=y_train,
)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
0:	learn: 0.6211609	total: 156ms	remaining: 26.5s
0:	learn: 0.6209408	total: 154ms	remaining: 26.2s
0:	learn: 0.6214088	total: 164ms	remaining: 27.9s
0:	learn: 0.6256719	total: 177ms	remaining: 33s
0:	learn: 0.6260482	total: 178ms	remaining: 33.1s
0:	learn: 0.6208540	total: 179ms	remaining: 30.4s
0:	learn: 0.6208139	total: 179ms	remaining: 30.5s
0:	learn: 0.6206656	total: 183ms	remaining: 31.1s
0:	learn: 0.6209605	total: 183ms	remaining: 31.1s
0:	learn: 0.6210182	total: 184ms	remaining: 31.3s
0:	learn: 0.6207879	total: 187ms	remaining: 31.8s
0:	learn: 0.6209260	total: 189ms	remaining: 32.1s
1:	learn: 0.5582344	total: 198ms	remaining: 16.8s
1:	learn: 0.5585740	total: 217ms	remaining: 18.3s
1:	learn: 0.5612090	total: 215ms	remaining: 18.2s
1:	learn: 0.5691031	total: 220ms	remaining: 20.4s
1:	learn: 0.5681481	total: 230ms	remaining: 21.2s
1:	learn: 0.5604803	total: 230ms	remaining: 19.4s
1:	learn: 0.5576922	total: 229ms	remain

```
Best parameters for CatBoost:
model__bagging_temperature: 0.023062425041415757
model__border_count: 250
model__depth: 6
model__iterations: 610
model__l2_leaf_reg: 2.3949386065204186
model__learning_rate: 0.09764339456056544
Best ROC AUC score: 0.9525535193455099
```

# Create ensemble

In [49]:
def to_params(params_dict):
    # remove model__ from params_dict key
    return {k.replace('model__', ''): v for k, v in params_dict.items()}

to_params(cls_catboost_params)

{'bagging_temperature': 0.023062425041415757,
 'border_count': 250,
 'depth': 6,
 'iterations': 610,
 'l2_leaf_reg': 2.3949386065204186,
 'learning_rate': 0.09764339456056544}

In [50]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

models = [
    ('Random Forest', RandomForestClassifier(**to_params(cls_random_forest_params))),
    ('Gradient Boosting', GradientBoostingClassifier(**to_params(cls_gradient_boosting_params))),
    ('Decision Tree', DecisionTreeClassifier(**to_params(cls_decision_tree_params))),
    ('XGBoost', XGBClassifier(**to_params(cls_xgboost_params))),
    ('LightGBM', LGBMClassifier(**to_params(cls_lightgbm_params))),
    ('CatBoost', CatBoostClassifier(**to_params(cls_catboost_params))),
]

pipes = {
    name: Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    for name, model in models
}

voting_clf = VotingClassifier(
    estimators=[(name, pipe) for name, pipe in pipes.items()],
    voting='soft'
)

# Evaluate performance of ensemble vs. individual classifiers

In [51]:
voting_clf.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 6708, number of negative: 40208
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000200 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 906
[LightGBM] [Info] Number of data points in the train set: 46916, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142979 -> initscore=-1.790765
[LightGBM] [Info] Start training from score -1.790765
0:	learn: 0.5738795	total: 82.4ms	remaining: 50.2s
1:	learn: 0.4849286	total: 104ms	remaining: 31.6s
2:	learn: 0.4169875	total: 119ms	remaining: 24s
3:	learn: 0.3658051	total: 136ms	remaining: 20.5s
4:	learn: 0.3267156	total: 158ms	remaining: 19.1s
5:	learn: 0.2983623	total: 171ms	remaining: 17.2s
6:	learn: 0.2774114	total: 185ms	remaining: 15.9s
7:	learn: 0.2607523	total: 209ms	remaining: 15.7s
8:	learn: 0.2475694	total: 229ms	remaining: 1

In [103]:
from sklearn.metrics import roc_auc_score

def calulate_score(model, X_test, y_test):
    y_pred_proba = model.predict_proba(X_test)
    roc_auc = roc_auc_score(y_test, y_pred_proba[:, 1])
    return roc_auc

In [104]:
import re
result = {}
voting_clf_score = calulate_score(voting_clf, X_test, y_test)
result['Voting Classifier'] = voting_clf_score

for cls in voting_clf.estimators_:
    score = calulate_score(cls, X_test, y_test)
    cls_name = str(cls.steps[1][1])
    pattern = r'(\w+)\([^)]*\)'
    cls_name = re.sub(pattern, r'\1', cls_name)
    result[cls_name] = score

In [105]:
for k, v in result.items():
    print(f'{k:<50} {v:.4f}')

Voting Classifier                                  0.9574
RandomForestClassifier                             0.9418
GradientBoostingClassifier                         0.9594
DecisionTreeClassifier                             0.9226
XGBClassifier                                      0.9560
LGBMClassifier                                     0.9504
<catboost.core.CatBoostClassifier object at 0x130984da0> 0.9562


In [107]:
train_acc = calulate_score(voting_clf, X_train, y_train)
test_acc = calulate_score(voting_clf, X_test, y_test)

print(f'Train ROC AUC: {train_acc:.4f}')
print(f'Test ROC AUC: {test_acc:.4f}')

Train ROC AUC: 0.9983
Test ROC AUC: 0.9574


In [109]:
def submit(model, posfix):
    test_df = pd.read_csv("data/test.csv")
    y_pred = model.predict(test_df)
    submission = pd.DataFrame({
        'id': test_df['id'],
        'loan_status': y_pred
    })
    submission.to_csv(f'data/submission-{posfix}.csv', index=False)
    print(f"Predictions saved to submission-{posfix}.csv")
    return submission

In [110]:
submit(voting_clf, 'voting')

Predictions saved to submission-voting.csv


Unnamed: 0,id,loan_status
0,58645,1
1,58646,0
2,58647,1
3,58648,0
4,58649,0
...,...,...
39093,97738,0
39094,97739,0
39095,97740,0
39096,97741,0
