In [19]:
import pandas as pd
# Load the training split; the path is relative to the notebook's working directory.
df = pd.read_csv("data/train.csv")

In [20]:
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [21]:
# Separate the target column from the model inputs; `id` is dropped from the
# inputs together with the label.
y = df['loan_status']
X = df.drop(columns=['id', 'loan_status'])

In [22]:
# One-hot encode the non-numeric columns of the feature frame.
X = pd.get_dummies(X)

In [23]:
# Inspect the encoded feature matrix.
X.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,...,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_N,cb_person_default_on_file_Y
0,37,35000,0.0,6000,11.49,0.17,14,False,False,False,...,False,False,True,False,False,False,False,False,True,False
1,22,56000,6.0,4000,13.35,0.07,2,False,False,True,...,False,False,False,True,False,False,False,False,True,False
2,29,28800,8.0,6000,8.9,0.21,10,False,False,True,...,False,True,False,False,False,False,False,False,True,False
3,30,70000,14.0,12000,11.11,0.17,5,False,False,False,...,True,False,True,False,False,False,False,False,True,False
4,22,60000,2.0,6000,6.92,0.1,3,False,False,False,...,False,True,False,False,False,False,False,False,True,False


In [24]:
import time
import lightgbm
import optuna
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.datasets import make_classification
import warnings

# Keep the long tuning runs readable: hide Python warnings and Optuna's
# per-trial log lines.
# NOTE: this silences *all* warnings, including ones that may matter.
warnings.filterwarnings("ignore")
optuna.logging.set_verbosity(optuna.logging.FATAL)
# LightGBM output is silenced per model through the 'verbosity': -1 parameter in
# objective_lgb. The former `lightgbm.log_evaluation(False)` call was removed:
# it only constructs a callback object, which was discarded immediately, so it
# never changed any logging behaviour.

def objective_xgb(trial):
    """Optuna objective: mean 5-fold ROC AUC of an XGBoost classifier.

    Draws one hyperparameter configuration from ``trial``, builds an
    ``XGBClassifier`` with it and scores it with ``cross_val_score`` on the
    notebook-level ``X`` / ``y``.

    Args:
        trial: Object providing ``suggest_int`` / ``suggest_float``
            (the trial handed over by ``study.optimize``).

    Returns:
        The mean of the per-fold ``roc_auc`` scores.
    """
    # The draws happen in a fixed order, one statement per hyperparameter.
    max_depth = trial.suggest_int('max_depth', 1, 9)
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1.0)
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 10)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)

    classifier = XGBClassifier(
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        use_label_encoder=False,
        eval_metric='logloss',
    )
    fold_scores = cross_val_score(classifier, X, y, cv=5, scoring='roc_auc')
    return np.mean(fold_scores)

def objective_lgb(trial):
    """Optuna objective: mean 5-fold ROC AUC of a LightGBM classifier.

    Draws one hyperparameter configuration from ``trial``, builds an
    ``LGBMClassifier`` with it (plus ``verbosity=-1``) and scores it with
    ``cross_val_score`` on the notebook-level ``X`` / ``y``.

    Args:
        trial: Object providing ``suggest_int`` / ``suggest_float``
            (the trial handed over by ``study.optimize``).

    Returns:
        The mean of the per-fold ``roc_auc`` scores.
    """
    settings = dict(
        num_leaves=trial.suggest_int('num_leaves', 2, 256),
        learning_rate=trial.suggest_float('learning_rate', 1e-3, 1.0),
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        min_child_samples=trial.suggest_int('min_child_samples', 1, 100),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        verbosity=-1,  # fixed, not tuned
    )
    classifier = LGBMClassifier(**settings)
    fold_scores = cross_val_score(classifier, X, y, cv=5, scoring='roc_auc')
    return np.mean(fold_scores)

def objective_cat(trial):
    """Optuna objective: mean 5-fold ROC AUC of a CatBoost classifier.

    Draws one hyperparameter configuration from ``trial``, builds a
    ``CatBoostClassifier`` with it (``verbose=0``) and scores it with
    ``cross_val_score`` on the notebook-level ``X`` / ``y``.

    Args:
        trial: Object providing ``suggest_int`` / ``suggest_float``
            (the trial handed over by ``study.optimize``).

    Returns:
        The mean of the per-fold ``roc_auc`` scores.
    """
    search = {}
    search['depth'] = trial.suggest_int('depth', 1, 9)
    search['learning_rate'] = trial.suggest_float('learning_rate', 1e-3, 1.0)
    search['iterations'] = trial.suggest_int('iterations', 100, 1000)
    search['l2_leaf_reg'] = trial.suggest_float('l2_leaf_reg', 1e-3, 10.0)
    search['subsample'] = trial.suggest_float('subsample', 0.5, 1.0)
    search['colsample_bylevel'] = trial.suggest_float('colsample_bylevel', 0.5, 1.0)

    estimator = CatBoostClassifier(verbose=0, **search)
    return np.mean(cross_val_score(estimator, X, y, scoring='roc_auc', cv=5))

def optimize_and_predict(objective, model_class, n_trials=50, n_studies=10,
                         model_kwargs=None):
    """Run several independent Optuna studies and average the refitted models.

    Each study maximises ``objective``; its best parameter set is kept. One
    ``model_class`` instance per kept parameter set is then fitted on the
    notebook-level ``X`` / ``y`` and its class-1 probabilities for that same
    ``X`` are averaged.

    Note:
        The returned probabilities are in-sample (the models predict the rows
        they were fitted on), so metrics computed from them are training-set
        scores, not held-out estimates.

    Args:
        objective: Callable taking a trial and returning the score to maximise.
        model_class: Classifier class accepting the tuned parameters as keyword
            arguments and exposing ``fit`` / ``predict_proba``.
        n_trials: Number of trials per study.
        n_studies: Number of independent studies, i.e. of refitted models
            (previously hard-coded to 10).
        model_kwargs: Optional fixed constructor arguments applied to every
            refitted model, e.g. ``{'verbose': 0}`` to silence training logs.
            Tuned parameters take precedence on a key clash.

    Returns:
        Tuple ``(mean_probabilities, best_params_list)``.

    Raises:
        ValueError: If ``n_studies`` is smaller than 1.
    """
    if n_studies < 1:
        # With no study there is nothing to average; np.mean([]) would yield NaN.
        raise ValueError(f"n_studies must be at least 1, got {n_studies}")
    fixed_kwargs = dict(model_kwargs or {})

    best_params_list = []
    start_time = time.time()
    for _ in range(n_studies):
        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=n_trials)
        best_params_list.append(study.best_params)

    print(f'Total time: {(time.time() - start_time)/60} minutes')

    predictions = []
    for params in best_params_list:
        model = model_class(**{**fixed_kwargs, **params})
        model.fit(X, y)
        predictions.append(model.predict_proba(X)[:, 1])  # Probability of class 1

    return np.mean(predictions, axis=0), best_params_list


In [25]:
# Tune each model family in turn. Each call returns the averaged class-1
# probabilities of the refitted models on X and the list of best parameter sets.
print('Optimizing XGBoost...')
xgb_preds, xgb_params = optimize_and_predict(objective_xgb, XGBClassifier)

print('Optimizing LightGBM...')
lgb_preds, lgb_params = optimize_and_predict(objective_lgb, LGBMClassifier)

print('Optimizing CatBoost...')
cat_preds, cat_params = optimize_and_predict(objective_cat, CatBoostClassifier)

# Average predictions from all models
final_preds = (xgb_preds + lgb_preds  + cat_preds) / 3

# Score the blended and the per-family probabilities with ROC AUC; nothing is
# thresholded into class labels here.
# NOTE(review): optimize_and_predict fits on X, y and predicts on the same X, so
# these are training-set scores, not held-out estimates.
roc_auc = roc_auc_score(y, final_preds)
print(f'ROC AUC (final): {roc_auc}')
print(f'ROC AUC (xgb): {roc_auc_score(y, xgb_preds)}')
print(f'ROC AUC (lgb): {roc_auc_score(y, lgb_preds)}')
print(f'ROC AUC (cat): {roc_auc_score(y, cat_preds)}')

Optimizing XGBoost...
Total time: 18.902604917685192 minutes
Optimizing LightGBM...
Total time: 29.833562982082366 minutes
Optimizing CatBoost...
Total time: 110.55643737713496 minutes
0:	learn: 0.4812876	total: 4.88ms	remaining: 4.29s
1:	learn: 0.3671757	total: 10.1ms	remaining: 4.44s
2:	learn: 0.3100897	total: 15ms	remaining: 4.39s
3:	learn: 0.2728318	total: 19.5ms	remaining: 4.26s
4:	learn: 0.2498130	total: 24ms	remaining: 4.19s
5:	learn: 0.2369152	total: 28.8ms	remaining: 4.2s
6:	learn: 0.2242356	total: 33.2ms	remaining: 4.14s
7:	learn: 0.2170520	total: 37.5ms	remaining: 4.09s
8:	learn: 0.2110772	total: 41.8ms	remaining: 4.05s
9:	learn: 0.2057096	total: 46.6ms	remaining: 4.05s
10:	learn: 0.2031706	total: 51.2ms	remaining: 4.04s
11:	learn: 0.2004231	total: 55.6ms	remaining: 4.02s
12:	learn: 0.1977694	total: 60.1ms	remaining: 4s
13:	learn: 0.1946426	total: 64.6ms	remaining: 4s
14:	learn: 0.1927334	total: 69.2ms	remaining: 3.99s
15:	learn: 0.1908861	total: 73.8ms	remaining: 3.98s
16:	

In [33]:
# Collect the tuned parameter sets per model family; the keys are the family
# names that AveragedEnsemble dispatches on.
config = dict(xgboost=xgb_params, lightgbm=lgb_params, catboost=cat_params)

In [34]:
import json

# Persist the tuned parameter sets as JSON next to the notebook.
with open('params.json', 'w') as f:
    f.write(json.dumps(config))

In [72]:
class AveragedEnsemble:
    """Soft-voting ensemble that averages class-1 probabilities in two stages.

    ``config`` maps a model family name (``'xgboost'``, ``'lightgbm'`` or
    ``'catboost'``) to a list of keyword-argument dicts; one classifier is
    built per dict. Probabilities are first averaged within each family and the
    per-family means are then averaged, so every family carries the same weight
    regardless of how many variants it contains.
    """

    def __init__(self, config):
        """Instantiate (but do not fit) one classifier per parameter set.

        Args:
            config: Mapping of family name to a list of constructor kwargs.

        Raises:
            ValueError: If a family name is not supported or a family has no
                parameter sets.
        """
        # Lambdas defer the lookup of each classifier class until that family
        # is actually requested.
        factories = {
            'xgboost': lambda **kw: XGBClassifier(**kw),
            'lightgbm': lambda **kw: LGBMClassifier(**kw),
            'catboost': lambda **kw: CatBoostClassifier(**kw),
        }
        self.models = []
        for model_name, param_list in config.items():
            factory = factories.get(model_name)
            if factory is None:
                # An if/elif chain without an else would leave the model
                # unbound or silently reuse the one built just before.
                raise ValueError(
                    f"Unknown model family {model_name!r}; "
                    f"expected one of {sorted(factories)}"
                )
            model_variants = [factory(**params) for params in param_list]
            if not model_variants:
                # An empty family would turn the averaged prediction into NaN.
                raise ValueError(f"No parameter sets given for {model_name!r}")
            self.models.append(model_variants)

    def fit(self, X, y):
        """Fit every underlying model on the same data.

        Returns:
            ``self``, so that construction and fitting can be chained.
        """
        for model_variants in self.models:
            for model in model_variants:
                model.fit(X, y)
        return self

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where the averaged probability exceeds ``threshold``."""
        final_preds = self.predict_proba(X)
        return np.where(final_preds > threshold, 1, 0)

    def predict_proba(self, X):
        """Return the averaged probability of class 1 for every row of ``X``.

        Unlike scikit-learn classifiers this returns a 1-D array holding only
        the class-1 probability, not an ``(n_samples, 2)`` matrix.

        Raises:
            ValueError: If the ensemble holds no models.
        """
        if not self.models:
            raise ValueError("AveragedEnsemble holds no models; config was empty")
        family_means = [
            np.mean([model.predict_proba(X)[:, 1] for model in model_variants], axis=0)
            for model_variants in self.models
        ]
        return np.mean(family_means, axis=0)

In [73]:
# Build one model per tuned parameter set and fit all of them on the full training data.
esm = AveragedEnsemble(config)
esm.fit(X, y)

0:	learn: 0.4812876	total: 3.82ms	remaining: 3.36s
1:	learn: 0.3671757	total: 7.42ms	remaining: 3.26s
2:	learn: 0.3100897	total: 10.6ms	remaining: 3.1s
3:	learn: 0.2728318	total: 14.1ms	remaining: 3.08s
4:	learn: 0.2498130	total: 17.4ms	remaining: 3.05s
5:	learn: 0.2369152	total: 21.1ms	remaining: 3.07s
6:	learn: 0.2242356	total: 24.2ms	remaining: 3.02s
7:	learn: 0.2170520	total: 27.6ms	remaining: 3.01s
8:	learn: 0.2110772	total: 30.6ms	remaining: 2.96s
9:	learn: 0.2057096	total: 34.3ms	remaining: 2.99s
10:	learn: 0.2031706	total: 38.1ms	remaining: 3.01s
11:	learn: 0.2004231	total: 41.6ms	remaining: 3.01s
12:	learn: 0.1977694	total: 44.8ms	remaining: 2.98s
13:	learn: 0.1946426	total: 48.4ms	remaining: 2.99s
14:	learn: 0.1927334	total: 51.8ms	remaining: 2.99s
15:	learn: 0.1908861	total: 55.3ms	remaining: 2.98s
16:	learn: 0.1887319	total: 58.7ms	remaining: 2.98s
17:	learn: 0.1867601	total: 62.3ms	remaining: 2.98s
18:	learn: 0.1852971	total: 65.3ms	remaining: 2.96s
19:	learn: 0.1846224	to

In [74]:
# ROC AUC of the ensemble on the same X, y it was fitted on (esm.fit(X, y)),
# i.e. a training-set score rather than a held-out estimate.
roc_auc_score(y, esm.predict_proba(X))

0.9827382657887328

In [48]:
# Read the test split, one-hot encode it the same way as the training features,
# drop the `id` column and preview the result.
test_df = pd.read_csv("data/test.csv")
test_df = pd.get_dummies(test_df).drop(columns='id')
test_df.head()


Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,...,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_N,cb_person_default_on_file_Y
0,23,69000,3.0,25000,15.76,0.36,2,False,False,False,...,False,False,False,False,False,False,True,False,True,False
1,26,96000,6.0,10000,12.68,0.1,4,True,False,False,...,False,False,False,True,False,False,False,False,False,True
2,26,30000,5.0,4000,17.19,0.13,2,False,False,False,...,True,False,False,False,False,True,False,False,False,True
3,33,50000,4.0,7000,8.9,0.14,7,False,False,False,...,False,True,False,False,False,False,False,False,True,False
4,26,102000,8.0,15000,16.32,0.15,4,True,False,False,...,False,False,False,False,True,False,False,False,False,True


In [50]:
# Show the test and training column indexes one after the other for comparison.
print(test_df.columns, X.columns, sep='\n')

Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length',
       'person_home_ownership_MORTGAGE', 'person_home_ownership_OTHER',
       'person_home_ownership_OWN', 'person_home_ownership_RENT',
       'loan_intent_DEBTCONSOLIDATION', 'loan_intent_EDUCATION',
       'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL',
       'loan_intent_PERSONAL', 'loan_intent_VENTURE', 'loan_grade_A',
       'loan_grade_B', 'loan_grade_C', 'loan_grade_D', 'loan_grade_E',
       'loan_grade_F', 'loan_grade_G', 'cb_person_default_on_file_N',
       'cb_person_default_on_file_Y'],
      dtype='object')
Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length',
       'person_home_ownership_MORTGAGE', 'person_home_ownership_OTHER',
       'person_home_ownership_OWN', 'person_home_ownership_RENT',
       'loan_intent_DEBTCONS

In [81]:
def submit(model, test_path="data/test.csv", output_path="data/submission.csv",
           feature_columns=None):
    """Predict on the test split and write a submission CSV.

    The test file is read, ``id`` is removed from the inputs, the remaining
    columns are one-hot encoded and passed to ``model.predict``. The result is
    written as an ``id`` / ``loan_status`` table.

    Args:
        model: Fitted object exposing ``predict``.
        test_path: CSV file with the test rows; must contain an ``id`` column.
        output_path: Destination of the submission CSV.
        feature_columns: Optional column labels the model was trained on. When
            given, the encoded test frame is aligned to them: missing dummy
            columns are added as ``False``, extra columns are dropped and the
            order is matched. Independent one-hot encoding of train and test
            does not guarantee identical columns otherwise.

    Returns:
        The submission ``DataFrame`` that was written.
    """
    test_df = pd.read_csv(test_path)
    features = pd.get_dummies(test_df.drop(columns='id'))
    if feature_columns is not None:
        features = features.reindex(columns=list(feature_columns), fill_value=False)

    # NOTE(review): this writes hard labels from predict(); the notebook scores
    # models with ROC AUC, so probabilities may be wanted instead — confirm.
    y_pred = model.predict(features)
    submission = pd.DataFrame({
        'id': test_df['id'],
        'loan_status': y_pred
    })
    submission.to_csv(output_path, index=False)
    # Report the path actually written (the old message omitted the directory).
    print(f"Predictions saved to {output_path}")
    return submission

In [82]:
# Write the submission file from the fitted ensemble's predictions and display the frame.
submit(esm)

Predictions saved to submission.csv


Unnamed: 0,id,loan_status
0,58645,1
1,58646,0
2,58647,1
3,58648,0
4,58649,0
...,...,...
39093,97738,0
39094,97739,0
39095,97740,0
39096,97741,0
