In [69]:
import pandas as pd
import numpy as np
import sys
import os
sys.path.append(os.path.abspath("../"))

from src.preprocess import preprocess
from xgboost import plot_importance
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import cross_val_score, StratifiedKFold
import optuna

from sklearn.metrics import roc_auc_score, roc_curve
import xgboost as xgb

# Approach

1. Establish baseline model with default hyperparameters
2. Select which hp to tune focusing on those that prevent overfitting since we have thin data
3. run tuning using optuna
4. evaluate performance using key metrics: AUC, Gini index


## See summary in model validation notebook

In [19]:
#load data into dataframe
filepath = '../data/raw_data.csv'
raw_df = pd.read_csv(filepath)

In [21]:
final_df = preprocess(raw_df)

In [23]:
# from feature selection notebook
selected_features=['OverdraftTotal',
 'BalanceAverage',
 'TotalCash',
 'CurrentBalance',
 'CurrentBalance_missing',
 'NumberOfMatches',
 'AverageMonthlyDiscretionarySpend',
 'LastRepaymentAmount',
 'ErrorRate',
 'NegativeBalanceCount',
 'SavingsAccountCount',
 'AveragePotentialMonthlyIncome',
 'CheckingAccountCount',
 'AverageNumberOfTransactionsADay',
 'TotalHistoryInDays',
 'Paycheck',
 'HasEmpowerBanking',
 'IsNameBased',
 'BalanceAbove100L30Count',
 'AverageMonthlySpend',
 'AverageMonthlyIncome',
 'AverageNumberOfTransactionsADayPrimaryChecking',
 'PaycheckModel_Tagging',
 'od_per_30d',
 'CreditAccounts',
 'BalanceMin',
 'OverdraftCount',
 'TotalAssets',
 'PaycheckModel_DeepSearch',
 'dep_wd_ratio',
 'bal_vol_index',
 'PaycheckModel_BruteForce',
 'OutstandingCreditDebtWherePayingInterest',
 'LatefeesTotalCount',
 'LatefeesCount',
 'DefaultedAdvances']

In [25]:
final_df = final_df[selected_features]

X_df = final_df.drop('DefaultedAdvances', axis=1)
y = final_df['DefaultedAdvances']

In [27]:
X_train, X_test, y_train, y_test = train_test_split(
    X_df, y, test_size=0.20, random_state=75454)

In [36]:
model = xgb.XGBClassifier(n_estimators=100)
model.fit(X_train, y_train)

In [62]:
y_pred_proba = model.predict_proba(X_test)

auc = roc_auc_score(y_test, y_pred_proba[:,1])

gini = 2 * auc - 1

print(f'Baseline model AUC: {auc:.4f}')
print(f'Baseline model Gini: {gini:.4f}')
    

Baseline model AUC: 0.6858
Baseline model Gini: 0.3715


## Hyperparameter tuning

In [85]:
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 2, 6),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 5, step=0.1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 10, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1, 100, log=True),
        'subsample': trial.suggest_float('subsample', 0.3, 0.9, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.9, step=0.1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.3, 1.0, step=0.1),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'verbosity': 0,
    }


    clf = xgb.XGBClassifier(**params)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    score = cross_val_score(clf, X_train, y_train, cv=cv, scoring="roc_auc").mean()
    return score

# Run the optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30, timeout=300, show_progress_bar=True)

# Show results
print("Best trial:")
trial = study.best_trial
print(f"  AUC: {trial.value}")
print("  Best hyperparameters:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2025-05-22 08:02:52,602] A new study created in memory with name: no-name-cb775c39-416f-4d41-87db-9ac9b8993a8b


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-05-22 08:02:57,727] Trial 0 finished with value: 0.7165011066476369 and parameters: {'max_depth': 6, 'min_child_weight': 7, 'gamma': 0.0, 'reg_alpha': 0.33134671770049745, 'reg_lambda': 30.71361644959434, 'subsample': 0.7, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.5, 'learning_rate': 0.02950524236994106, 'n_estimators': 900}. Best is trial 0 with value: 0.7165011066476369.
[I 2025-05-22 08:02:58,006] Trial 1 finished with value: 0.7073555980530564 and parameters: {'max_depth': 2, 'min_child_weight': 9, 'gamma': 3.0, 'reg_alpha': 0.5127437862366949, 'reg_lambda': 5.853543617162443, 'subsample': 0.7, 'colsample_bytree': 0.4, 'colsample_bylevel': 0.5, 'learning_rate': 0.07353121028669214, 'n_estimators': 100}. Best is trial 0 with value: 0.7165011066476369.
[I 2025-05-22 08:02:59,549] Trial 2 finished with value: 0.71715677034946 and parameters: {'max_depth': 5, 'min_child_weight': 10, 'gamma': 4.6000000000000005, 'reg_alpha': 2.965991861317631, 'reg_lambda': 6.5613564866487

In [87]:
best_params = study.best_params

In [91]:
final_model = xgb.XGBClassifier(
        **best_params,        
    )
final_model.fit(X_train, y_train)

In [93]:
y_pred_proba = final_model.predict_proba(X_test)

auc = roc_auc_score(y_test, y_pred_proba[:,1])

gini = 2 * auc - 1

print(f'Tuned model AUC: {auc:.4f}')
print(f'Tuned model Gini: {gini:.4f}')


Tuned model AUC: 0.7201
Tuned model Gini: 0.4401


In [98]:
val_df = pd.DataFrame({'y_true':y_test, 'y_pred':y_pred_proba[:,1]})

In [104]:
val_df.to_csv('../data/val_predictions.csv',index=False)

In [117]:
final_model.save_model('../artifacts/final_model.json')