In [9]:
import pandas as pd
import numpy as np 
import xgboost as xgb
import optuna
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, confusion_matrix

In [2]:
# Load the pre-cleaned dataset; the CSV's first column becomes the row index.
df = pd.read_csv(
    "cleaned_data.csv",
    index_col=0,
)

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,ReportAsOfEOD,LoanId,ListedOnUTC,UserName,LoanApplicationStartedDate,LoanDate,ContractEndDate,FirstPaymentDate,MaturityDate_Original,MaturityDate_Last,...,NoOfPreviousLoansBeforeLoan,AmountOfPreviousLoansBeforeLoan,PreviousRepaymentsBeforeLoan,PreviousEarlyRepaymentsBefoleLoan,PreviousEarlyRepaymentsCountBeforeLoan,NextPaymentNr,NrOfScheduledPayments,PrincipalDebtServicingCost,InterestAndPenaltyDebtServicingCost,Status
0,442936,493740,65066,617325,64754,65019,342892,67743,397352,342892,...,-0.228369,-0.538027,-0.089477,-0.123961,-0.183984,-0.331731,0.387925,-0.079749,-0.061823,0.0
1,442936,649499,52716,627398,52683,52704,685972,55242,345576,345576,...,-0.228369,-0.272026,-0.179738,2.97886,2.751157,-0.331731,0.387925,-0.079749,-0.080312,0.0
2,442936,516505,61551,610980,61261,61739,267810,63344,376157,376157,...,-0.625141,-0.640336,-0.455593,-0.123961,-0.183984,-0.331731,0.387925,5.166327,6.331205,1.0
3,442936,516446,49131,608887,48942,49270,685972,50821,331086,331086,...,-0.625141,-0.640336,-0.455593,1.427449,2.751157,-0.331731,0.387925,-0.079749,0.587551,0.0
4,442936,453833,60460,631617,60275,60636,685972,62257,298077,298077,...,-0.625141,-0.640336,-0.455593,-0.123961,-0.183984,-0.331731,0.387925,-0.079749,-0.267506,0.0


In [4]:
# Drop every row that contains at least one missing value. The result is
# rebound to `df` instead of using inplace=True, which pandas discourages:
# it brings no performance benefit and prevents method chaining.
df = df.dropna()

In [5]:
# (rows, columns) of the frame after the rows with missing values were dropped.
df.shape

(121461, 102)

In [6]:
# Hold out 20% of the rows for the final evaluation. Stratifying on the label
# keeps the Status class ratio the same in both partitions; the fixed seed
# makes the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["Status"],
    random_state=42,
)

In [7]:
# Report the size of each partition, then the label balance of the training set.
for split in (train_df, test_df):
    print(split.shape)
print(train_df["Status"].value_counts())

(97168, 102)
(24293, 102)
0.0    54859
1.0    42309
Name: Status, dtype: int64


In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold F1 of an XGBoost classifier on ``train_df``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators``, ``max_depth``, ``gamma`` and
        ``learning_rate``.

    Returns
    -------
    float
        Mean of the per-fold F1 scores (the study maximises this value).

    Notes
    -----
    Reads the notebook-level ``train_df``; ``test_df`` is not used here, so the
    held-out partition stays untouched during the search.
    """
    # Sampled in the same order as the search space was originally declared.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'gamma': trial.suggest_float('gamma', 0.01, 2),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
    }

    # The feature/target split does not depend on the fold, so build it once
    # instead of re-slicing the full frame twice per fold.
    features = train_df.drop(columns='Status')
    target = train_df['Status']

    fold_scores = []
    for fit_idx, val_idx in KFold(n_splits=5).split(features):
        # Seed pinned so any stochastic booster option stays repeatable.
        clf = xgb.XGBClassifier(random_state=42, **params)
        clf.fit(features.iloc[fit_idx], target.iloc[fit_idx])
        val_preds = clf.predict(features.iloc[val_idx])
        fold_scores.append(f1_score(target.iloc[val_idx], val_preds))
    return np.mean(fold_scores)


# Search for the hyperparameters that maximise the cross-validated F1 score.
# TPESampler is already Optuna's default sampler; passing it explicitly with a
# seed makes repeated runs propose the same sequence of trials.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)

In [10]:
# Final fit on the whole training partition, evaluated on the held-out test set.
# Hyperparameters are pinned here so this cell runs without repeating the Optuna
# search; to use tuned values instead, pass **study.best_params.
#
# NOTE(review): the near-perfect scores recorded for this cell suggest possible
# target leakage. The feature set still contains identifier-like columns
# (e.g. LoanId, UserName) and columns such as PrincipalDebtServicingCost that
# may only be known after the outcome. Confirm before trusting these metrics.
X_train = train_df.drop(columns='Status')
Y_train = train_df['Status'].to_numpy()  # Series.ravel() is deprecated in recent pandas
# Select the test features from test_df's own columns rather than reusing a
# boolean mask built from train_df.
X_test = test_df.drop(columns='Status')
Y_test = test_df['Status'].to_numpy()

clf = xgb.XGBClassifier(n_estimators=225,
                        max_depth=4,
                        gamma=1,
                        learning_rate=0.22,
                        random_state=42)
clf.fit(X_train, Y_train)
Y_preds = clf.predict(X_test)

display(confusion_matrix(Y_test, Y_preds))
# Printed in this order: F1, precision, recall, accuracy.
for metric in (f1_score, precision_score, recall_score, accuracy_score):
    print(metric(Y_test, Y_preds))




array([[13713,     2],
       [    1, 10577]], dtype=int64)

0.9998582029588317
0.999810946214198
0.9999054641709207
0.9998765076359445
