In [58]:
import pandas as pd
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np
from sklearn.preprocessing import StandardScaler
pd.set_option('display.max_columns', 25)

#### Load the data

In [59]:
data_orginal = pd.read_excel(r'default of credit card clients.xls')

In [60]:
data = data_orginal.copy()

#### Exploratory Data Analysis

In [61]:
data[['LIMIT_BAL','SEX','EDUCATION','MARRIAGE','AGE','default']].describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
LIMIT_BAL,30000.0,,,,167484.3227,129747.6616,10000.0,50000.0,140000.0,240000.0,1000000.0
SEX,30000.0,2.0,female,18112.0,,,,,,,
EDUCATION,30000.0,5.0,university,14030.0,,,,,,,
MARRIAGE,30000.0,3.0,single,15964.0,,,,,,,
AGE,30000.0,,,,35.4855,9.2179,21.0,28.0,34.0,41.0,79.0
default,30000.0,,,,0.2212,0.4151,0.0,0.0,0.0,0.0,1.0


In [62]:
data[['PAY_0','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6']].describe(include='all').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PAY_0,30000.0,-0.0167,1.1238,-2.0,-1.0,0.0,0.0,8.0
PAY_2,30000.0,-0.1338,1.1972,-2.0,-1.0,0.0,0.0,8.0
PAY_3,30000.0,-0.1662,1.1969,-2.0,-1.0,0.0,0.0,8.0
PAY_4,30000.0,-0.2207,1.1691,-2.0,-1.0,0.0,0.0,8.0
PAY_5,30000.0,-0.2662,1.1332,-2.0,-1.0,0.0,0.0,8.0
PAY_6,30000.0,-0.2911,1.15,-2.0,-1.0,0.0,0.0,8.0


In [63]:
data[['BILL_AMT1','BILL_AMT2','BILL_AMT3','BILL_AMT4','BILL_AMT5','BILL_AMT6']].describe(include='all').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
BILL_AMT1,30000.0,51223.3309,73635.8606,-165580.0,3558.75,22381.5,67091.0,964511.0
BILL_AMT2,30000.0,49179.0752,71173.7688,-69777.0,2984.75,21200.0,64006.25,983931.0
BILL_AMT3,30000.0,47013.1548,69349.3874,-157264.0,2666.25,20088.5,60164.75,1664089.0
BILL_AMT4,30000.0,43262.949,64332.8561,-170000.0,2326.75,19052.0,54506.0,891586.0
BILL_AMT5,30000.0,40311.401,60797.1558,-81334.0,1763.0,18104.5,50190.5,927171.0
BILL_AMT6,30000.0,38871.7604,59554.1075,-339603.0,1256.0,17071.0,49198.25,961664.0


In [64]:
data.isnull().sum()

ID           0
LIMIT_BAL    0
SEX          0
EDUCATION    0
MARRIAGE     0
AGE          0
PAY_0        0
PAY_2        0
PAY_3        0
PAY_4        0
PAY_5        0
PAY_6        0
BILL_AMT1    0
BILL_AMT2    0
BILL_AMT3    0
BILL_AMT4    0
BILL_AMT5    0
BILL_AMT6    0
PAY_AMT1     0
PAY_AMT2     0
PAY_AMT3     0
PAY_AMT4     0
PAY_AMT5     0
PAY_AMT6     0
default      0
dtype: int64

#### There are no missing values in the dataset

#### Data Preprocessing

In [65]:
data.rename(columns = {'PAY_0':'PAY_1'}, inplace = True)
data.drop('ID', axis=1, inplace=True)

In [66]:
data.drop(data[((data["PAY_AMT1"] == 0) & (data["BILL_AMT1"] == 0)) |
               ((data["PAY_AMT2"] == 0) & (data["BILL_AMT2"] == 0)) |
               ((data["PAY_AMT3"] == 0) & (data["BILL_AMT3"] == 0)) |
               ((data["PAY_AMT4"] == 0) & (data["BILL_AMT4"] == 0)) |
               ((data["PAY_AMT5"] == 0) & (data["BILL_AMT5"] == 0)) |
               ((data["PAY_AMT6"] == 0) & (data["BILL_AMT6"] == 0))].index, inplace  = True)

In [67]:
data.AGE.min(), data.AGE.max()

(21, 79)

In [68]:
data['AGE']=pd.cut(data['AGE'],bins=range(20,85,5),labels=range(len(range(20,85,5))-1))
data['AGE'] = data['AGE'].astype('int')

#### we will create one single category of all low categories having less count

In [69]:
fill = (data.PAY_1 == 4) | (data.PAY_1==5) | (data.PAY_1==6) | (data.PAY_1==8) | (data.PAY_1==7)
data.loc[fill,'PAY_0']=4
fill = (data.PAY_2 == 4) | (data.PAY_2 == 1) | (data.PAY_2 == 5) | (data.PAY_2 == 7) | (data.PAY_2 == 6) | (data.PAY_2 == 8)
data.loc[fill,'PAY_2']=4
#df.PAY_2.value_counts()
fill = (data.PAY_3 == 4) | (data.PAY_3 == 1) | (data.PAY_3 == 5) | (data.PAY_3 == 7) | (data.PAY_3 == 6) | (data.PAY_3 == 8)
data.loc[fill,'PAY_3']=4
#df.PAY_3.value_counts()
fill = (data.PAY_4 == 4) | (data.PAY_4 == 1) | (data.PAY_4 == 5) | (data.PAY_4 == 7) | (data.PAY_4 == 6) | (data.PAY_4 == 8)
data.loc[fill,'PAY_4']=4
#df.PAY_4.value_counts()
fill = (data.PAY_5 == 4) | (data.PAY_5 == 7) | (data.PAY_5 == 5) | (data.PAY_5 == 6) | (data.PAY_5 == 8)
data.loc[fill,'PAY_5']=4
#df.PAY_5.value_counts()
fill = (data.PAY_6 == 4) | (data.PAY_6 == 7) | (data.PAY_6 == 5) | (data.PAY_6 == 6) | (data.PAY_6 == 8)
data.loc[fill,'PAY_6']=4
#df.PAY_6.value_counts()

In [70]:
data.columns = data.columns.map(str.lower)

In [71]:
new_data = pd.concat([data,pd.get_dummies(data[['education','sex','marriage']],drop_first=True).astype('int')], axis=1)
new_data.drop(['education','sex','marriage'], axis=1, inplace=True)
data_for_logic = data

In [72]:
col_to_norm = ['limit_bal', 'age', 'bill_amt1', 'bill_amt2', 'bill_amt3', 'bill_amt4', 'bill_amt5', 'bill_amt6', 'pay_amt1', 'pay_amt2', 'pay_amt3', 'pay_amt4', 'pay_amt5', 'pay_amt6']
sc = StandardScaler()

#### Split the data into train and test

In [73]:
inputs = new_data.drop('default', axis=1)
output = new_data['default']

X_train, X_test, y_train, y_test = train_test_split(inputs, output, test_size=0.3, random_state=42)

In [74]:
data.select_dtypes(include='object').columns

Index(['sex', 'education', 'marriage'], dtype='object')

In [75]:
inputs_cat = data.drop('default', axis=1)
outputs_cat = data['default']

columns_to_fill = ['sex', 'education', 'marriage']

inputs_cat[columns_to_fill] = inputs_cat[columns_to_fill].fillna('Missing Value')

X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(inputs_cat, outputs_cat, test_size=0.3, random_state=42)

In [76]:
def outlier_treatment(df):
    # Copy the input data to avoid modifying the original DataFrame

    # Calculate the first quartile (Q1) and third quartile (Q3) for each column
    q1 = df.quantile(0.25,numeric_only=True)
    q3 = df.quantile(0.75,numeric_only=True)

    # Calculate the interquartile range (IQR)
    iqr = q3 - q1

    # Define the lower and upper bounds for outlier detection
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Replace outliers with the lower or upper bound
    for column in df.columns:
        if np.issubdtype(df[column].dtype, np.number):  # Check if the column contains numerical data
            df[column] = np.where(df[column] < lower_bound[column], lower_bound[column], df[column])
            df[column] = np.where(df[column] > upper_bound[column], upper_bound[column], df[column])

    return df

In [77]:
def filling_missing_value(data):
    for i in data.columns:
        if data[i].dtypes == object:
            data[i].fillna(data[i].mode()[0], inplace=True)
        else:
            data[i].fillna(data[i].mean(), inplace=True)
    return data

#### Check the distribution of the data

In [78]:
data_for_logic[col_to_norm] = outlier_treatment(data_for_logic[col_to_norm])
data_for_logic = filling_missing_value(data_for_logic)

from scipy import stats


for i in data_for_logic.columns:

    if data_for_logic[i].dtype in ['int64', 'float64']:

        kstest_statistic, kstest_p_value = stats.kstest(data_for_logic[i], 'norm')

        # 'norm' inside indicates that the test is comparing the distribution of the data against a normal distribution

        print(f'Column: {i}')
        print(f'Kolmogorov-Smirnov Test:')
        print(f'Test Statistic: {kstest_statistic}')
        print(f'p-value: {kstest_p_value}')

        if kstest_p_value > 0.05:
            print('Data looks normally distributed')
            print()
        else:
            print('Data does not look normally distributed')
            print()

Column: limit_bal
Kolmogorov-Smirnov Test:
Test Statistic: 1.0
p-value: 0.0
Data does not look normally distributed

Column: age
Kolmogorov-Smirnov Test:
Test Statistic: 0.7098082518722022
p-value: 0.0
Data does not look normally distributed

Column: pay_1
Kolmogorov-Smirnov Test:
Test Statistic: 0.2902419830808578
p-value: 0.0
Data does not look normally distributed

Column: pay_2
Kolmogorov-Smirnov Test:
Test Statistic: 0.3376155813495967
p-value: 0.0
Data does not look normally distributed

Column: pay_3
Kolmogorov-Smirnov Test:
Test Statistic: 0.3445799724572103
p-value: 0.0
Data does not look normally distributed

Column: pay_4
Kolmogorov-Smirnov Test:
Test Statistic: 0.3673617942160141
p-value: 0.0
Data does not look normally distributed

Column: pay_5
Kolmogorov-Smirnov Test:
Test Statistic: 0.3860121975211489
p-value: 0.0
Data does not look normally distributed

Column: pay_6
Kolmogorov-Smirnov Test:
Test Statistic: 0.38105449537674596
p-value: 0.0
Data does not look normally d

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = np.where(df[column] < lower_bound[column], lower_bound[column], df[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = np.where(df[column] > upper_bound[column], upper_bound[column], df[column])


In [79]:
def target_correlation(data_for_logic, target, threshold=0.1):
    corr_matrix = data_for_logic.corr(method='spearman', numeric_only=True)

    target_correlations = corr_matrix[target].abs() > threshold

    variables_explaining_target = target_correlations[target_correlations].index.tolist()

    variables_explaining_target.remove(target)

    result_explaining_target = pd.DataFrame(
        {'Variable': variables_explaining_target,
         'Correlation with Target': corr_matrix.loc[variables_explaining_target, target]})

    result_explaining_target.reset_index(drop=True, inplace=True)

    return result_explaining_target


In [80]:
target_result = target_correlation(data_for_logic, 'default')

target_result

Unnamed: 0,Variable,Correlation with Target
0,limit_bal,-0.1967
1,pay_1,0.3232
2,pay_2,0.2747
3,pay_3,0.253
4,pay_4,0.2342
5,pay_5,0.225
6,pay_6,0.2088
7,pay_amt1,-0.1722
8,pay_amt2,-0.1677
9,pay_amt3,-0.1523


In [81]:
def intercorrelation(data, target, threshold=0.8):
    corr_matrix = data.drop(columns=[target]).corr(method='spearman', numeric_only=True)

    highly_correlated_variables = (np.abs(corr_matrix) > threshold) & (corr_matrix != 1.0)

    print(highly_correlated_variables)

    independent_variable_pairs = np.where(highly_correlated_variables)

    print(independent_variable_pairs)

    result_intercorrelated_independent = pd.DataFrame({
        'Variable 1': corr_matrix.index[independent_variable_pairs[0]],
        'Variable 2': corr_matrix.columns[independent_variable_pairs[1]]
    })
    return result_intercorrelated_independent


In [82]:
intercorrelated_result = intercorrelation(data_for_logic, 'default')

intercorrelated_result

           limit_bal    age  pay_1  pay_2  pay_3  pay_4  pay_5  pay_6  \
limit_bal      False  False  False  False  False  False  False  False   
age            False  False  False  False  False  False  False  False   
pay_1          False  False  False  False  False  False  False  False   
pay_2          False  False  False  False  False  False  False  False   
pay_3          False  False  False  False  False  False  False  False   
pay_4          False  False  False  False  False  False  False  False   
pay_5          False  False  False  False  False  False  False  False   
pay_6          False  False  False  False  False  False  False  False   
bill_amt1      False  False  False  False  False  False  False  False   
bill_amt2      False  False  False  False  False  False  False  False   
bill_amt3      False  False  False  False  False  False  False  False   
bill_amt4      False  False  False  False  False  False  False  False   
bill_amt5      False  False  False  False  False  F

Unnamed: 0,Variable 1,Variable 2
0,bill_amt1,bill_amt2
1,bill_amt1,bill_amt3
2,bill_amt1,bill_amt4
3,bill_amt1,bill_amt5
4,bill_amt2,bill_amt1
5,bill_amt2,bill_amt3
6,bill_amt2,bill_amt4
7,bill_amt2,bill_amt5
8,bill_amt3,bill_amt1
9,bill_amt3,bill_amt2


In [83]:
target_result.Variable.tolist()

['limit_bal',
 'pay_1',
 'pay_2',
 'pay_3',
 'pay_4',
 'pay_5',
 'pay_6',
 'pay_amt1',
 'pay_amt2',
 'pay_amt3',
 'pay_amt4',
 'pay_amt5',
 'pay_amt6']

#### Multicollinearity between independent features

In [84]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(data, variables):
    vif_data = data[variables]
    vif_values = [variance_inflation_factor(vif_data.values, i) for i in range(vif_data.shape[1])]
    vif_result = pd.DataFrame({'VIF': vif_values, 'Features': variables})
    return vif_result


vif_result = calculate_vif(data_for_logic, ['limit_bal',
                                            'pay_1',
                                            'pay_2',
                                            'pay_3',
                                            'pay_4',
                                            'pay_5',
                                            'pay_6',
                                            'pay_amt1',
                                            'pay_amt2',
                                            'pay_amt3',
                                            'pay_amt4',
                                            'pay_amt5',
                                            'pay_amt6'])

vif_result


Unnamed: 0,VIF,Features
0,3.4142,limit_bal
1,2.4146,pay_1
2,3.394,pay_2
3,3.7969,pay_3
4,4.1899,pay_4
5,4.3703,pay_5
6,3.0262,pay_6
7,4.1143,pay_amt1
8,4.2513,pay_amt2
9,3.9899,pay_amt3


#### Split the data into train and test for Logistic Regression

In [85]:
data_for_logic = pd.concat([data_for_logic[vif_result.Features.tolist()],data_for_logic.default,pd.get_dummies(data[columns_to_fill],drop_first=True).astype(int)],axis=1)
X_train_logic, X_test_logic, y_train_logic, y_test_logic = train_test_split(data_for_logic.drop('default', axis=1), data_for_logic['default'], test_size=0.3, random_state=42)

#### Model Building

In [86]:
models = []

xgb_model_def = XGBClassifier()
lgb_model_def = LGBMClassifier()
catboost_model_def = CatBoostClassifier()
catboost_model_custom = CatBoostClassifier(cat_features=columns_to_fill)
random_forest_def = RandomForestClassifier()
logistic_regression_def = LogisticRegression()
svc_def = SVC(probability=True)
models.extend([
    ('XGBoost', xgb_model_def),
    ('LightGBM', lgb_model_def),
    ('CatBoost', catboost_model_def),
    ('CatBoost_Custom', catboost_model_custom),
    ('Random Forest', random_forest_def),
    ('Logistic Regression', logistic_regression_def),
    ('SVC', svc_def)
])

In [87]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report

def train_and_evaluate_model(model_name, model, X_train, y_train, X_test, y_test):

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    roc_prob = roc_auc_score(y_test, y_prob)
    gini_prob = roc_prob * 2 - 1
    confusion_matrix_result = confusion_matrix(y_test, y_pred)
    classification_report_result = classification_report(y_test, y_pred)

    print(f'Model Performance for {model_name}')
    print('Gini prob is', gini_prob * 100)
    print(classification_report_result)
    print(confusion_matrix_result)

    return gini_prob

#### Train and Evaluate the models

In [88]:
gini_df = pd.DataFrame(columns=['Model', 'Gini_prob'])
for model_name, model in models:
    if model_name == 'CatBoost_Custom':
        gini_prob = train_and_evaluate_model(model_name, model, X_train_cat, y_train_cat, X_test_cat, y_test_cat)
    elif model_name == 'Random Forest':
        X_train_filled = X_train.copy()
        X_test_filled = X_test.copy()
        X_train_filled = filling_missing_value(X_train_filled)
        X_test_filled = filling_missing_value(X_test_filled)
            
        gini_prob = train_and_evaluate_model(model_name, model, X_train_filled, y_train, X_test_filled, y_test)
    elif model_name == 'SVC':
        X_train_filled_svc = X_train.copy()
        X_test_filled_svc = X_test.copy()
        X_train_filled_svc = filling_missing_value(X_train_filled_svc)
        X_test_filled_svc = filling_missing_value(X_test_filled_svc)
        X_train_filled_svc = outlier_treatment(X_train_filled_svc)
        X_test_filled_svc = outlier_treatment(X_test_filled_svc)
        X_train_filled_svc[col_to_norm] = sc.fit_transform(X_train_filled_svc[col_to_norm])
        X_test_filled_svc[col_to_norm] = sc.fit_transform(X_test_filled_svc[col_to_norm])
        gini_prob = train_and_evaluate_model(model_name, model, X_train_filled_svc, y_train, X_test_filled_svc, y_test)
    elif model_name == 'Logistic Regression':
        X_train_filled_logistic = X_train_logic.copy()
        X_test_filled_logistic = X_test_logic.copy()
        X_train_filled_logistic = sc.fit_transform(X_train_filled_logistic)
        X_test_filled_logistic = sc.fit_transform(X_test_filled_logistic)
        gini_prob = train_and_evaluate_model(model_name, model, X_train_filled_logistic, y_train_logic, X_test_filled_logistic, y_test_logic)
    else:
        gini_prob = train_and_evaluate_model(model_name, model, X_train, y_train, X_test, y_test)

    if gini_prob is not None:
        gini_df = pd.concat([gini_df, pd.DataFrame({'Model': [model_name], 'Gini_prob': [gini_prob]})], ignore_index=True)

gini_df_sorted = gini_df.sort_values(by='Gini_prob', ascending=False)

gini_df_sorted


Model Performance for XGBoost
Gini prob is 56.24143315808592
              precision    recall  f1-score   support

           0       0.85      0.94      0.89      5987
           1       0.63      0.40      0.49      1638

    accuracy                           0.82      7625
   macro avg       0.74      0.67      0.69      7625
weighted avg       0.80      0.82      0.80      7625

[[5605  382]
 [ 987  651]]
[LightGBM] [Info] Number of positive: 3934, number of negative: 13856
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001858 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3200
[LightGBM] [Info] Number of data points in the train set: 17790, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221135 -> initscore=-1.259062
[LightGBM] [Info] Start training from score -1.259062
Model Performance for LightG

Unnamed: 0,Model,Gini_prob
3,CatBoost_Custom,0.5934
1,LightGBM,0.5928
2,CatBoost,0.5897
0,XGBoost,0.5624
4,Random Forest,0.5587
5,Logistic Regression,0.4919
6,SVC,0.4661


#### Hyperparameter Tuning

In [32]:
from sklearn.model_selection import cross_val_score
import optuna

def best_params_for_model(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 10, 100)
    }

    lgb_clf = LGBMClassifier(**param)

    auc = cross_val_score(lgb_clf, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1).mean()
    return auc

study = optuna.create_study(direction='maximize')
study.optimize(best_params_for_model, n_trials=10)

print('Best trial:')
best_params = study.best_params
print('  Value: {:.3f}'.format(study.best_value))
print('  Params: ', best_params)

best_lgb_model = LGBMClassifier(**best_params)

[I 2024-05-19 23:15:56,971] A new study created in memory with name: no-name-4254900a-7abc-4a6c-ab6b-fd89a07de055
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
[I 2024-05-19 23:16:06,489] Trial 0 finished with value: 0.784342340076879 and parameters: {'n_estimators': 22, 'learning_rate': 0.1453451805676855, 'max_depth': 7, 'num_leaves': 82}. Best is trial 0 with value: 0.784342340076879.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
[I 2024-05-19 23:16:17,142] Trial 1 finished with value: 0.7798793207841156 and parameters: {'n_estimators': 673, 'learning_rate': 0.032615639839094296, 'max_depth': 6, 'num_leaves': 25}. Best is trial 0 with value: 0.784342340076879.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
[I 2024-05-19 23:16:25,025] Trial 2 finished with value: 0.7913944264228915 and parameters: {'n_estimators': 189, 'learning_rate': 0.022499156742145074, 'max_depth': 8, 'num_leaves': 18}. Best is trial

Best trial:
  Value: 0.791
  Params:  {'n_estimators': 189, 'learning_rate': 0.022499156742145074, 'max_depth': 8, 'num_leaves': 18}


In [33]:
def best_params_for_model(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_uniform('subsample', 0.5,1), #  fraction of the training data used to train each tree.
        # Using a fraction of the data can help to prevent overfitting and improve generalization.
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5,1),
        'gamma': trial.suggest_int('gamma', 0,5)
        # Increasing the gamma value can help to prevent overfitting
        # Should be used only when you are using high depth
    }
    xgb_clf = XGBClassifier(**param)

    auc = cross_val_score(xgb_clf, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1).mean()
    return auc



study = optuna.create_study(direction='maximize')
study.optimize(best_params_for_model, n_trials=10)

print('Best trial:')
best_params = study.best_params
print('  Value: {:.3f}'.format(study.best_value))
print('  Params: ', best_params)

best_xgb_model = XGBClassifier(**best_params)

[I 2024-05-19 23:17:05,275] A new study created in memory with name: no-name-d60aca35-8633-4428-a6a5-2464c09aaa08
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
  'subsample': trial.suggest_uniform('subsample', 0.5,1), #  fraction of the training data used to train each tree.
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5,1),
[I 2024-05-19 23:17:06,617] Trial 0 finished with value: 0.7908301298617391 and parameters: {'n_estimators': 96, 'learning_rate': 0.021247517185202472, 'max_depth': 7, 'subsample': 0.7154570657307004, 'colsample_bytree': 0.868192870576384, 'gamma': 1}. Best is trial 0 with value: 0.7908301298617391.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
  'subsample': trial.suggest_uniform('subsample', 0.5,1), #  fraction of the training data used to train each tree.
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5,1),
[I 2024-05-19 23:17:07,998] Trial 1 finished with value: 0.7810

Best trial:
  Value: 0.793
  Params:  {'n_estimators': 400, 'learning_rate': 0.01678538001667352, 'max_depth': 5, 'subsample': 0.5680799199001354, 'colsample_bytree': 0.571860438025061, 'gamma': 3}


In [34]:
def best_params_for_model(trial):

    param = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10), # Regularization term that prevents overfitting by penalizing large parameter values.
        # 'cat_features': [],  # Handle categorical features separately
        'loss_function': trial.suggest_categorical('loss_function', ['Logloss']) # For regression tasks, use ‘RMSE,’ while for classification, use ‘Logloss’.
    }

    cb_clf = CatBoostClassifier(**param)

    auc = cross_val_score(cb_clf, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1).mean()
    return auc



study = optuna.create_study(direction='maximize')
study.optimize(best_params_for_model, n_trials=1)

print('Best trial:')
best_params = study.best_params
print('  Value: {:.3f}'.format(study.best_value))
print('  Params: ', best_params)

best_cb_model = CatBoostClassifier(**best_params)

[I 2024-05-19 23:17:22,373] A new study created in memory with name: no-name-d8db2a53-a9ba-48eb-812c-4b0361ac0d4a
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10), # Regularization term that prevents overfitting by penalizing large parameter values.
[I 2024-05-19 23:17:27,798] Trial 0 finished with value: 0.789010805493854 and parameters: {'iterations': 217, 'learning_rate': 0.12594219326590383, 'depth': 4, 'l2_leaf_reg': 1.9277705964700702, 'loss_function': 'Logloss'}. Best is trial 0 with value: 0.789010805493854.


Best trial:
  Value: 0.789
  Params:  {'iterations': 217, 'learning_rate': 0.12594219326590383, 'depth': 4, 'l2_leaf_reg': 1.9277705964700702, 'loss_function': 'Logloss'}


In [35]:
def best_params_for_model(trial):

    param = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10), # Regularization term that prevents overfitting by penalizing large parameter values.
        'cat_features': columns_to_fill,  # Handle categorical features separately
        'loss_function': trial.suggest_categorical('loss_function', ['Logloss']) # For regression tasks, use ‘RMSE,’ while for classification, use ‘Logloss’.
    }

    cb_clf = CatBoostClassifier(**param)

    auc = cross_val_score(cb_clf, X_train_cat, y_train_cat, cv=5, scoring='roc_auc', n_jobs=-1).mean()
    return auc



study = optuna.create_study(direction='maximize')
study.optimize(best_params_for_model, n_trials=1)

print('Best trial:')
best_params = study.best_params
print('  Value: {:.3f}'.format(study.best_value))
print('  Params: ', best_params)

best_cb_custom_model = CatBoostClassifier(**best_params,cat_features=columns_to_fill)

[I 2024-05-19 23:17:27,807] A new study created in memory with name: no-name-b3132727-8c4b-4005-871b-ac3215100a4d
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10), # Regularization term that prevents overfitting by penalizing large parameter values.
[I 2024-05-19 23:18:36,006] Trial 0 finished with value: 0.7935299083598109 and parameters: {'iterations': 648, 'learning_rate': 0.01450600477517221, 'depth': 7, 'l2_leaf_reg': 6.148612001654247, 'loss_function': 'Logloss'}. Best is trial 0 with value: 0.7935299083598109.


Best trial:
  Value: 0.794
  Params:  {'iterations': 648, 'learning_rate': 0.01450600477517221, 'depth': 7, 'l2_leaf_reg': 6.148612001654247, 'loss_function': 'Logloss'}


In [36]:
from sklearn.ensemble import RandomForestClassifier

def best_params_for_model(trial):

    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
    }

    rf_clf = RandomForestClassifier(**param)

    auc = cross_val_score(rf_clf, X_train_filled, y_train, cv=5, scoring='roc_auc', n_jobs=-1).mean()
    return auc

study = optuna.create_study(direction='maximize')
study.optimize(best_params_for_model, n_trials=10)

print('Best trial:')
best_params = study.best_params
print('  Value: {:.3f}'.format(study.best_value))
print('  Params: ', best_params)

best_rf_model = RandomForestClassifier(**best_params)


[I 2024-05-19 23:18:36,017] A new study created in memory with name: no-name-265eb981-0206-47e2-8c55-40e6b8762455
[I 2024-05-19 23:18:49,129] Trial 0 finished with value: 0.791636800483243 and parameters: {'n_estimators': 338, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 9, 'max_features': 'log2'}. Best is trial 0 with value: 0.791636800483243.
[I 2024-05-19 23:19:00,900] Trial 1 finished with value: 0.7871423463792844 and parameters: {'n_estimators': 402, 'max_depth': 5, 'min_samples_split': 14, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.791636800483243.
[I 2024-05-19 23:19:08,352] Trial 2 finished with value: 0.7886339914937106 and parameters: {'n_estimators': 186, 'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.791636800483243.
[I 2024-05-19 23:19:41,108] Trial 3 finished with value: 0.7895282737718501 and parameters: {'n_estimators': 826, 'max_depth': 7, 'min_sam

Best trial:
  Value: 0.792
  Params:  {'n_estimators': 338, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 9, 'max_features': 'log2'}


In [37]:
from sklearn.linear_model import LogisticRegression

def best_params_for_model(trial):

    param = {
        'C': trial.suggest_loguniform('C', 0.01, 10),
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
        'solver': trial.suggest_categorical('solver', ['liblinear', 'saga']),
        'max_iter': trial.suggest_int('max_iter', 100, 1000),
    }

    lr_clf = LogisticRegression(**param)

    auc = cross_val_score(lr_clf, X_train_filled_logistic, y_train, cv=5, scoring='roc_auc', n_jobs=-1).mean()
    return auc

study_lr = optuna.create_study(direction='maximize')
study_lr.optimize(best_params_for_model, n_trials=10)

print('Best trial:')
best_params_lr = study_lr.best_params
print('  Value: {:.3f}'.format(study_lr.best_value))
print('  Params: ', best_params_lr)

best_lr_model = LogisticRegression(**best_params_lr)


[I 2024-05-19 23:21:32,559] A new study created in memory with name: no-name-e87d3bb5-81f0-44e3-bf61-f5d0fc97b3a4
  'C': trial.suggest_loguniform('C', 0.01, 10),
[I 2024-05-19 23:21:33,239] Trial 0 finished with value: 0.7504029670231855 and parameters: {'C': 0.6476400005165418, 'penalty': 'l1', 'solver': 'saga', 'max_iter': 192}. Best is trial 0 with value: 0.7504029670231855.
  'C': trial.suggest_loguniform('C', 0.01, 10),
[I 2024-05-19 23:21:33,454] Trial 1 finished with value: 0.7504023307317867 and parameters: {'C': 0.5409731787251858, 'penalty': 'l1', 'solver': 'liblinear', 'max_iter': 967}. Best is trial 0 with value: 0.7504029670231855.
  'C': trial.suggest_loguniform('C', 0.01, 10),
[I 2024-05-19 23:21:34,633] Trial 2 finished with value: 0.7504099157099058 and parameters: {'C': 3.7401400156016407, 'penalty': 'l1', 'solver': 'saga', 'max_iter': 210}. Best is trial 2 with value: 0.7504099157099058.
  'C': trial.suggest_loguniform('C', 0.01, 10),
[I 2024-05-19 23:21:34,839] Tria

Best trial:
  Value: 0.751
  Params:  {'C': 0.05160227718742661, 'penalty': 'l2', 'solver': 'liblinear', 'max_iter': 429}


In [38]:

def best_params_for_model(trial):
    param = {
        'C': trial.suggest_loguniform('C', 0.1, 10),  # Regularization parameter
        'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),  # Kernel type
        'degree': trial.suggest_int('degree', 2, 5),  # Degree for polynomial kernel (only for 'poly' kernel)
        'gamma': trial.suggest_categorical('gamma', ['auto', 'scale']),  # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
    }

    svc = SVC(**param)


    # Perform cross-validation and calculate mean score
    scores = cross_val_score(svc, X_train_filled_svc, y_train, cv=3, scoring='accuracy', n_jobs=-1)

    # Calculate the mean score
    score = scores.mean()

    return score

study = optuna.create_study(direction='maximize')

study.optimize(best_params_for_model, n_trials=1)

print('Best trial:')
best_params = study.best_params
print('  Value: {:.3f}'.format(study.best_value))
print('  Params: ', best_params)
best_svc_model = SVC(**best_params,probability=True)

[I 2024-05-19 23:21:37,189] A new study created in memory with name: no-name-8991b5a2-5a48-487a-ac01-9f8b26f2142a
  'C': trial.suggest_loguniform('C', 0.1, 10),  # Regularization parameter
[I 2024-05-19 23:21:45,390] Trial 0 finished with value: 0.8241146711635751 and parameters: {'C': 0.7967498577182166, 'kernel': 'rbf', 'degree': 2, 'gamma': 'auto'}. Best is trial 0 with value: 0.8241146711635751.


Best trial:
  Value: 0.824
  Params:  {'C': 0.7967498577182166, 'kernel': 'rbf', 'degree': 2, 'gamma': 'auto'}


#### Model Building with optimized hyperparameters

In [39]:
models_optimized = []

models_optimized.extend([
    ('XGBoost Optuna', best_xgb_model),
    ('LightGBM Optuna', best_lgb_model),
    ('CatBoost Optuna', best_cb_model),
    ('RandomForest Optuna', best_rf_model),
    ('Logistic Regression Optuna', best_lr_model),
    ('CatBoost_Custom Optuna', best_cb_custom_model),
    ('SVC Optuna', best_svc_model)
])

#### Train and Evaluate the models with optimized hyperparameters

In [40]:
gini_df_optuna = pd.DataFrame(columns=['Model', 'Gini_prob'])

for model_name, model in models_optimized:
    if model_name == 'CatBoost_Custom Optuna':
        gini_prob = train_and_evaluate_model(model_name, model, X_train_cat, y_train_cat, X_test_cat, y_test_cat)
    elif model_name == 'RandomForest Optuna':
        gini_prob = train_and_evaluate_model(model_name, model, X_train_filled, y_train, X_test_filled, y_test)
    elif model_name == 'SVC Optuna':
        gini_prob = train_and_evaluate_model(model_name, model, X_train_filled_svc, y_train,X_test_filled_svc, y_test)
    elif model_name == 'Logistic Regression Optuna':
        gini_prob = train_and_evaluate_model(model_name, model, X_train_filled_logistic, y_train_logic, X_test_filled_logistic, y_test_logic)
    else:
        gini_prob = train_and_evaluate_model(model_name, model, X_train, y_train, X_test, y_test)

    if gini_prob is not None:
        gini_df_optuna = pd.concat([gini_df_optuna, pd.DataFrame({'Model': [model_name], 'Gini_prob': [gini_prob]})], ignore_index=True)

gini_df_sorted_optuna = gini_df_optuna.sort_values(by='Gini_prob', ascending=False)

Model Performance for XGBoost Optuna
Gini prob is 60.069364779570236
              precision    recall  f1-score   support

           0       0.85      0.95      0.90      5987
           1       0.67      0.40      0.50      1638

    accuracy                           0.83      7625
   macro avg       0.76      0.67      0.70      7625
weighted avg       0.81      0.83      0.81      7625

[[5663  324]
 [ 988  650]]
[LightGBM] [Info] Number of positive: 3934, number of negative: 13856
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000786 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3200
[LightGBM] [Info] Number of data points in the train set: 17790, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221135 -> initscore=-1.259062
[LightGBM] [Info] Start training from score -1.259062
Model Performance fo

In [41]:
gini_df_sorted_optuna

Unnamed: 0,Model,Gini_prob
0,XGBoost Optuna,0.600694
1,LightGBM Optuna,0.598692
5,CatBoost_Custom Optuna,0.598085
2,CatBoost Optuna,0.594451
3,RandomForest Optuna,0.592812
4,Logistic Regression Optuna,0.492055
6,SVC Optuna,0.47072


#### Final Review

In [47]:
final_review = pd.concat([gini_df_sorted,gini_df_sorted_optuna], axis=0)

final_review_sorted = final_review.sort_values(by='Gini_prob', ascending=False)
final_review_sorted.reset_index(drop=True, inplace=True)

final_review_sorted

Unnamed: 0,Model,Gini_prob
0,XGBoost Optuna,0.600694
1,LightGBM Optuna,0.598692
2,CatBoost_Custom Optuna,0.598085
3,CatBoost Optuna,0.594451
4,CatBoost_Custom,0.593442
5,RandomForest Optuna,0.592812
6,LightGBM,0.592773
7,CatBoost,0.589693
8,Random Forest,0.564572
9,XGBoost,0.562414


#### Stacking Classifier

In [48]:
from mlxtend.classifier import StackingCVClassifier

In [49]:
# Define the base classifiers
clf1 = best_cb_model
clf2 = best_lgb_model
meta_classifier = best_xgb_model

In [50]:
# Define the stacking classifier
stacking_classifier = StackingCVClassifier(classifiers=[clf1, clf2],
                                           meta_classifier=meta_classifier,
                                           cv=5,
                                           use_probas=True,
                                           use_features_in_secondary=True,
                                           verbose=1,
                                           random_state=42)

In [51]:
train_and_evaluate_model('Stacking Classifier with XGB, Catboost and LGBM', stacking_classifier, X_train, y_train, X_test, y_test)

Fitting 2 classifiers...
Fitting classifier1: catboostclassifier (1/2)
0:	learn: 0.6238860	total: 5.89ms	remaining: 1.27s
1:	learn: 0.5742419	total: 11.3ms	remaining: 1.21s
2:	learn: 0.5355114	total: 17ms	remaining: 1.21s
3:	learn: 0.5068476	total: 22.8ms	remaining: 1.21s
4:	learn: 0.4853570	total: 28.2ms	remaining: 1.19s
5:	learn: 0.4697071	total: 34ms	remaining: 1.2s
6:	learn: 0.4577554	total: 39.5ms	remaining: 1.18s
7:	learn: 0.4498235	total: 44.7ms	remaining: 1.17s
8:	learn: 0.4431626	total: 50.5ms	remaining: 1.17s
9:	learn: 0.4372771	total: 56.2ms	remaining: 1.16s
10:	learn: 0.4328502	total: 62.1ms	remaining: 1.16s
11:	learn: 0.4297438	total: 68.5ms	remaining: 1.17s
12:	learn: 0.4274280	total: 74.2ms	remaining: 1.16s
13:	learn: 0.4251260	total: 80.9ms	remaining: 1.17s
14:	learn: 0.4228447	total: 86.7ms	remaining: 1.17s
15:	learn: 0.4213762	total: 93ms	remaining: 1.17s
16:	learn: 0.4203488	total: 99.2ms	remaining: 1.17s
17:	learn: 0.4191557	total: 105ms	remaining: 1.17s
18:	learn: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


27:	learn: 0.4133492	total: 167ms	remaining: 1.13s
28:	learn: 0.4130071	total: 174ms	remaining: 1.12s
29:	learn: 0.4126353	total: 180ms	remaining: 1.12s
30:	learn: 0.4121746	total: 186ms	remaining: 1.11s
31:	learn: 0.4116710	total: 192ms	remaining: 1.11s
32:	learn: 0.4111273	total: 198ms	remaining: 1.1s
33:	learn: 0.4107397	total: 203ms	remaining: 1.09s
34:	learn: 0.4104351	total: 209ms	remaining: 1.08s
35:	learn: 0.4100742	total: 215ms	remaining: 1.08s
36:	learn: 0.4097791	total: 221ms	remaining: 1.07s
37:	learn: 0.4095335	total: 226ms	remaining: 1.07s
38:	learn: 0.4092936	total: 232ms	remaining: 1.06s
39:	learn: 0.4090615	total: 237ms	remaining: 1.05s
40:	learn: 0.4087097	total: 243ms	remaining: 1.04s
41:	learn: 0.4084188	total: 250ms	remaining: 1.04s
42:	learn: 0.4081734	total: 256ms	remaining: 1.03s
43:	learn: 0.4078969	total: 262ms	remaining: 1.03s
44:	learn: 0.4076323	total: 267ms	remaining: 1.02s
45:	learn: 0.4073600	total: 273ms	remaining: 1.01s
46:	learn: 0.4071271	total: 278m

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LightGBM] [Info] Number of positive: 3147, number of negative: 11085
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000657 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3199
[LightGBM] [Info] Number of data points in the train set: 14232, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221121 -> initscore=-1.259143
[LightGBM] [Info] Start training from score -1.259143
[LightGBM] [Info] Number of positive: 3147, number of negative: 11085
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000628 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3196
[LightGBM] [Info] Number of data points in the train set: 14232, number of used features: 27
[LightGBM] [Info] [b

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.9s finished


0:	learn: 0.6257638	total: 6.77ms	remaining: 1.46s
1:	learn: 0.5747842	total: 13.5ms	remaining: 1.46s
2:	learn: 0.5364825	total: 20ms	remaining: 1.43s
3:	learn: 0.5083675	total: 25.9ms	remaining: 1.38s
4:	learn: 0.4867649	total: 32.1ms	remaining: 1.36s
5:	learn: 0.4717576	total: 38.8ms	remaining: 1.36s
6:	learn: 0.4598082	total: 45.4ms	remaining: 1.36s
7:	learn: 0.4511609	total: 59.6ms	remaining: 1.55s
8:	learn: 0.4443435	total: 66.2ms	remaining: 1.53s
9:	learn: 0.4397137	total: 72.2ms	remaining: 1.49s
10:	learn: 0.4355341	total: 78.9ms	remaining: 1.48s
11:	learn: 0.4323333	total: 85.6ms	remaining: 1.46s
12:	learn: 0.4297360	total: 93ms	remaining: 1.46s
13:	learn: 0.4275826	total: 99.1ms	remaining: 1.44s
14:	learn: 0.4258624	total: 106ms	remaining: 1.43s
15:	learn: 0.4242976	total: 120ms	remaining: 1.5s
16:	learn: 0.4232051	total: 128ms	remaining: 1.5s
17:	learn: 0.4222965	total: 148ms	remaining: 1.63s
18:	learn: 0.4215961	total: 156ms	remaining: 1.63s
19:	learn: 0.4206333	total: 178ms

0.593355607887093

#### Univariate Feature Selection

In [52]:
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve, auc

variables = []
train_gini_scores = []
test_gini_scores = []


for i in X_train.columns:
    X_train_single_var = X_train[[i]]
    X_test_single_var = X_test[[i]]

    best_xgb_model.fit(X_train_single_var, y_train)
    y_pred_train_single_var = best_xgb_model.predict_proba(X_train_single_var)[:, 1]
    train_roc = roc_auc_score(y_train, y_pred_train_single_var)
    train_gini = 2 * train_roc - 1

    y_pred_test_single_var = best_xgb_model.predict_proba(X_test_single_var)[:, 1]
    test_roc = roc_auc_score(y_test, y_pred_test_single_var)
    test_gini = 2 * test_roc - 1

    variables.append(i)
    train_gini_scores.append(train_gini)
    test_gini_scores.append(test_gini)

results_df = pd.DataFrame({
    'Variable': variables,
    'Train Gini': train_gini_scores,
    'Test Gini': test_gini_scores
})

results_df_sorted = results_df.sort_values(by='Test Gini', ascending=False)

pd.options.display.float_format = '{:.4f}'.format

results_df_sorted


Unnamed: 0,Variable,Train Gini,Test Gini
2,pay_1,0.4611,0.4396
3,pay_2,0.3472,0.3342
4,pay_3,0.3175,0.3071
0,limit_bal,0.2822,0.2695
5,pay_4,0.2929,0.2665
6,pay_5,0.2744,0.2574
15,pay_amt2,0.2606,0.2547
7,pay_6,0.2567,0.2448
14,pay_amt1,0.2855,0.2425
16,pay_amt3,0.2516,0.2057


In [53]:
results_df[results_df['Test Gini']>0.2].Variable.tolist()

['limit_bal',
 'pay_1',
 'pay_2',
 'pay_3',
 'pay_4',
 'pay_5',
 'pay_6',
 'pay_amt1',
 'pay_amt2',
 'pay_amt3',
 'pay_amt4',
 'pay_amt6']

#### Train and Evaluate the models with selected features

In [54]:
fin_input = new_data[results_df[results_df['Test Gini']>0.2].Variable.tolist()]
fin_output =data['default']
X_train_uni, X_test_uni, y_train_uni, y_test_uni = train_test_split(fin_input, fin_output, test_size=0.3, random_state=42)
best_xgb_model_fin = best_xgb_model.fit(X_train_uni, y_train_uni)

#### Train and Evaluate the models with selected features

In [55]:
train_and_evaluate_model('XGB with Optuna for selected features', best_xgb_model_fin, X_train_uni, y_train_uni, X_test_uni, y_test_uni)

Model Performance for XGB with Optuna for selected features
Gini prob is 58.42677449492217
              precision    recall  f1-score   support

           0       0.85      0.95      0.90      5987
           1       0.67      0.40      0.50      1638

    accuracy                           0.83      7625
   macro avg       0.76      0.67      0.70      7625
weighted avg       0.81      0.83      0.81      7625

[[5670  317]
 [ 983  655]]


0.5842677449492217