# Model

In [1]:
!pip install imbalanced-learn==0.8.0



In [2]:
!pip install xgboost==1.4.0



In [3]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot
from matplotlib.pyplot import figure
%matplotlib inline
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, log_loss
from math import sqrt

from imblearn.over_sampling import SMOTE

## Settings

In [4]:
RAW_DATA_DIR = "./../data/raw"
INTERIM_DATA_DIR = "./../data/interim"
PROCESSED_DATA_DIR = "./../data/processed"
VERSION = "v1.0"
params = {'axes.titlesize':'32',
          'xtick.labelsize':'24',
          'ytick.labelsize':'24'}
matplotlib.rcParams.update(params)

## Load Processed Data

In [5]:
rmoutliers_scaled_df = pd.read_csv(f'{INTERIM_DATA_DIR}/Remove-Outliers-MinMaxScaled-v1.0.csv')
rmoutliers_df = pd.read_csv(f'{INTERIM_DATA_DIR}/Remove-Outliers-v1.0.csv')

## Train Test Split

In [6]:
rmoutliers_scaled_df.columns

Index(['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents'],
      dtype='object')

In [7]:
X = rmoutliers_scaled_df[['RevolvingUtilizationOfUnsecuredLines', 'age',
       'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans']] # Based on analysis from recursive feature elimination and xgb feature importance
y = rmoutliers_scaled_df['SeriousDlqin2yrs']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
X_train02, X_test02, y_train02, y_test02 = train_test_split(rmoutliers_df[['RevolvingUtilizationOfUnsecuredLines', 'age',
       'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans']], rmoutliers_df[['SeriousDlqin2yrs']], test_size=0.2, random_state=123)

# define min max scaler
scaler = MinMaxScaler()
scaler_X = scaler.fit(X_train02)
scaler_y = scaler.fit(y_train02)

X_resampled, y_resampled = SMOTE(sampling_strategy='minority', random_state=123).fit_resample(X_train, y_train)

## Base Model

In [8]:
eval_set = [(X_test, y_test)]
model = XGBClassifier(use_label_encoder=False, seed=123, random_state=123)
model.fit(X_train, y_train, eval_metric="auc", eval_set=eval_set, verbose=True)

[0]	validation_0-auc:0.77607
[1]	validation_0-auc:0.78101
[2]	validation_0-auc:0.78287
[3]	validation_0-auc:0.78371
[4]	validation_0-auc:0.78499
[5]	validation_0-auc:0.78446
[6]	validation_0-auc:0.78605
[7]	validation_0-auc:0.78653
[8]	validation_0-auc:0.78709
[9]	validation_0-auc:0.78821
[10]	validation_0-auc:0.78869
[11]	validation_0-auc:0.78930
[12]	validation_0-auc:0.78919
[13]	validation_0-auc:0.78937
[14]	validation_0-auc:0.78914
[15]	validation_0-auc:0.78868
[16]	validation_0-auc:0.78887
[17]	validation_0-auc:0.78907
[18]	validation_0-auc:0.78842
[19]	validation_0-auc:0.78811
[20]	validation_0-auc:0.78825
[21]	validation_0-auc:0.78817
[22]	validation_0-auc:0.78787
[23]	validation_0-auc:0.78811
[24]	validation_0-auc:0.78825
[25]	validation_0-auc:0.78801
[26]	validation_0-auc:0.78774
[27]	validation_0-auc:0.78772
[28]	validation_0-auc:0.78734
[29]	validation_0-auc:0.78735
[30]	validation_0-auc:0.78746
[31]	validation_0-auc:0.78719
[32]	validation_0-auc:0.78715
[33]	validation_0-au

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=123,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=123,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

## Metrics

In [9]:
y_pred_prob = model.predict_proba(X_test)



In [10]:
y_pred_prob

array([[0.9476655 , 0.0523345 ],
       [0.9874702 , 0.01252981],
       [0.6680068 , 0.33199325],
       ...,
       [0.89708745, 0.10291255],
       [0.9445667 , 0.05543329],
       [0.9443627 , 0.0556373 ]], dtype=float32)

In [11]:
print(f"AUROC: {roc_auc_score(y_test, y_pred_prob[:, 1])}%")

AUROC: 0.7814241345808479%


In [12]:
y_pred = model.predict(X_test)



In [13]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96     20101
           1       0.43      0.04      0.08      1455

    accuracy                           0.93     21556
   macro avg       0.68      0.52      0.52     21556
weighted avg       0.90      0.93      0.90     21556



In [14]:
confusion_matrix(y_test, y_pred)

array([[20019,    82],
       [ 1394,    61]])

In [15]:
log_loss(y_test, y_pred)

2.3649702878955954

## Load Test Data

In [16]:
test_df = pd.read_csv(f'{RAW_DATA_DIR}/cs-test.csv')

In [17]:
test_df_X = test_df[['RevolvingUtilizationOfUnsecuredLines', 'age',
       'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans']]

# define min max scaler
scaler02 = MinMaxScaler()
# transform data
scaled02 = scaler02.fit_transform(test_df_X)
print(scaled02[0])
scaled_test_df = pd.DataFrame(scaled02, columns=test_df_X.columns) 

test_X = scaled_test_df[['RevolvingUtilizationOfUnsecuredLines', 'age',
       'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans']]

[4.05810494e-05 2.65060241e-01 6.61556156e-07 7.37673094e-04
 4.70588235e-02]


In [18]:
preds = model.predict_proba(test_X)



In [19]:
preds

array([[0.97624254, 0.02375744],
       [0.99012035, 0.00987964],
       [0.98047036, 0.01952962],
       ...,
       [0.99654734, 0.00345268],
       [0.990558  , 0.00944199],
       [0.9782989 , 0.02170108]], dtype=float32)

## Export Results

In [20]:
CV_Type = "NormalTrainTestSplit"
submission = pd.read_csv(f"{RAW_DATA_DIR}/sampleEntry.csv")
submission["Probability"]=preds[:, 1]
submission.to_csv(f'{PROCESSED_DATA_DIR}/submission-{CV_Type}.csv', index=False)

## Export Model

In [21]:
from datetime import datetime

DATE = datetime.today().strftime('%Y-%m-%d')
MODEL_DIR = "./../models"
ALGO = "XGB"
CLASSIFIER = "Binary-Classifier"
model.save_model(f"{MODEL_DIR}/{ALGO}-{CLASSIFIER}-{CV_Type}-{DATE}.pkl")