# Model

In [1]:
!pip install imbalanced-learn==0.8.0



In [2]:
!pip install xgboost==1.4.0



In [3]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot
from matplotlib.pyplot import figure
%matplotlib inline
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, log_loss
from math import sqrt

from imblearn.over_sampling import SMOTE

## Settings

In [4]:
RAW_DATA_DIR = "./../data/raw"
INTERIM_DATA_DIR = "./../data/interim"
PROCESSED_DATA_DIR = "./../data/processed"
VERSION = "v1.0"
params = {'axes.titlesize':'32',
          'xtick.labelsize':'24',
          'ytick.labelsize':'24'}
matplotlib.rcParams.update(params)

## Load Processed Data

In [5]:
rmoutliers_scaled_df = pd.read_csv(f'{INTERIM_DATA_DIR}/Remove-Outliers-MinMaxScaled-v1.0.csv')
rmoutliers_df = pd.read_csv(f'{INTERIM_DATA_DIR}/Remove-Outliers-v1.0.csv')

## Train Test Split

In [6]:
rmoutliers_scaled_df.columns

Index(['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents'],
      dtype='object')

In [7]:
X = rmoutliers_scaled_df[['RevolvingUtilizationOfUnsecuredLines', 'age',
       'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans']] # Based on analysis from recursive feature elimination and xgb feature importance
y = rmoutliers_scaled_df['SeriousDlqin2yrs']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
X_train02, X_test02, y_train02, y_test02 = train_test_split(rmoutliers_df[['RevolvingUtilizationOfUnsecuredLines', 'age',
       'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans']], rmoutliers_df[['SeriousDlqin2yrs']], test_size=0.2, random_state=123)

# define min max scaler
scaler = MinMaxScaler()
scaler_X = scaler.fit(X_train02)
scaler_y = scaler.fit(y_train02)

X_resampled, y_resampled = SMOTE(sampling_strategy='minority', random_state=123).fit_resample(X_train, y_train)

## Base Model with Kfolds

In [8]:
# eval_set = [(X_test, y_test)]
# model = XGBClassifier(use_label_encoder=False, seed=123, random_state=123)
# model.fit(X_train, y_train, eval_metric="auc", eval_set=eval_set, verbose=True)

# parameters = {
#     "objective": ["binary:logistic"],
#     "eval_metric": ["auc"],
#     "eta": [0.05, 0.075, 0.1, 0.15],
#     "max_depth": [1, 2, 3, 4, 5, 6, 7, 8, 9],
#     "min_child_weight": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
#     "subsample": [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
#     "colsample_bytree": [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
# }

parameters = {
    "objective": ["binary:logistic"],
    "eval_metric": ["auc"],
}

kfold = KFold(n_splits=10, shuffle=True, random_state=123)
model = XGBClassifier(use_label_encoder=False, objective="binary:logistic", seed=123, random_state=123)
grid_search = GridSearchCV(estimator=model, param_grid=parameters, scoring='roc_auc', cv=kfold)
grid_search.fit(X_resampled, y_resampled)

GridSearchCV(cv=KFold(n_splits=10, random_state=123, shuffle=True),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=123,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, seed=123,
            

## Metrics

In [9]:
y_pred_prob = grid_search.predict_proba(X_test)

In [10]:
y_pred_prob

array([[0.9432499 , 0.05675013],
       [0.95315164, 0.04684836],
       [0.5890729 , 0.41092712],
       ...,
       [0.8063066 , 0.19369343],
       [0.884211  , 0.11578898],
       [0.89845365, 0.10154637]], dtype=float32)

In [11]:
print(f"AUROC: {roc_auc_score(y_test, y_pred_prob[:, 1])}")

AUROC: 0.7691119297718343


In [12]:
y_pred = grid_search.predict(X_test)

In [13]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.94      0.98      0.96     20101
           1       0.34      0.11      0.16      1455

    accuracy                           0.93     21556
   macro avg       0.64      0.55      0.56     21556
weighted avg       0.90      0.93      0.91     21556



In [14]:
confusion_matrix(y_test, y_pred)

array([[19787,   314],
       [ 1296,   159]])

In [15]:
log_loss(y_test, y_pred)

2.5796845921970353

## Load Test Data

In [16]:
test_df = pd.read_csv(f'{RAW_DATA_DIR}/cs-test.csv')

In [17]:
test_df_X = test_df[['RevolvingUtilizationOfUnsecuredLines', 'age',
       'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans']]

# define min max scaler
scaler02 = MinMaxScaler()
# transform data
scaled02 = scaler02.fit_transform(test_df_X)
print(scaled02[0])
scaled_test_df = pd.DataFrame(scaled02, columns=test_df_X.columns) 

test_X = scaled_test_df[['RevolvingUtilizationOfUnsecuredLines', 'age',
       'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans']]

[4.05810494e-05 2.65060241e-01 6.61556156e-07 7.37673094e-04
 4.70588235e-02]


In [18]:
preds = grid_search.predict_proba(test_X)

In [19]:
preds

array([[0.0094381 , 0.9905619 ],
       [0.01278126, 0.98721874],
       [0.02058339, 0.9794166 ],
       ...,
       [0.01173472, 0.9882653 ],
       [0.00578803, 0.994212  ],
       [0.4058137 , 0.5941863 ]], dtype=float32)

## Export Results

In [20]:
CV_Type = "KFold"
submission = pd.read_csv(f"{RAW_DATA_DIR}/sampleEntry.csv")
submission["Probability"]=preds[:, 1]
submission.to_csv(f'{PROCESSED_DATA_DIR}/submission-{CV_Type}.csv', index=False)

## Export Model

In [21]:
from datetime import datetime
import joblib

DATE = datetime.today().strftime('%Y-%m-%d')
MODEL_DIR = "./../models"
ALGO = "XGB"
CLASSIFIER = "Binary-Classifier"
joblib.dump(grid_search, f"{MODEL_DIR}/{ALGO}-{CLASSIFIER}-{CV_Type}-{DATE}.pkl")

['./../models/XGB-Binary-Classifier-KFold-2021-04-18.pkl']