# Model

In [1]:
!pip install imbalanced-learn==0.8.0

Collecting imbalanced-learn==0.8.0
  Downloading imbalanced_learn-0.8.0-py3-none-any.whl (206 kB)
[K     |████████████████████████████████| 206 kB 15.6 MB/s eta 0:00:01
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.8.0


In [2]:
!pip install xgboost==1.4.0

Collecting xgboost==1.4.0
  Downloading xgboost-1.4.0-py3-none-manylinux2010_x86_64.whl (166.7 MB)
[K     |████████████████████████████████| 166.7 MB 60.4 MB/s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-1.4.0


In [3]:
import warnings
warnings.filterwarnings("ignore")

from math import sqrt

import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot
from matplotlib.pyplot import figure
%matplotlib inline
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, LeaveOneOut, GridSearchCV, StratifiedKFold
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score

from imblearn.over_sampling import SMOTE

## Settings

In [4]:
# Data directories, given relative to the notebook's working directory.
RAW_DATA_DIR = "./../data/raw"
INTERIM_DATA_DIR = "./../data/interim"
PROCESSED_DATA_DIR = "./../data/processed"

# Version tag; presumably matches the "v1.0" suffix of the interim CSV files.
VERSION = "v1.0"

# Font sizes for figure titles and tick labels, applied to every matplotlib figure.
params = {
    "axes.titlesize": "32",
    "xtick.labelsize": "24",
    "ytick.labelsize": "24",
}
matplotlib.rcParams.update(params)

## Load Processed Data

In [5]:
# Load the two interim CSV files. The file names carry the version tag, so it
# is taken from the VERSION setting instead of being repeated as a literal.
# Going by the file names, one frame is min-max scaled and the other is not.
rmoutliers_scaled_df = pd.read_csv(f'{INTERIM_DATA_DIR}/Remove-Outliers-MinMaxScaled-{VERSION}.csv')
rmoutliers_df = pd.read_csv(f'{INTERIM_DATA_DIR}/Remove-Outliers-{VERSION}.csv')

## Train Test Split

In [6]:
# List the columns of the scaled frame before choosing the target and the features.
rmoutliers_scaled_df.columns

Index(['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents'],
      dtype='object')

In [7]:
# Feature set based on analysis from recursive feature elimination and xgb feature importance.
FEATURE_COLUMNS = [
    'RevolvingUtilizationOfUnsecuredLines', 'age',
    'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
]
TARGET_COLUMN = 'SeriousDlqin2yrs'

X = rmoutliers_scaled_df[FEATURE_COLUMNS].to_numpy()
y = rmoutliers_scaled_df[TARGET_COLUMN].to_numpy()

# Stratify on the target so the hold-out set keeps the same class ratio as the
# training set; the classes are imbalanced enough that the training portion is
# oversampled below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123, stratify=y
)

# Oversample the minority class on the training portion only, so the hold-out
# set contains no synthetic rows.
X_resampled, y_resampled = SMOTE(sampling_strategy='minority', random_state=123).fit_resample(X_train, y_train)

## Base Model

In [None]:
# Cross-validated fit of the baseline XGBoost classifier on the SMOTE-resampled
# training data. The grid holds a single parameter combination, so the search
# mainly yields a cross-validated ROC AUC for that configuration plus a model
# refit on all of X_resampled.
#
# Shuffled stratified k-fold is used instead of LeaveOneOut: ROC AUC is
# undefined on a one-sample validation fold (only one class is present), and
# leave-one-out needs one model fit per training row.
# NOTE(review): X_resampled was oversampled before the folds are drawn, so
# synthetic SMOTE rows also land in validation folds. Treat the CV score as
# optimistic and rely on the X_test evaluation.
N_CV_FOLDS = 5

parameters = {
    "objective": ["binary:logistic"],
    "eval_metric": ["auc"],
}

cv = StratifiedKFold(n_splits=N_CV_FOLDS, shuffle=True, random_state=123)
model = XGBClassifier(use_label_encoder=False, objective="binary:logistic", seed=123, random_state=123)
grid_search = GridSearchCV(estimator=model, param_grid=parameters, scoring='roc_auc', cv=cv)
grid_search.fit(X_resampled, y_resampled)

In [None]:
# GridSearchCV fits clones of `model`, so `model` itself is never fitted.
# Predict through the search object, which delegates to the best estimator
# refit on the whole training input (refit=True is the default).
y_pred_prob = grid_search.predict_proba(X_test)

In [None]:
# Inspect the predicted class probabilities for the hold-out rows (X_test).
y_pred_prob

In [None]:
# AUROC is a score in [0, 1], not a percentage, so it is printed without a "%"
# suffix. Column 1 of predict_proba is passed as the positive-class score.
print(f"AUROC: {roc_auc_score(y_test, y_pred_prob[:, 1]):.4f}")

## Tuning

## Load Test Data

In [None]:
# Read the raw test CSV from the raw-data directory.
test_csv_path = f'{RAW_DATA_DIR}/cs-test.csv'
test_df = pd.read_csv(test_csv_path)

In [None]:
# Select the same feature columns, in the same order, that the model was trained on.
test_df_X = test_df[['RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents']]

# Scale the test features with min/max values learned from the training-side
# data instead of fitting a new scaler on the test set. A scaler fitted on the
# test set maps the same raw value to a different scaled value than the one the
# model saw during training.
# NOTE(review): assumes rmoutliers_df is the unscaled frame from which
# rmoutliers_scaled_df was produced with MinMaxScaler. Confirm against the
# preprocessing notebook.
scaler02 = MinMaxScaler()
scaler02.fit(rmoutliers_df[test_df_X.columns])
# Test values outside the training range scale to outside [0, 1].
scaled02 = scaler02.transform(test_df_X)
print(scaled02[0])
scaled_test_df = pd.DataFrame(scaled02, columns=test_df_X.columns)

# scaled_test_df already holds exactly the model's feature columns, so no
# second column selection is needed.
test_X = scaled_test_df

In [None]:
# `automl` is never defined in this notebook. Predict with the fitted grid
# search, which delegates to its refit best estimator.
preds = grid_search.predict_proba(test_X)

In [None]:
# Inspect the predicted class probabilities for the rows loaded from cs-test.csv.
preds

## Export Results

In [None]:
# Fill the sample-entry template with the second predict_proba column and
# write it to the processed-data directory without the index.
sample_entry_path = f"{RAW_DATA_DIR}/sampleEntry.csv"
submission = pd.read_csv(sample_entry_path)
submission["Probability"] = preds[:, 1]
submission_path = f'{PROCESSED_DATA_DIR}/submission.csv'
submission.to_csv(submission_path, index=False)