# Model

In [1]:
!pip install imbalanced-learn==0.8.0



In [2]:
!pip install mljar-supervised==0.9.1



In [3]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot
from matplotlib.pyplot import figure
%matplotlib inline
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

from sklearn.metrics import roc_auc_score
from math import sqrt

from imblearn.over_sampling import SMOTE
from supervised import AutoML

RuntimeError: module compiled against API version 0xe but this version of numpy is 0xd

## Settings

In [4]:
RAW_DATA_DIR = "./../data/raw"
INTERIM_DATA_DIR = "./../data/interim"
PROCESSED_DATA_DIR = "./../data/processed"
VERSION = "v1.0"
params = {'axes.titlesize':'32',
          'xtick.labelsize':'24',
          'ytick.labelsize':'24'}
matplotlib.rcParams.update(params)

## Load Processed Data

In [5]:
rmoutliers_scaled_df = pd.read_csv(f'{INTERIM_DATA_DIR}/Remove-Outliers-MinMaxScaled-v1.0.csv')
rmoutliers_df = pd.read_csv(f'{INTERIM_DATA_DIR}/Remove-Outliers-v1.0.csv')

## Train Test Split

In [6]:
rmoutliers_scaled_df.columns

Index(['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents'],
      dtype='object')

In [7]:
X = rmoutliers_scaled_df[['RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents']] # Based on analysis from recursive feature elimination and xgb feature importance
y = rmoutliers_scaled_df['SeriousDlqin2yrs']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
X_train02, X_test02, y_train02, y_test02 = train_test_split(rmoutliers_df[['RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents']], rmoutliers_df[['SeriousDlqin2yrs']], test_size=0.2, random_state=123)

# define min max scaler
scaler = MinMaxScaler()
scaler_X = scaler.fit(X_train02)
scaler_y = scaler.fit(y_train02)

X_resampled, y_resampled = SMOTE(sampling_strategy='minority', random_state=123).fit_resample(X_train, y_train)

## Base Model

In [8]:
automl = AutoML(mode="Perform", ml_task='binary_classification', eval_metric='auc', golden_features=False, train_ensemble=False, stack_models=False, random_state=123)
automl.fit(X_resampled, y_resampled)

Linear algorithm was disabled.
AutoML directory: AutoML_1
The task is binary_classification with evaluation metric auc
AutoML will use algorithms: ['Random Forest', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network']
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 5 models
1_Default_LightGBM auc 0.989564 trained in 401.16 seconds (1-sample predict time 0.0227 seconds)
2_Default_Xgboost auc 0.988365 trained in 126.8 seconds (1-sample predict time 0.0257 seconds)
3_Default_CatBoost auc 0.98904 trained in 165.64 seconds (1-sample predict time 0.024 seconds)
4_Default_NeuralNetwork auc 0.895085 trained in 63.28 seconds (1-sample predict time 0.0223 seconds)
5_Default_RandomForest auc 0.927282 trained in 79.04 seconds (1-sample predict time 0.0928 seconds)
* Step not_so_

AutoML(eval_metric='auc', golden_features=False,
       ml_task='binary_classification', mode='Perform', random_state=123,
       stack_models=False, train_ensemble=False)

In [22]:
y_pred_prob = automl.predict_proba(X_test)

In [23]:
y_pred_prob

array([[0.97816936, 0.02183064],
       [0.98913798, 0.01086202],
       [0.43847366, 0.56152634],
       ...,
       [0.86731137, 0.13268863],
       [0.87668756, 0.12331244],
       [0.96459646, 0.03540354]])

In [24]:
print(f"AUROC: {roc_auc_score(y_test, y_pred_prob[:, 1]):.2f}%")

AUROC: 0.85%


## Load Test Data

In [25]:
test_df = pd.read_csv(f'{RAW_DATA_DIR}/cs-test.csv')

In [26]:
test_df_X = test_df[['RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents']]

# define min max scaler
scaler02 = MinMaxScaler()
# transform data
scaled02 = scaler02.fit_transform(test_df_X)
print(scaled02[0])
scaled_test_df = pd.DataFrame(scaled02, columns=test_df_X.columns) 

test_X = scaled_test_df[['RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents']]

[4.05810494e-05 2.65060241e-01 0.00000000e+00 6.61556156e-07
 7.37673094e-04 4.70588235e-02 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00]


In [27]:
preds = automl.predict_proba(test_X)

In [28]:
preds

array([[3.64065736e-01, 6.35934264e-01],
       [1.11715014e-03, 9.98882850e-01],
       [7.07595741e-05, 9.99929240e-01],
       ...,
       [3.32360446e-01, 6.67639554e-01],
       [8.58115854e-02, 9.14188415e-01],
       [3.29474103e-01, 6.70525897e-01]])

## Export Results

In [29]:
submission = pd.read_csv(f"{RAW_DATA_DIR}/sampleEntry.csv")
submission["Probability"]=preds[:, 1]
submission.to_csv(f'{PROCESSED_DATA_DIR}/submission.csv', index=False)