In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,confusion_matrix,accuracy_score,recall_score,precision_score,classification_report,roc_auc_score
import shap
import catboost

In [3]:
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd

# libaries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
plt.style.use('ggplot')

# to visualize all the columns 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder

In [6]:
cat_col_final = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE']
num_col_final = ['TARGET', 'EXT_SOURCE_2','EXT_SOURCE_3', 'DAYS_REGISTRATION', 'DAYS_LAST_PHONE_CHANGE']

In [7]:
#. load the training data ## Pipeline for Data Processing
df = pd.read_csv('/Users/zio/PycharmProjects/HomeLoanDefaultDetector/data_new.csv')
df.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,TARGET,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_REGISTRATION,DAYS_LAST_PHONE_CHANGE
0,Cash loans,F,N,Commercial associate,Secondary / secondary special,0,0.555561,0.514747,-2671.0,-1482.0
1,Revolving loans,M,Y,Working,Secondary / secondary special,0,0.577857,0.3808,-2382.0,-1109.0
2,Cash loans,F,N,State servant,Secondary / secondary special,0,0.749873,0.542445,-8297.0,-2255.0
3,Cash loans,F,N,Pensioner,Secondary / secondary special,0,0.680649,0.42413,-255.0,-192.0
4,Cash loans,F,N,Working,Higher education,0,0.526164,0.597192,-8665.0,-1209.0


In [8]:
y = df['TARGET']
X = df[num_col_final + cat_col_final].drop('TARGET', axis = 1)

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.2,
                                                    stratify = y,
                                                    random_state = 42
                                                   )

In [10]:
params = {
          'learning_rate':0.1,
          'cat_features':cat_col_final,
          'depth':6,
          'eval_metric':'AUC',
          'verbose':200,
          'od_type':"Iter", # overfit detector
          'od_wait':500, # most recent best iteration to wait before stopping
          #'random_seed': 2,
          #'n_estimators': 200
          }


cat_model = catboost.CatBoostClassifier(**params)
cat_model.fit(X_train, y_train,   
          eval_set=(X_test, y_test), 
          use_best_model=True, # True if we don't want to save trees created after iteration with the best validation score
          plot=True  
         );

pred =  cat_model.predict(X_test)


print(classification_report(y_test,pred))

print('test-set confusion matrix:\n') 
confusion_matrix_df = pd.DataFrame(confusion_matrix(y_test,pred))
confusion_matrix_df.columns = ['Predicted 0','Predicted 1'] 
confusion_matrix_df.index = ['actual 0','actual 1'] 
print(confusion_matrix_df)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.6487102	best: 0.6487102 (0)	total: 62.3ms	remaining: 1m 2s
200:	test: 0.7230742	best: 0.7473675 (27)	total: 516ms	remaining: 2.05s
400:	test: 0.7227562	best: 0.7473675 (27)	total: 954ms	remaining: 1.43s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.7473674912
bestIteration = 27

Shrink model to first 28 iterations.
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       566
           1       1.00      0.02      0.04        50

    accuracy                           0.92       616
   macro avg       0.96      0.51      0.50       616
weighted avg       0.93      0.92      0.88       616

test-set confusion matrix:

          Predicted 0  Predicted 1
actual 0          566            0
actual 1           49            1


In [15]:
cat_model.save_model('default_detector_catboost')

In [18]:
explainer = shap.Explainer(cat_model)
shap_values = explainer(X_test)

shapvaluedf = pd.DataFrame(shap_values.values, columns= X.columns)
shapdatadf = pd.DataFrame(shap_values.data, columns= X.columns)

shapdatadf.to_csv(r'shapdatadf.csv')
shapvaluedf.to_csv(r'shapvaluedf.csv')