In [1]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
from category_encoders.one_hot import OneHotEncoder

from interpret import show
from interpret.blackbox import ShapKernel

import plotly.express as px

import pandas as pd
import numpy as np

In [2]:
final_dataset = pd.read_csv("./DATA/final_dataset.csv")
final_dataset.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DEF_30_RATIO_SOCIAL_CIRCLE,DEF_60_RATIO_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU,DEFAULT_COUNT
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,27.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
len(final_dataset)

307511

In [4]:
train_data = final_dataset.sample(n=200000)
train_data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DEF_30_RATIO_SOCIAL_CIRCLE,DEF_60_RATIO_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU,DEFAULT_COUNT
283890,428769,0,Revolving loans,F,N,Y,1,45000.0,135000.0,6750.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
246361,385112,0,Cash loans,M,Y,Y,0,157500.0,215640.0,17419.5,...,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,6.0,0.0
47255,154720,0,Cash loans,M,N,Y,0,202500.0,327024.0,21028.5,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0
148320,271973,0,Cash loans,F,N,Y,0,157500.0,517500.0,21883.5,...,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,6.0,0.0
161129,286788,0,Cash loans,M,Y,Y,0,112500.0,234576.0,18936.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
train_data.reset_index(drop=True, inplace=True)
train_data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DEF_30_RATIO_SOCIAL_CIRCLE,DEF_60_RATIO_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU,DEFAULT_COUNT
0,428769,0,Revolving loans,F,N,Y,1,45000.0,135000.0,6750.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,385112,0,Cash loans,M,Y,Y,0,157500.0,215640.0,17419.5,...,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,6.0,0.0
2,154720,0,Cash loans,M,N,Y,0,202500.0,327024.0,21028.5,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0
3,271973,0,Cash loans,F,N,Y,0,157500.0,517500.0,21883.5,...,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,6.0,0.0
4,286788,0,Cash loans,M,Y,Y,0,112500.0,234576.0,18936.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### One Hot Encode Categories

In [6]:
cat_col_idxs = [idx for idx,col_type in enumerate(train_data.dtypes) if col_type=='O']
train_data.iloc[:,cat_col_idxs].head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,OCCUPATION_TYPE,WEEKDAY_APPR_PROCESS_START,ORGANIZATION_TYPE
0,Revolving loans,F,N,Y,Unaccompanied,Working,Higher education,Married,House / apartment,Core staff,WEDNESDAY,Medicine
1,Cash loans,M,Y,Y,Unaccompanied,Working,Higher education,Single / not married,House / apartment,Unknown,THURSDAY,Business Entity Type 3
2,Cash loans,M,N,Y,Family,Working,Secondary / secondary special,Married,House / apartment,Laborers,THURSDAY,Business Entity Type 3
3,Cash loans,F,N,Y,Family,Working,Secondary / secondary special,Married,House / apartment,Core staff,SATURDAY,Trade: type 5
4,Cash loans,M,Y,Y,Unaccompanied,Working,Secondary / secondary special,Single / not married,Rented apartment,Laborers,WEDNESDAY,Business Entity Type 3


In [7]:
encoder = OneHotEncoder(handle_unknown='ignore', use_cat_names=True)

In [8]:
encoder_df = encoder.fit_transform(train_data.iloc[:,cat_col_idxs])
encoder_df.head()

Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
NumExpr defaulting to 8 threads.


Unnamed: 0,NAME_CONTRACT_TYPE_Revolving loans,NAME_CONTRACT_TYPE_Cash loans,CODE_GENDER_F,CODE_GENDER_M,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_Y,FLAG_OWN_REALTY_N,NAME_TYPE_SUITE_Unaccompanied,NAME_TYPE_SUITE_Family,...,ORGANIZATION_TYPE_Realtor,ORGANIZATION_TYPE_Legal Services,ORGANIZATION_TYPE_Cleaning,ORGANIZATION_TYPE_Trade: type 1,ORGANIZATION_TYPE_Industry: type 10,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Religion,ORGANIZATION_TYPE_Industry: type 13,ORGANIZATION_TYPE_Industry: type 6,ORGANIZATION_TYPE_Industry: type 8
0,1,0,1,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,1,1,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,1,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
del_cols = [col for idx,col in enumerate(train_data.columns) if idx in cat_col_idxs]
for col in del_cols:
    del train_data[col]

In [10]:
train_data = train_data.join(encoder_df)
train_data.head()

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,ORGANIZATION_TYPE_Realtor,ORGANIZATION_TYPE_Legal Services,ORGANIZATION_TYPE_Cleaning,ORGANIZATION_TYPE_Trade: type 1,ORGANIZATION_TYPE_Industry: type 10,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Religion,ORGANIZATION_TYPE_Industry: type 13,ORGANIZATION_TYPE_Industry: type 6,ORGANIZATION_TYPE_Industry: type 8
0,428769,0,1,45000.0,135000.0,6750.0,135000.0,0.018634,-13670,-6507,...,0,0,0,0,0,0,0,0,0,0
1,385112,0,0,157500.0,215640.0,17419.5,180000.0,0.006207,-9771,-285,...,0,0,0,0,0,0,0,0,0,0
2,154720,0,0,202500.0,327024.0,21028.5,270000.0,0.018634,-17086,-1253,...,0,0,0,0,0,0,0,0,0,0
3,271973,0,0,157500.0,517500.0,21883.5,517500.0,0.030755,-18135,-2426,...,0,0,0,0,0,0,0,0,0,0
4,286788,0,0,112500.0,234576.0,18936.0,202500.0,0.018029,-13750,-1210,...,0,0,0,0,0,0,0,0,0,0


### Train Model

In [11]:
x_cols = list(train_data.columns[2:])

X_data = train_data[x_cols]
y_data = train_data['TARGET']

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.30)

In [12]:
rfc = BalancedRandomForestClassifier(n_estimators=100)

In [13]:
rfc.fit(X_train, y_train)
y_pred=rfc.predict(X_test)

In [14]:
confusion_matrix(y_pred,y_test)

array([[37649,  1606],
       [17490,  3255]], dtype=int64)

In [15]:
accuracy_score(y_pred,y_test)

0.6817333333333333

In [16]:
precision, recall, _, _ = precision_recall_fscore_support(y_pred,y_test)
print("Precision: " + str(precision[1]))
print("Recall: " + str(recall[1]))

Precision: 0.669615305492697
Recall: 0.1569052783803326


### Feature Importance

In [17]:
imp_df = pd.DataFrame({'Feature':X_train.columns,'Importance':rfc.feature_importances_})
imp_df.sort_values(by='Importance', ascending=False, inplace=True)
px.bar(x='Importance', y='Feature', data_frame=imp_df.head(20), orientation='h')

### Local Explanation

In [18]:
shap = ShapKernel(predict_fn=rfc.predict_proba, data=X_train.sample(200))

Using 200 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


In [20]:
shap_local = shap.explain_local(X_test[:5], y_test[:5])

show(shap_local)

  0%|          | 0/5 [00:00<?, ?it/s]num_full_subsets = 1
remaining_weight_vector = [0.11293291 0.07585469 0.05732201 0.04620766 0.03880259 0.03351719
 0.02955666 0.02647946 0.02402065 0.02201165 0.02034007 0.01892809
 0.01772015 0.01667549 0.01576355 0.01496096 0.01424954 0.01361494
 0.0130457  0.01253251 0.01206779 0.01164526 0.01125968 0.01090667
 0.01058252 0.01028407 0.0100086  0.00975379 0.00951762 0.00929832
 0.00909436 0.00890439 0.00872723 0.00856182 0.00840723 0.00826263
 0.00812729 0.00800053 0.00788178 0.00777048 0.00766617 0.0075684
 0.00747678 0.00739097 0.00731063 0.00723548 0.00716525 0.0070997
 0.00703861 0.00698178 0.00692903 0.0068802  0.00683514 0.00679372
 0.00675581 0.0067213  0.0066901  0.00666212 0.00663729 0.00661552
 0.00659678 0.006581   0.00656815 0.00655818 0.00655109 0.00654684
 0.00654542]
num_paired_subset_sizes = 67
weight_left = 0.8163739342744519
np.sum(w_aug) = 136.00000000000003
np.sum(self.kernelWeights) = 1.0
phi = [ 0.0057911  -0.00682358 -0.0097

Generating mini dash
Generated mini dash
No overall plot to display: -1|ShapKernel_1
