<h1><center>Credit Risk Analysis - v2</center></h1>
 

### imports

In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 999
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, normalize
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_curve, roc_auc_score
from scipy.stats import chi2_contingency,ttest_ind
from sklearn.utils import shuffle
import time

import warnings
warnings.filterwarnings('ignore')


In [2]:
%%javascript
// Override the output area's scroll decision so it always answers "no":
// long cell outputs are then shown in full instead of in a scrolling box.
// NOTE(review): relies on a global `IPython.OutputArea` being present in the
// front end that renders this notebook — confirm for the target environment.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}


<IPython.core.display.Javascript object>

## Load Dataset


Let's take a quick look at the dataset.


In [3]:
# Load the complete customer-history extract. `cust_pd` is the working name
# used by every later cell and currently refers to the full frame.
cust_pd_full = pd.read_csv('./data/CUST_HISTORY_10000.csv')
cust_pd = cust_pd_full

n_rows, n_cols = cust_pd_full.shape
print(f"There are {n_rows} observations in the customer history dataset.")
print(f"There are {n_cols} variables in the dataset.")


There are 10000 observations in the customer history dataset.
There are 19 variables in the dataset.


# Data Preparation

In [4]:
cust_pd.head()

Unnamed: 0,EMI_TENURE,CREDIT_HISTORY,TRANSACTION_CATEGORY,TRANSACTION_AMOUNT,ACCOUNT_TYPE,ACCOUNT_AGE,STATE,IS_URBAN,IS_STATE_BORDER,HAS_CO_APPLICANT,HAS_GUARANTOR,OWN_REAL_ESTATE,OTHER_INSTALMENT_PLAN,OWN_RESIDENCE,NUMBER_CREDITS,RFM_SCORE,OWN_CAR,SHIP_INTERNATIONAL,IS_DEFAULT
0,77,EXISTING CREDITS PAID BACK,EDUCATION,27630,UNKNOWN/NONE,above 7 YRS,CT,NO,YES,YES,YES,NO,YES,NO,0,4,NO,NO,No
1,119,EXISTING CREDITS PAID BACK,ELECTRONICS,31314,above 1000 K USD,4 to 7 YRS,CT,YES,YES,YES,YES,NO,NO,YES,0,3,YES,YES,No
2,84,EXISTING CREDITS PAID BACK,FURNITURE,27630,above 1000 K USD,4 to 7 YRS,PA,NO,NO,YES,YES,YES,NO,YES,0,3,YES,YES,No
3,119,DELAY IN PAST,FURNITURE,33156,above 1000 K USD,up to 1 YR,PA,YES,NO,YES,NO,NO,NO,YES,0,3,NO,NO,Yes
4,105,DELAY IN PAST,FURNITURE,23946,above 1000 K USD,up to 1 YR,CT,NO,YES,YES,YES,YES,YES,NO,0,3,YES,YES,No


## Split Dataframe into Features and Label

In [5]:
# Separate the target column from the predictors.
# Both frames are explicit copies: later cells assign encoded values into them
# in place, and `cust_pd` itself must keep the raw string values because the
# prediction demo further down reads records straight from it.
cust_pd_Y = cust_pd[['IS_DEFAULT']].copy()
cust_pd_X = cust_pd.drop(['IS_DEFAULT'], axis=1).copy()

print('cust_pd_X.shape=%s, cust_pd_Y.shape=%s'% (cust_pd_X.shape, cust_pd_Y.shape))


cust_pd_X.shape=(10000, 18), cust_pd_Y.shape=(10000, 1)


## Transform Label

In [6]:
cust_pd_Y.head()

Unnamed: 0,IS_DEFAULT
0,No
1,No
2,No
3,Yes
4,No


In [7]:
# Encode the target as integers (the output below shows 'No' -> 0, 'Yes' -> 1).
# The encoder is always fitted on the raw strings in `cust_pd`, and the label
# frame is rebuilt from scratch, so re-running this cell can never re-fit `le`
# on already-encoded 0/1 values and lose the class names that the prediction
# helpers later read from `le.classes_`.
le = LabelEncoder()
cust_pd_Y = cust_pd[['IS_DEFAULT']].copy()
cust_pd_Y['IS_DEFAULT'] = le.fit_transform(cust_pd['IS_DEFAULT'])
cust_pd_Y.head()

Unnamed: 0,IS_DEFAULT
0,0
1,0
2,0
3,1
4,0


## Transform Features

In [8]:
print(f'features df shape = {cust_pd_X.shape}')
cust_pd_X.head()

features df shape = (10000, 18)


Unnamed: 0,EMI_TENURE,CREDIT_HISTORY,TRANSACTION_CATEGORY,TRANSACTION_AMOUNT,ACCOUNT_TYPE,ACCOUNT_AGE,STATE,IS_URBAN,IS_STATE_BORDER,HAS_CO_APPLICANT,HAS_GUARANTOR,OWN_REAL_ESTATE,OTHER_INSTALMENT_PLAN,OWN_RESIDENCE,NUMBER_CREDITS,RFM_SCORE,OWN_CAR,SHIP_INTERNATIONAL
0,77,EXISTING CREDITS PAID BACK,EDUCATION,27630,UNKNOWN/NONE,above 7 YRS,CT,NO,YES,YES,YES,NO,YES,NO,0,4,NO,NO
1,119,EXISTING CREDITS PAID BACK,ELECTRONICS,31314,above 1000 K USD,4 to 7 YRS,CT,YES,YES,YES,YES,NO,NO,YES,0,3,YES,YES
2,84,EXISTING CREDITS PAID BACK,FURNITURE,27630,above 1000 K USD,4 to 7 YRS,PA,NO,NO,YES,YES,YES,NO,YES,0,3,YES,YES
3,119,DELAY IN PAST,FURNITURE,33156,above 1000 K USD,up to 1 YR,PA,YES,NO,YES,NO,NO,NO,YES,0,3,NO,NO
4,105,DELAY IN PAST,FURNITURE,23946,above 1000 K USD,up to 1 YR,CT,NO,YES,YES,YES,YES,YES,NO,0,3,YES,YES


### Label Encoder for categorical Columns

In [9]:
# Names of the feature columns that are treated as categorical.
categoricalColumns = [
    'CREDIT_HISTORY', 'TRANSACTION_CATEGORY', 'ACCOUNT_TYPE', 'ACCOUNT_AGE',
    'STATE', 'IS_URBAN', 'IS_STATE_BORDER', 'HAS_CO_APPLICANT', 'HAS_GUARANTOR',
    'OWN_REAL_ESTATE', 'OTHER_INSTALMENT_PLAN',
    'OWN_RESIDENCE', 'RFM_SCORE', 'OWN_CAR', 'SHIP_INTERNATIONAL',
]
# Positional index of each categorical column inside cust_pd_X, as an ndarray.
cat_indexes = np.asarray(list(map(cust_pd_X.columns.get_loc, categoricalColumns)))

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column with its own LabelEncoder.
# Each encoder is fitted on the raw values in `cust_pd` and the result is
# written into a freshly built frame, so the cell is idempotent: running it
# twice cannot re-fit an encoder on already-encoded integers (which would
# replace the readable class names in `classes_` with numbers).
labelList = []          # one array of "<COLUMN>_<class>" names per column (spaces -> '_')
labelEncoderList = {}   # column name -> fitted LabelEncoder
labelEncoded_X = cust_pd.drop(['IS_DEFAULT'], axis=1).copy()
for col in categoricalColumns:
    labenc = LabelEncoder()
    labelEncoded_X[col] = labenc.fit_transform(cust_pd[col])
    labelEncoderList[col] = labenc
    newclas = [col + "_" + str(clas).replace(' ', '_') for clas in labenc.classes_]
    labelList.append(np.asarray(newclas))
# As before, `cust_pd_X` and `labelEncoded_X` refer to the same encoded frame.
cust_pd_X = labelEncoded_X
cust_pd_X.head()

Unnamed: 0,EMI_TENURE,CREDIT_HISTORY,TRANSACTION_CATEGORY,TRANSACTION_AMOUNT,ACCOUNT_TYPE,ACCOUNT_AGE,STATE,IS_URBAN,IS_STATE_BORDER,HAS_CO_APPLICANT,HAS_GUARANTOR,OWN_REAL_ESTATE,OTHER_INSTALMENT_PLAN,OWN_RESIDENCE,NUMBER_CREDITS,RFM_SCORE,OWN_CAR,SHIP_INTERNATIONAL
0,77,3,0,27630,2,3,0,0,1,1,1,0,1,0,0,3,0,0
1,119,3,1,31314,3,1,0,1,1,1,1,0,0,1,0,2,1,1
2,84,3,2,27630,3,1,3,0,0,1,1,1,0,1,0,2,1,1
3,119,2,2,33156,3,4,3,1,0,1,0,0,0,1,0,2,0,0
4,105,2,2,23946,3,4,0,0,1,1,1,1,1,0,0,2,1,1


In [11]:
# Store the fitted per-column label encoders for later reference.
import pickle
# The context manager guarantees the file is flushed and closed once the dump
# finishes (the bare open() previously left the handle dangling).
with open('labelEncoderList.model', 'wb') as _model_file:
    pickle.dump(labelEncoderList, _model_file)

In [12]:
# Flatten the per-column arrays of class names into one 1-D array of names.
collabelList = np.concatenate( labelList, axis=0 )

### One hot encoding for categorical Columns

In [59]:
# Print the classes learned by the encoder of the first categorical column,
# one per line, in the order stored in `classes_`.
key = categoricalColumns[0]
for clas in labelEncoderList[key].classes_:
    print(clas)

ALL CREDITS PAID BACK
CRITICAL ACCOUNT
DELAY IN PAST
EXISTING CREDITS PAID BACK
NONE TAKEN


In [61]:
# Sanity check: one-hot encode only column 1 (CREDIT_HISTORY) and compare the
# result with the integer codes of that column.
# NOTE(review): `categorical_features` / `n_values` are legacy OneHotEncoder
# arguments — confirm the pinned scikit-learn version still accepts them.
OH_enc_test = OneHotEncoder(categorical_features=[1], handle_unknown='ignore', n_values="auto")
transformed_data = OH_enc_test.fit_transform(cust_pd_X.values)
# `.ix` has been removed from pandas. Use positional `.iloc` for the feature
# column and label-based `.loc` (end-inclusive, i.e. columns 0..4) for the
# five one-hot columns — the same selections `.ix` resolved to here.
print(cust_pd_X.iloc[:, 1].head())
pd.DataFrame(transformed_data.toarray()).loc[:, :4].head()

0    3
1    3
2    3
3    2
4    2
Name: CREDIT_HISTORY, dtype: int64


Unnamed: 0,0,1,2,3,4
0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0


In [62]:
# Fit the production one-hot encoder on the raw ndarray, treating the columns
# at the positions in `cat_indexes` as categorical.
# NOTE(review): `categorical_features` / `n_values` are legacy OneHotEncoder
# arguments — confirm the pinned scikit-learn version still accepts them.
OH_enc = OneHotEncoder(categorical_features=cat_indexes, handle_unknown='ignore', n_values="auto")
OH_enc.fit(cust_pd_X.values)

OneHotEncoder(categorical_features=array([ 1,  2,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 15, 16, 17]),
       categories=None, dtype=<class 'numpy.float64'>,
       handle_unknown='ignore', n_values='auto', sparse=True)

In [63]:
# Column names for the encoded matrix: the one-hot class names first, followed
# by the three numeric columns (the order shown by the encoded frame's header
# in the next cell's output).
newcols = np.append(collabelList, ["EMI_TENURE", "TRANSACTION_AMOUNT", "NUMBER_CREDITS"])

In [64]:
# Apply the fitted one-hot encoder, densify the result and label its columns
# with `newcols`.
cust_pd_X_enc = OH_enc.transform(cust_pd_X)
cust_pd_X_df = pd.DataFrame(cust_pd_X_enc.toarray(), columns=newcols)
cust_pd_X_df.head()

Unnamed: 0,CREDIT_HISTORY_ALL_CREDITS_PAID_BACK,CREDIT_HISTORY_CRITICAL_ACCOUNT,CREDIT_HISTORY_DELAY_IN_PAST,CREDIT_HISTORY_EXISTING_CREDITS_PAID_BACK,CREDIT_HISTORY_NONE_TAKEN,TRANSACTION_CATEGORY_EDUCATION,TRANSACTION_CATEGORY_ELECTRONICS,TRANSACTION_CATEGORY_FURNITURE,TRANSACTION_CATEGORY_NEW_CAR,TRANSACTION_CATEGORY_RETRAINING,TRANSACTION_CATEGORY_USED_CAR,ACCOUNT_TYPE_100_to_500_K_USD,ACCOUNT_TYPE_500_to_1000_K_USD,ACCOUNT_TYPE_UNKNOWN/NONE,ACCOUNT_TYPE_above_1000_K_USD,ACCOUNT_TYPE_up_to_100_K_USD,ACCOUNT_AGE_1_to_4_YRS,ACCOUNT_AGE_4_to_7_YRS,ACCOUNT_AGE_TBD,ACCOUNT_AGE_above_7_YRS,ACCOUNT_AGE_up_to_1_YR,STATE_CT,STATE_NJ,STATE_NY,STATE_PA,IS_URBAN_NO,IS_URBAN_YES,IS_STATE_BORDER_NO,IS_STATE_BORDER_YES,HAS_CO_APPLICANT_NO,HAS_CO_APPLICANT_YES,HAS_GUARANTOR_NO,HAS_GUARANTOR_YES,OWN_REAL_ESTATE_NO,OWN_REAL_ESTATE_YES,OTHER_INSTALMENT_PLAN_NO,OTHER_INSTALMENT_PLAN_YES,OWN_RESIDENCE_NO,OWN_RESIDENCE_YES,RFM_SCORE_1,RFM_SCORE_2,RFM_SCORE_3,RFM_SCORE_4,OWN_CAR_NO,OWN_CAR_YES,SHIP_INTERNATIONAL_NO,SHIP_INTERNATIONAL_YES,EMI_TENURE,TRANSACTION_AMOUNT,NUMBER_CREDITS
0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,77.0,27630.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,119.0,31314.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,84.0,27630.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,119.0,33156.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,105.0,23946.0,0.0


In [65]:
# Store the fitted one-hot encoder for later reference.
import pickle
# The context manager guarantees the file is flushed and closed once the dump
# finishes (the bare open() previously left the handle dangling).
with open('OneHotEncoder.model', 'wb') as _model_file:
    pickle.dump(OH_enc, _model_file)

### Feature Normalization 

In [66]:
# Min-max scale every column, then L1-normalise each row.
# NOTE(review): the scaler is fitted on the whole dataset before the train/test
# split below — confirm this is acceptable for the evaluation.
min_max_scaler = MinMaxScaler()
scaled = min_max_scaler.fit(cust_pd_X_df).transform(cust_pd_X_df)
features = normalize(scaled, norm='l1', axis=1)

# From here on `cust_pd_X` names the scaled, one-hot encoded feature frame.
cust_pd_X = pd.DataFrame(data=features, columns=newcols)
cust_pd_X.head()

Unnamed: 0,CREDIT_HISTORY_ALL_CREDITS_PAID_BACK,CREDIT_HISTORY_CRITICAL_ACCOUNT,CREDIT_HISTORY_DELAY_IN_PAST,CREDIT_HISTORY_EXISTING_CREDITS_PAID_BACK,CREDIT_HISTORY_NONE_TAKEN,TRANSACTION_CATEGORY_EDUCATION,TRANSACTION_CATEGORY_ELECTRONICS,TRANSACTION_CATEGORY_FURNITURE,TRANSACTION_CATEGORY_NEW_CAR,TRANSACTION_CATEGORY_RETRAINING,TRANSACTION_CATEGORY_USED_CAR,ACCOUNT_TYPE_100_to_500_K_USD,ACCOUNT_TYPE_500_to_1000_K_USD,ACCOUNT_TYPE_UNKNOWN/NONE,ACCOUNT_TYPE_above_1000_K_USD,ACCOUNT_TYPE_up_to_100_K_USD,ACCOUNT_AGE_1_to_4_YRS,ACCOUNT_AGE_4_to_7_YRS,ACCOUNT_AGE_TBD,ACCOUNT_AGE_above_7_YRS,ACCOUNT_AGE_up_to_1_YR,STATE_CT,STATE_NJ,STATE_NY,STATE_PA,IS_URBAN_NO,IS_URBAN_YES,IS_STATE_BORDER_NO,IS_STATE_BORDER_YES,HAS_CO_APPLICANT_NO,HAS_CO_APPLICANT_YES,HAS_GUARANTOR_NO,HAS_GUARANTOR_YES,OWN_REAL_ESTATE_NO,OWN_REAL_ESTATE_YES,OTHER_INSTALMENT_PLAN_NO,OTHER_INSTALMENT_PLAN_YES,OWN_RESIDENCE_NO,OWN_RESIDENCE_YES,RFM_SCORE_1,RFM_SCORE_2,RFM_SCORE_3,RFM_SCORE_4,OWN_CAR_NO,OWN_CAR_YES,SHIP_INTERNATIONAL_NO,SHIP_INTERNATIONAL_YES,EMI_TENURE,TRANSACTION_AMOUNT,NUMBER_CREDITS
0,0.0,0.0,0.0,0.062833,0.0,0.062833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062833,0.0,0.0,0.0,0.0,0.0,0.062833,0.0,0.062833,0.0,0.0,0.0,0.062833,0.0,0.0,0.062833,0.0,0.062833,0.0,0.062833,0.062833,0.0,0.0,0.062833,0.062833,0.0,0.0,0.0,0.0,0.062833,0.062833,0.0,0.062833,0.0,0.024435,0.03307,0.0
1,0.0,0.0,0.0,0.061148,0.0,0.0,0.061148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061148,0.0,0.0,0.061148,0.0,0.0,0.0,0.061148,0.0,0.0,0.0,0.0,0.061148,0.0,0.061148,0.0,0.061148,0.0,0.061148,0.061148,0.0,0.061148,0.0,0.0,0.061148,0.0,0.0,0.061148,0.0,0.0,0.061148,0.0,0.061148,0.044162,0.03862,0.0
2,0.0,0.0,0.0,0.062614,0.0,0.0,0.0,0.062614,0.0,0.0,0.0,0.0,0.0,0.0,0.062614,0.0,0.0,0.062614,0.0,0.0,0.0,0.0,0.0,0.0,0.062614,0.062614,0.0,0.062614,0.0,0.0,0.062614,0.0,0.062614,0.0,0.062614,0.062614,0.0,0.0,0.062614,0.0,0.0,0.062614,0.0,0.0,0.062614,0.0,0.062614,0.027829,0.032955,0.0
3,0.0,0.0,0.060952,0.0,0.0,0.0,0.0,0.060952,0.0,0.0,0.0,0.0,0.0,0.0,0.060952,0.0,0.0,0.0,0.0,0.0,0.060952,0.0,0.0,0.0,0.060952,0.0,0.060952,0.060952,0.0,0.0,0.060952,0.060952,0.0,0.060952,0.0,0.060952,0.0,0.0,0.060952,0.0,0.0,0.060952,0.0,0.060952,0.0,0.060952,0.0,0.044021,0.041704,0.0
4,0.0,0.0,0.062375,0.0,0.0,0.0,0.0,0.062375,0.0,0.0,0.0,0.0,0.0,0.0,0.062375,0.0,0.0,0.0,0.0,0.0,0.062375,0.062375,0.0,0.0,0.0,0.062375,0.0,0.0,0.062375,0.0,0.062375,0.0,0.062375,0.0,0.062375,0.0,0.062375,0.062375,0.0,0.0,0.0,0.062375,0.0,0.0,0.062375,0.0,0.062375,0.038118,0.026263,0.0


In [67]:
# Store the fitted min-max scaler for later reference.
import pickle
# The context manager guarantees the file is flushed and closed once the dump
# finishes (the bare open() previously left the handle dangling).
with open('min_max_scaler.model', 'wb') as _model_file:
    pickle.dump(min_max_scaler, _model_file)

## Split Train and Test Dataset

In [68]:
# Hold out 30% of the rows for testing, stratified on the label.
features = cust_pd_X.values
label = cust_pd_Y.values.reshape((-1, 1))   # kept as an (n, 1) column vector

X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.3, random_state=42, stratify=label)

print(f'X_train.shape={X_train.shape} Y_train.shape={y_train.shape}')
print(f'X_test.shape={X_test.shape} Y_test.shape={y_test.shape}')

X_train.shape=(7000, 50) Y_train.shape=(7000, 1)
X_test.shape=(3000, 50) Y_test.shape=(3000, 1)


# Sklearn Training

In [69]:
# Import
# Logistic-regression classifier with default hyper-parameters; verbose=1 is
# what produces the solver tag ("[LibLinear]") in the training output below.
from sklearn.linear_model import LogisticRegression
sklearn_lr = LogisticRegression(verbose=1)

In [70]:
# TRAIN
# `y_train` is an (n, 1) column vector, but scikit-learn estimators expect a
# 1-D target. Flatten it explicitly instead of relying on the implicit
# conversion (which only works with a DataConversionWarning).
t0 = time.time()
sklearn_lr.fit(X_train, y_train.ravel())
print("[sklearn] Training time (s):  {0:.5f}".format(time.time()-t0))


[LibLinear][sklearn] Training time (s):  0.01801


In [71]:
# Evaluate the hard class predictions on the held-out set (accuracy only).
sklearn_prediction = sklearn_lr.predict(X_test)
test_accuracy = accuracy_score(y_test, sklearn_prediction)
print('sklearn ml accuracy score = {}'.format(test_accuracy))

sklearn ml accuracy score = 0.9473333333333334


In [72]:
# Store the trained logistic-regression model for later reference.
import pickle
# The context manager guarantees the file is flushed and closed once the dump
# finishes (the bare open() previously left the handle dangling).
with open('sklearn_lr.model', 'wb') as _model_file:
    pickle.dump(sklearn_lr, _model_file)

### Prediction

In [85]:
import operator
def _top_contributions(X, model, colNames, label_encoder=None, top_n=3):
    """Predict the first row of X and rank the features driving that prediction.

    The per-feature contribution is the element-wise product of the first row
    of ``X`` with ``model.coef_`` (expected to be 2-D, one row of coefficients).
    For predicted class 0 the most negative contributions are ranked first;
    for any other class the most positive contributions are ranked first.

    Returns a tuple ``(class_name, confidence_percent, feature_names)`` where
    ``feature_names`` holds at most ``top_n`` entries of ``colNames``.
    """
    # Fall back to the notebook-level target encoder when none is supplied.
    encoder = le if label_encoder is None else label_encoder

    rows = X.values
    mult = rows[0] * model.coef_
    predicted = int(model.predict(rows)[0])
    # predict_proba is indexed by the predicted label, i.e. labels are assumed
    # to be 0..k-1 (true for LabelEncoder output).
    confidence = model.predict_proba(rows)[0][predicted] * 100

    if predicted == 0:
        mask, strongest_first = mult < 0, False   # most negative first
    else:
        mask, strongest_first = mult > 0, True    # most positive first

    # np.where(...)[1] gives the column index of every selected contribution;
    # sorted() is stable, so ties keep their column order.
    ranked = sorted(zip(np.where(mask)[1], np.extract(mask, mult)),
                    key=operator.itemgetter(1), reverse=strongest_first)
    names = [colNames[col_idx] for col_idx, _ in ranked[:top_n]]
    return encoder.classes_[predicted], confidence, names


def predict_listTopFeatures(X, model, colNames, label_encoder=None, top_n=3):
    """Print the prediction for the first row of X and its key features.

    Parameters
    ----------
    X : DataFrame whose first row is the (already pre-processed) record.
    model : fitted classifier exposing ``coef_``, ``predict`` and ``predict_proba``.
    colNames : sequence of feature names aligned with the columns of ``X``.
    label_encoder : optional object with ``classes_``; defaults to the global ``le``.
    top_n : maximum number of features to list (default 3).

    The features are printed as a numbered list for every predicted class.
    """
    class_name, confidence, names = _top_contributions(
        X, model, colNames, label_encoder=label_encoder, top_n=top_n)

    print("Possibility for defaulting : {} - confidence {:.1f}%".format(
        class_name, confidence))
    print()
    print("Key Features which contributes to this outcome: \n")
    for rank, name in enumerate(names, start=1):
        print("{} - {}".format(rank, name))


def predict_returnTopFeatures(X, model, colNames, label_encoder=None, top_n=3):
    """Return the prediction for the first row of X and its key features.

    Takes the same arguments as ``predict_listTopFeatures`` and returns
    ``(class_name, confidence_percent, feature_names)`` instead of printing.
    """
    return _top_contributions(
        X, model, colNames, label_encoder=label_encoder, top_n=top_n)
            
def process_testDF(test_data, catColumns, labelEncs,
                   onehotEnc, mm_scaler, colNames):
    """Run raw record(s) through the fitted pre-processing pipeline.

    Parameters
    ----------
    test_data : DataFrame of raw feature rows (categorical columns as in training).
    catColumns : names of the categorical columns to label-encode.
    labelEncs : mapping of column name -> fitted encoder with ``transform``.
    onehotEnc : fitted one-hot encoder; its ``transform`` result must offer ``toarray()``.
    mm_scaler : fitted scaler with ``transform``.
    colNames : column names of the one-hot encoded matrix, used for the output frame.

    Returns
    -------
    DataFrame with columns ``colNames`` holding the scaled, L1-row-normalised features.

    The input frame is left untouched: encoding happens on a copy (previously
    the caller's frame was overwritten with integer codes), and the encoded
    columns are named from ``colNames`` rather than the global ``newcols``.
    """
    encoded = test_data.copy()
    for col in catColumns:
        encoded[col] = labelEncs[col].transform(encoded[col])

    onehot = onehotEnc.transform(encoded)
    onehot_df = pd.DataFrame(onehot.toarray(), columns=colNames)

    features = mm_scaler.transform(onehot_df)
    features = normalize(features, axis=1, norm='l1')
    return pd.DataFrame(features, columns=colNames)



### Predict & list the top features that influence the outcome

In [86]:
# Pick one record, keep its true label aside, and run the remaining columns
# through pre-processing and the prediction report.
RecNum = 3
selected_record = cust_pd.iloc[[RecNum]]
test_data = selected_record.drop(['IS_DEFAULT'], axis=1)
Actual = selected_record['IS_DEFAULT']

test_data_X1_ndf = process_testDF(
    test_data, categoricalColumns, labelEncoderList,
    OH_enc, min_max_scaler, newcols)

predict_listTopFeatures(test_data_X1_ndf, sklearn_lr, newcols)
print("\nActual : {}".format(Actual.values))

Possibility for defaulting : Yes - confidence 87.6%

Key Features which contributes to this outcome: 

1 - STATE_PA
2 - CREDIT_HISTORY_DELAY_IN_PAST
3 - HAS_GUARANTOR_NO

Actual : ['Yes']


In [87]:
# Class lists used as dropdown options in the interactive form.
trans_cat = labelEncoderList['TRANSACTION_CATEGORY'].classes_
labEnc = labelEncoderList['STATE']
state_cat = labEnc.classes_

In [88]:
%%html
<style>

.mytext {
    font-family: "Times New Roman", Times, serif;
    font-size: 16px;
    font-weight: bold;
    color: #0066cc;
}
.headertext {
    font-family: "Times New Roman", Times, serif;
    font-size: 18px;
    font-weight: bold;
    color: #009900; 
    text-align: center;
}
.longtext {
    font-family: "Times New Roman", Times, serif;
    font-size: 8px;
    font-weight: bold;
    color: #009900; 
}
</style>

In [89]:
import random

# Explicit imports replace `from ipywidgets import *`, so it is clear where
# every widget name comes from; `widgets.<Name>` keeps working as before.
import ipywidgets as widgets
from ipywidgets import Button, HBox, VBox, Layout
from IPython.display import Javascript, display

# Shared style / layout dictionaries for the form.
table_style = {'description_width': 'initial'}
table2_style = {'width': '100%'}
table_layout = {'width':'220px', 'min_width':'220px', 'height':'28px', 'min_height':'28px'}
table2_layout = {'width':'200px', 'min_width':'200px', 'height':'28px', 'min_height':'28px'}
table3_layout = {'width':'100%', 'height':'28px', 'min_height':'28px'}
row_layout = {'width':'200px', 'min_width':'200px'}

# Record used to pre-populate the form.
RecNum = 3
test_data = cust_pd.iloc[[RecNum]].drop(['IS_DEFAULT'],axis=1)
# The single selected row as a Series, so widgets receive true scalars rather
# than 1-element arrays (array -> scalar coercion is deprecated in NumPy).
_record = test_data.iloc[0]


def _row_caption(text):
    """Return the disabled button that serves as the caption of one form row."""
    return Button(description=text, disabled=True, button_style='', tooltip='', icon='',
                  layout=row_layout, style=table_style)


def _flag_checkbox(checked):
    """Return an unlabeled checkbox initialised to the truth value of `checked`."""
    return widgets.Checkbox(value=bool(checked), description='',
                            layout=table_layout, style=table_style)


row_1_0_widget = _row_caption('Delay in Past')
DELAY_IN_PAST = _flag_checkbox(_record['CREDIT_HISTORY'] == "DELAY IN PAST")

row_2_0_widget = _row_caption('Guarantor provided')
HAS_GUARANTOR = _flag_checkbox(_record['HAS_GUARANTOR'] == "YES")

# The checkbox is ticked when OWN_RESIDENCE == "YES", so the caption must say
# "owns" (it previously read 'Stays in rented house', the opposite meaning).
row_3_0_widget = _row_caption('Owns residence')
OWN_RESIDENCE = _flag_checkbox(_record['OWN_RESIDENCE'] == "YES")

row_4_0_widget = _row_caption('Transaction Amount')
TRANSACTION_AMOUNT = widgets.FloatSlider(min=1e3, max=5e4, step=1e3,
                                         description='',
                                         value=float(_record['TRANSACTION_AMOUNT']),
                                         continuous_update=False, layout=table_layout, style=table_style)

row_5_0_widget = _row_caption('US State')
STATE = widgets.Dropdown(options=list(state_cat),
                         value=str(_record['STATE']),
                         description='', layout=table_layout, style=table_style)

row_6_0_widget = _row_caption('Transaction Category')
TRANSACTION_CATEGORY = widgets.Dropdown(options=list(trans_cat),
                                        value=str(_record['TRANSACTION_CATEGORY']),
                                        description='', layout=table_layout, style=table_style)

row_7_0_widget = _row_caption('EMI Tenure in months')
EMI_TENURE = widgets.IntSlider(min=12, max=240, step=5,
                               description='',
                               value=int(_record['EMI_TENURE']),
                               continuous_update=False, layout=table_layout, style=table_style)

# Output side: captions (styled via the CSS classes defined in the %%html cell)
# and the value widgets that the prediction callback fills in.
r1 = widgets.Label("Possibility for defaulting:",layout=table2_layout,style=table_style)
r1.add_class("mytext")
v1 = widgets.Label()
r2 = widgets.Label("Confidence:",layout=table2_layout,style=table_style)
r2.add_class("mytext")
v2 = widgets.Label()
r3 = widgets.Label("Key Features:",layout=table2_layout,style=table_style)
r3.add_class("mytext")
v3 = widgets.Textarea(value="\n\n\n\n\n\n", disabled=True)
v3.add_class("longtext")

hbox1 = HBox([row_1_0_widget, DELAY_IN_PAST])
hbox2 = HBox([row_2_0_widget, HAS_GUARANTOR])
hbox3 = HBox([row_3_0_widget, OWN_RESIDENCE])
hbox4 = HBox([row_4_0_widget, TRANSACTION_AMOUNT])
hbox5 = HBox([row_5_0_widget, STATE ])
hbox6 = HBox([row_6_0_widget, TRANSACTION_CATEGORY])
hbox7 = HBox([row_7_0_widget, EMI_TENURE])

button = widgets.Button(button_style='success', description="Pick Random Data!",
                        layout=table3_layout,style=table2_style)
hbox8 = HBox([button])

# Left panel: the input form.
vbox1 = widgets.VBox([hbox1, hbox2, hbox3,
                      hbox4, hbox5, hbox6, hbox7, hbox8],
                     layout=Layout(padding='10px 10px 10px 10px', border='solid #b3d9ff'))
c3_r0 = widgets.Label("Predictions")
c3_r0.add_class("headertext")
c3_r1 = widgets.HBox([r1,v1])
c3_r2 = widgets.HBox([r2,v2])
c3_r3 = widgets.HBox([r3])
c3_r4 = widgets.HBox([v3])
# Shows the true label of the randomly picked record; the random-pick callback
# writes to it. NOTE(review): this rebinds `label`, which earlier held the
# training label array — re-run the split cell before reusing that array.
label = widgets.Label()

# Right panel: prediction outputs. `label` is now part of the panel; it used to
# be updated by the callback but was never displayed anywhere.
vbox2 = widgets.VBox([c3_r0, c3_r1, c3_r2, c3_r3, c3_r4, label],
                     layout=Layout(padding='10px 10px 10px 10px', border='solid #b3d9ff'))
ui = widgets.HBox([vbox1, vbox2])

def fpredict(DELAY_IN_PAST, HAS_GUARANTOR, OWN_RESIDENCE, STATE, TRANSACTION_AMOUNT, TRANSACTION_CATEGORY, EMI_TENURE):
    """Re-score the current record with the values chosen in the form.

    Starts from row `RecNum` of `cust_pd`, overrides the seven form-controlled
    columns, pre-processes the row and writes the predicted class, confidence
    and numbered key features into the `v1`, `v2` and `v3` widgets.
    """
    def yes_no(flag):
        return "YES" if flag else "NO"

    test_data = cust_pd.iloc[[RecNum]].drop(['IS_DEFAULT'],axis=1)
    overrides = {
        'TRANSACTION_AMOUNT': TRANSACTION_AMOUNT,
        'EMI_TENURE': EMI_TENURE,
        'TRANSACTION_CATEGORY': TRANSACTION_CATEGORY,
        'STATE': STATE,
        # Unticked means "EXISTING CREDITS PAID BACK", whatever the record held.
        'CREDIT_HISTORY': "DELAY IN PAST" if DELAY_IN_PAST else "EXISTING CREDITS PAID BACK",
        'HAS_GUARANTOR': yes_no(HAS_GUARANTOR),
        'OWN_RESIDENCE': yes_no(OWN_RESIDENCE),
    }
    for column, value in overrides.items():
        test_data[column] = value

    test_data_X1_ndf = process_testDF(
        test_data, categoricalColumns, labelEncoderList,
        OH_enc, min_max_scaler, newcols)

    val1, val2, val3 = predict_returnTopFeatures(test_data_X1_ndf, sklearn_lr, newcols)
    v1.value = val1
    v2.value = "{:.1f}%".format(val2)
    numbered = ["{} - {}".format(rank, name) for rank, name in enumerate(val3, start=1)]
    v3.value = "\n".join(numbered)
    
# Bind every input widget to the matching keyword argument of fpredict; the
# prediction is recomputed whenever one of them changes.
_form_controls = {
    'DELAY_IN_PAST': DELAY_IN_PAST,
    'TRANSACTION_AMOUNT': TRANSACTION_AMOUNT,
    'TRANSACTION_CATEGORY': TRANSACTION_CATEGORY,
    'EMI_TENURE': EMI_TENURE,
    'HAS_GUARANTOR': HAS_GUARANTOR,
    'OWN_RESIDENCE': OWN_RESIDENCE,
    'STATE': STATE,
}
out = widgets.interactive_output(fpredict, _form_controls)

display(ui, out)


def randomizeData(b):
    """Button callback: load a random record from `cust_pd` into the form.

    Updates the module-level `RecNum` / `test_data`, writes the record's true
    label into the `label` widget and sets every input widget from the record
    (each assignment triggers a re-prediction through `interactive_output`).

    `b` is the clicked button supplied by ipywidgets; it is not used.
    """
    global RecNum
    global test_data
    # Sample over the whole dataset. The previous hard-coded randint(1, 500)
    # never picked row 0 and raised IndexError on frames with fewer than 501 rows.
    RecNum = random.randrange(len(cust_pd))
    record = cust_pd.iloc[[RecNum]]
    test_data = record.drop(['IS_DEFAULT'],axis=1)
    # Work with scalars: assigning Series / 1-element arrays to widget values
    # relied on implicit array -> scalar coercion.
    row = test_data.iloc[0]

    label.value = "Actual : {} ".format(record['IS_DEFAULT'].values)
    TRANSACTION_AMOUNT.value = float(row['TRANSACTION_AMOUNT'])
    DELAY_IN_PAST.value = bool(row['CREDIT_HISTORY'] == "DELAY IN PAST")
    TRANSACTION_CATEGORY.value = str(row['TRANSACTION_CATEGORY'])
    STATE.value = str(row['STATE'])
    EMI_TENURE.value = int(row['EMI_TENURE'])
    HAS_GUARANTOR.value = bool(row['HAS_GUARANTOR'] == "YES")
    OWN_RESIDENCE.value = bool(row['OWN_RESIDENCE'] == "YES")

button.on_click(randomizeData)

HBox(children=(VBox(children=(HBox(children=(Button(description='Delay in Past', disabled=True, layout=Layout(…

Output()

## Further Analysis

### Check how good our model is

In [77]:
# Remember the column order of the label-encoded frame; the cells below use it
# to name the columns and features of the tree-based models.
features_order = labelEncoded_X.columns.tolist()
labelEncoded_X.head()

Unnamed: 0,EMI_TENURE,CREDIT_HISTORY,TRANSACTION_CATEGORY,TRANSACTION_AMOUNT,ACCOUNT_TYPE,ACCOUNT_AGE,STATE,IS_URBAN,IS_STATE_BORDER,HAS_CO_APPLICANT,HAS_GUARANTOR,OWN_REAL_ESTATE,OTHER_INSTALMENT_PLAN,OWN_RESIDENCE,NUMBER_CREDITS,RFM_SCORE,OWN_CAR,SHIP_INTERNATIONAL
0,77,3,0,27630,2,3,0,0,1,1,1,0,1,0,0,3,0,0
1,119,3,1,31314,3,1,0,1,1,1,1,0,0,1,0,2,1,1
2,84,3,2,27630,3,1,3,0,0,1,1,1,0,1,0,2,1,1
3,119,2,2,33156,3,4,3,1,0,1,0,0,0,1,0,2,0,0
4,105,2,2,23946,3,4,0,0,1,1,1,1,1,0,0,2,1,1


In [90]:
# Prepare the label-encoded (not one-hot encoded) features for the tree-based
# models: min-max scale, L1-normalise each row, then split 70/30.
min_max_scaler_ = MinMaxScaler()
_scaled_ = min_max_scaler_.fit_transform(labelEncoded_X)
cust_pd_X_ = pd.DataFrame(normalize(_scaled_, axis=1, norm='l1'),
                          columns=features_order)

features_ = cust_pd_X_.values
label_ = cust_pd_Y.values.reshape((-1,))   # 1-D target

X_train_, X_test_, y_train_, y_test_ = train_test_split(
    features_, label_, test_size=0.3, random_state=42, stratify=label_)

In [99]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the fold scores are reproducible on Restart & Run All
# (an unseeded forest gives different numbers on every run).
random_forest = RandomForestClassifier(random_state=42)

# Stratified 5-fold cross-validation on the training split; the model is
# re-fitted from scratch on each fold and scored on the held-out part.
skf = StratifiedKFold(n_splits = 5)

for counter, (train_fold, test_fold) in enumerate(skf.split(X_train_, y_train_), start=1):
    random_forest.fit(X_train_[train_fold], y_train_[train_fold])
    print( str(counter) + ": ", random_forest.score(X_train_[test_fold], y_train_[test_fold]))

1:  0.9557458957887224
2:  0.9564596716630978
3:  0.9478571428571428
4:  0.9513938527519656
5:  0.949964260185847


### List the top features that influence the model

In [100]:
from sklearn.ensemble import ExtraTreesClassifier

# Fit an extra-trees ensemble and read the impurity-based feature importances.
forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
forest.fit(X_train_, y_train_)

importances = forest.feature_importances_
# Spread of each feature's importance across the individual trees.
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
# Feature indexes ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Print the five highest-ranked features.
print("Feature ranking:")
for rank, feature_idx in enumerate(indices[:5], start=1):
    print("%d. feature %s (%f)" % (rank, features_order[feature_idx], importances[feature_idx]))

Feature ranking:
1. feature STATE (0.192374)
2. feature HAS_GUARANTOR (0.149907)
3. feature IS_URBAN (0.129472)
4. feature OWN_RESIDENCE (0.104413)
5. feature IS_STATE_BORDER (0.083997)
