<h1><center>Credit Risk Analysis - v3</center></h1>
 

### Imports

In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 999
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, normalize
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_curve, roc_auc_score
from scipy.stats import chi2_contingency,ttest_ind
from sklearn.utils import shuffle
import time
from utils import score_prediction

import warnings
warnings.filterwarnings('ignore')


In [2]:
%%javascript
// Replace the output area's scroll check with a function that always returns false.
// NOTE(review): presumably this stops long outputs from being collapsed into a
// scrollable box; it relies on a private hook of the classic notebook UI - confirm
// it still exists in the frontend being used.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}


<IPython.core.display.Javascript object>

## Load Dataset


Let's take a quick look at the dataset.


In [3]:
# Read the customer records from disk.
cust_pd_full = pd.read_csv('./data/NEW_CUSTOMERS.csv')
# Second name bound to the very same DataFrame object (no copy is made).
cust_pd = cust_pd_full

# Report the size of the loaded table: rows are observations, columns are variables.
n_observations, n_variables = cust_pd_full.shape
print(f"There are {n_observations} observations in the customer history dataset.")
print(f"There are {n_variables} variables in the dataset.")


There are 86 observations in the customer history dataset.
There are 18 variables in the dataset.


# Data Preparation

In [4]:
# Preview the first rows of the loaded customer frame.
cust_pd.head()

Unnamed: 0,EMI_TENURE,CREDIT_HISTORY,TRANSACTION_CATEGORY,TRANSACTION_AMOUNT,ACCOUNT_TYPE,ACCOUNT_AGE,STATE,IS_URBAN,IS_STATE_BORDER,HAS_CO_APPLICANT,HAS_GUARANTOR,OWN_REAL_ESTATE,OTHER_INSTALMENT_PLAN,OWN_RESIDENCE,NUMBER_CREDITS,RFM_SCORE,OWN_CAR,SHIP_INTERNATIONAL
0,12,CRITICAL ACCOUNT,EDUCATION,2096,up to 100 K USD,4 to 7 YRS,NJ,NO,YES,NO,NO,YES,NO,YES,1,2,NO,NO
1,30,CRITICAL ACCOUNT,NEW CAR,5234,up to 100 K USD,TBD,NY,NO,NO,NO,NO,NO,NO,YES,2,4,NO,NO
2,10,CRITICAL ACCOUNT,NEW CAR,2241,up to 100 K USD,up to 1 YR,PA,NO,YES,NO,NO,YES,NO,NO,2,2,NO,YES
3,6,ALL CREDITS PAID BACK,NEW CAR,783,UNKNOWN/NONE,1 to 4 YRS,PA,NO,YES,NO,YES,YES,YES,YES,1,2,NO,NO
4,12,EXISTING CREDITS PAID BACK,ELECTRONICS,6468,UNKNOWN/NONE,TBD,NJ,NO,YES,NO,NO,NO,NO,YES,1,4,YES,NO


## Transform Features

In [5]:
# Work on an independent copy of the loaded frame. A plain assignment would only
# create another name for the same object, so the column overwrites done during
# label encoding would also change cust_pd / cust_pd_full and make the raw values
# unrecoverable without re-reading the CSV.
cust_pd_X = cust_pd.copy()
print(f'features df shape = {cust_pd_X.shape}')
cust_pd_X.head()

features df shape = (86, 18)


Unnamed: 0,EMI_TENURE,CREDIT_HISTORY,TRANSACTION_CATEGORY,TRANSACTION_AMOUNT,ACCOUNT_TYPE,ACCOUNT_AGE,STATE,IS_URBAN,IS_STATE_BORDER,HAS_CO_APPLICANT,HAS_GUARANTOR,OWN_REAL_ESTATE,OTHER_INSTALMENT_PLAN,OWN_RESIDENCE,NUMBER_CREDITS,RFM_SCORE,OWN_CAR,SHIP_INTERNATIONAL
0,12,CRITICAL ACCOUNT,EDUCATION,2096,up to 100 K USD,4 to 7 YRS,NJ,NO,YES,NO,NO,YES,NO,YES,1,2,NO,NO
1,30,CRITICAL ACCOUNT,NEW CAR,5234,up to 100 K USD,TBD,NY,NO,NO,NO,NO,NO,NO,YES,2,4,NO,NO
2,10,CRITICAL ACCOUNT,NEW CAR,2241,up to 100 K USD,up to 1 YR,PA,NO,YES,NO,NO,YES,NO,NO,2,2,NO,YES
3,6,ALL CREDITS PAID BACK,NEW CAR,783,UNKNOWN/NONE,1 to 4 YRS,PA,NO,YES,NO,YES,YES,YES,YES,1,2,NO,NO
4,12,EXISTING CREDITS PAID BACK,ELECTRONICS,6468,UNKNOWN/NONE,TBD,NJ,NO,YES,NO,NO,NO,NO,YES,1,4,YES,NO


### Label encoding for categorical columns

In [6]:
# Names of the columns that are treated as categorical and passed through the
# label encoders.
categoricalColumns = [
    'CREDIT_HISTORY',
    'TRANSACTION_CATEGORY',
    'ACCOUNT_TYPE',
    'ACCOUNT_AGE',
    'STATE',
    'IS_URBAN',
    'IS_STATE_BORDER',
    'HAS_CO_APPLICANT',
    'HAS_GUARANTOR',
    'OWN_REAL_ESTATE',
    'OTHER_INSTALMENT_PLAN',
    'OWN_RESIDENCE',
    'RFM_SCORE',
    'OWN_CAR',
    'SHIP_INTERNATIONAL',
]

# Positional index of every categorical column inside cust_pd_X, as a NumPy array.
# get_loc raises KeyError if one of the names is missing from the frame.
cat_indexes = np.asarray(list(map(cust_pd_X.columns.get_loc, categoricalColumns)))

In [7]:
# Load the pickled label encoders; the label-encoding step looks them up by
# column name.
import pickle

# NOTE(security): unpickling can execute arbitrary code - only load model files
# that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open('labelEncoderList.model', 'rb') as model_file:
    labelEncoderList = pickle.load(model_file)

In [8]:
# Label-encode every categorical column with its stored encoder.
# The encoded frame is always derived from the raw `cust_pd` values and built with
# `assign`, which returns a new DataFrame. This keeps the cell idempotent: running
# it again does not feed already-encoded values back into the encoders, and the
# raw frame is left untouched.
# (cust_pd_X held the same data as cust_pd before this step, so the result is the
# same as encoding cust_pd_X column by column.)
encoded_columns = {
    col: labelEncoderList[col].transform(cust_pd[col])
    for col in categoricalColumns
}
cust_pd_X = cust_pd.assign(**encoded_columns)

In [9]:
# Preview the frame after label encoding (categorical columns now hold integer codes).
cust_pd_X.head()

Unnamed: 0,EMI_TENURE,CREDIT_HISTORY,TRANSACTION_CATEGORY,TRANSACTION_AMOUNT,ACCOUNT_TYPE,ACCOUNT_AGE,STATE,IS_URBAN,IS_STATE_BORDER,HAS_CO_APPLICANT,HAS_GUARANTOR,OWN_REAL_ESTATE,OTHER_INSTALMENT_PLAN,OWN_RESIDENCE,NUMBER_CREDITS,RFM_SCORE,OWN_CAR,SHIP_INTERNATIONAL
0,12,1,0,2096,4,1,1,0,1,0,0,1,0,1,1,1,0,0
1,30,1,3,5234,4,2,2,0,0,0,0,0,0,1,2,3,0,0
2,10,1,3,2241,4,4,3,0,1,0,0,1,0,0,2,1,0,1
3,6,0,3,783,2,0,3,0,1,0,1,1,1,1,1,1,0,0
4,12,3,1,6468,2,2,1,0,1,0,0,0,0,1,1,3,1,0


### One-hot encoding for categorical columns

In [10]:
# Load the pickled one-hot encoder used by the next transformation step.
import pickle

# NOTE(security): unpickling can execute arbitrary code - only load model files
# that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open('OneHotEncoder.model', 'rb') as model_file:
    OH_enc = pickle.load(model_file)

In [11]:
# Run the label-encoded frame through the loaded one-hot encoder.
cust_pd_X_enc = OH_enc.transform(cust_pd_X)

# The encoder output is converted to a dense array and wrapped in a DataFrame
# (columns get default integer labels).
cust_pd_X_df = pd.DataFrame(data=cust_pd_X_enc.toarray())
cust_pd_X_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,12.0,2096.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,30.0,5234.0,2.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,10.0,2241.0,2.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,6.0,783.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,12.0,6468.0,1.0


### Feature Normalization 

In [12]:
# Load the pickled scaler used for feature normalization.
import pickle

# NOTE(security): unpickling can execute arbitrary code - only load model files
# that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open('min_max_scaler.model', 'rb') as model_file:
    min_max_scaler = pickle.load(model_file)

In [13]:
# Apply the loaded scaler, then rescale every row (axis=1) to unit L1 norm.
# NOTE(review): the displayed result contains negative entries in columns 47-48;
# confirm the scaler was fitted on the same column layout and value range as this data.
scaled_features = min_max_scaler.transform(cust_pd_X_df)
features = normalize(scaled_features, norm='l1', axis=1)

# cust_pd_X is rebound here: from this point on it holds the normalized feature
# matrix, not the label-encoded frame.
cust_pd_X = pd.DataFrame(data=features)
cust_pd_X.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
0,0.0,0.061236,0.0,0.0,0.0,0.061236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061236,0.0,0.061236,0.0,0.0,0.0,0.0,0.061236,0.0,0.0,0.061236,0.0,0.0,0.061236,0.061236,0.0,0.061236,0.0,0.0,0.061236,0.061236,0.0,0.0,0.061236,0.0,0.061236,0.0,0.0,0.061236,0.0,0.061236,0.0,-0.007776,-0.012447,0.061236
1,0.0,0.058379,0.0,0.0,0.0,0.0,0.0,0.0,0.058379,0.0,0.0,0.0,0.0,0.0,0.0,0.058379,0.0,0.0,0.058379,0.0,0.0,0.0,0.0,0.058379,0.0,0.058379,0.0,0.058379,0.0,0.058379,0.0,0.058379,0.0,0.058379,0.0,0.058379,0.0,0.0,0.058379,0.0,0.0,0.0,0.058379,0.058379,0.0,0.058379,0.0,0.000927,-0.006632,0.116758
2,0.0,0.057664,0.0,0.0,0.0,0.0,0.0,0.0,0.057664,0.0,0.0,0.0,0.0,0.0,0.0,0.057664,0.0,0.0,0.0,0.0,0.057664,0.0,0.0,0.0,0.057664,0.057664,0.0,0.0,0.057664,0.057664,0.0,0.057664,0.0,0.0,0.057664,0.057664,0.0,0.057664,0.0,0.0,0.057664,0.0,0.0,0.057664,0.0,0.0,0.057664,-0.008238,-0.011482,0.115327
3,0.060918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060918,0.0,0.0,0.0,0.0,0.060918,0.0,0.0,0.060918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060918,0.060918,0.0,0.0,0.060918,0.060918,0.0,0.0,0.060918,0.0,0.060918,0.0,0.060918,0.0,0.060918,0.0,0.060918,0.0,0.0,0.060918,0.0,0.060918,0.0,-0.010637,-0.014668,0.060918
4,0.0,0.0,0.0,0.061708,0.0,0.0,0.061708,0.0,0.0,0.0,0.0,0.0,0.0,0.061708,0.0,0.0,0.0,0.0,0.061708,0.0,0.0,0.0,0.061708,0.0,0.0,0.061708,0.0,0.0,0.061708,0.061708,0.0,0.061708,0.0,0.061708,0.0,0.061708,0.0,0.0,0.061708,0.0,0.0,0.0,0.061708,0.0,0.061708,0.061708,0.0,-0.007836,-0.004835,0.061708


# Sklearn Prediction

In [14]:
# Load the pickled scikit-learn model used for prediction.
from sklearn.linear_model import LogisticRegression
import pickle

# NOTE(security): unpickling can execute arbitrary code - only load model files
# that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open('sklearn_lr.model', 'rb') as model_file:
    sklearn_lr = pickle.load(model_file)

In [15]:
# Run the loaded model on the normalized feature frame.
# presumably yields one predicted label per input row - confirm against the model type
sklearn_prediction = sklearn_lr.predict(cust_pd_X)

In [16]:
# Verify the results with the project helper imported from utils.
# NOTE(review): the pass/fail criteria live inside score_prediction and are not
# visible in this notebook - see utils for what is actually checked.
score_prediction(sklearn_prediction)

Pass !!
