## Classification Model

In [1]:
#import things
import os
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
## load data

data = pd.read_csv("application_record-Copy1.csv")
data.shape
data.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [3]:
## check null values 

data.isnull().sum()

ID                          0
CODE_GENDER                 0
FLAG_OWN_CAR                0
FLAG_OWN_REALTY             0
CNT_CHILDREN                0
AMT_INCOME_TOTAL            0
NAME_INCOME_TYPE            0
NAME_EDUCATION_TYPE         0
NAME_FAMILY_STATUS          0
NAME_HOUSING_TYPE           0
DAYS_BIRTH                  0
DAYS_EMPLOYED               0
FLAG_MOBIL                  0
FLAG_WORK_PHONE             0
FLAG_PHONE                  0
FLAG_EMAIL                  0
OCCUPATION_TYPE        134203
CNT_FAM_MEMBERS             0
dtype: int64

In [4]:
data[data.isnull().any(axis=1)]

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
7,5008812,F,N,Y,0,283500.0,Pensioner,Higher education,Separated,House / apartment,-22464,365243,1,0,0,0,,1.0
8,5008813,F,N,Y,0,283500.0,Pensioner,Higher education,Separated,House / apartment,-22464,365243,1,0,0,0,,1.0
9,5008814,F,N,Y,0,283500.0,Pensioner,Higher education,Separated,House / apartment,-22464,365243,1,0,0,0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438549,6840098,F,N,Y,0,135000.0,Pensioner,Secondary / secondary special,Separated,House / apartment,-22717,365243,1,0,0,0,,1.0
438550,6840100,F,N,Y,0,135000.0,Pensioner,Secondary / secondary special,Separated,House / apartment,-22717,365243,1,0,0,0,,1.0
438551,6840102,F,N,Y,0,135000.0,Pensioner,Secondary / secondary special,Separated,House / apartment,-22717,365243,1,0,0,0,,1.0
438552,6840104,M,N,Y,0,135000.0,Pensioner,Secondary / secondary special,Separated,House / apartment,-22717,365243,1,0,0,0,,1.0


In [5]:
data = data.drop(['OCCUPATION_TYPE'], axis = 1)


In [6]:
data.dtypes

ID                       int64
CODE_GENDER             object
FLAG_OWN_CAR            object
FLAG_OWN_REALTY         object
CNT_CHILDREN             int64
AMT_INCOME_TOTAL       float64
NAME_INCOME_TYPE        object
NAME_EDUCATION_TYPE     object
NAME_FAMILY_STATUS      object
NAME_HOUSING_TYPE       object
DAYS_BIRTH               int64
DAYS_EMPLOYED            int64
FLAG_MOBIL               int64
FLAG_WORK_PHONE          int64
FLAG_PHONE               int64
FLAG_EMAIL               int64
CNT_FAM_MEMBERS        float64
dtype: object

In [7]:
# create dictionaries to map fields to numeric values
gender_values = {'F' : 0, 'M' : 1} 
own_car = {'N' : 0, 'Y' : 1}
own_realty = {'N' : 0, 'Y' : 1}
education_values = {'Lower secondary' : 0, 'Secondary / secondary special' : 1, 'Incomplete higher': 2, 'Higher education': 3, 'Academic degree': 4}
fam_stat = {'Single / not married': 0, 'Separated': 1, 'Widow':2, "Civil marriage": 3, 'Married': 4 }
work = {'Student': 0, 'Pensioner': 1, 'State servant': 2, 'Commercial associate': 3, 'State servant': 4, 'Working': 5}
house = {'With parents': 0, 'Municipal apartment': 1, 'Co-op apartment': 2, 'Office apartment': 3, 'Rented apartment': 4, 'House / apartment': 5}

In [12]:
data.replace({'CODE_GENDER': gender_values, 'FLAG_OWN_CAR': own_car, 'FLAG_OWN_REALTY': own_realty, 'NAME_EDUCATION_TYPE': education_values, 'NAME_FAMILY_STATUS': fam_stat, 'NAME_INCOME_TYPE': work, 'NAME_HOUSING_TYPE': house}, inplace=True)

In [13]:
data.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS
0,5008804,1,1,1,0,427500.0,5,3,3,4,-12005,-4542,1,1,0,0,2.0
1,5008805,1,1,1,0,427500.0,5,3,3,4,-12005,-4542,1,1,0,0,2.0
2,5008806,1,1,1,0,112500.0,5,1,4,5,-21474,-1134,1,0,0,0,2.0
3,5008808,0,0,1,0,270000.0,3,1,0,5,-19110,-3051,1,0,1,1,1.0
4,5008809,0,0,1,0,270000.0,3,1,0,5,-19110,-3051,1,0,1,1,1.0


In [14]:
#find unique values and categorize them 
data['NAME_INCOME_TYPE'].unique()
# data['NAME_EDUCATION_TYPE'].unique()
# data['NAME_FAMILY_STATUS'].unique()
data['NAME_HOUSING_TYPE'].unique()

array([4, 5, 1, 0, 2, 3], dtype=int64)

In [15]:
# change string values to categorical data
data['NAME_INCOME_TYPE'] = data['NAME_INCOME_TYPE'].astype('category')
data['NAME_EDUCATION_TYPE'] = data['NAME_EDUCATION_TYPE'].astype('category')
data['NAME_FAMILY_STATUS'] = data['NAME_FAMILY_STATUS'].astype('category')
data['NAME_HOUSING_TYPE'] = data['NAME_HOUSING_TYPE'].astype('category')

# change income total and count of family members to integers
data['CNT_CHILDREN'] = data['CNT_CHILDREN'].astype('int')
data['AMT_INCOME_TOTAL'] = data['AMT_INCOME_TOTAL'].astype('int')
data['CNT_FAM_MEMBERS'] = data['CNT_FAM_MEMBERS'].astype('int')


In [16]:
data.dtypes

ID                        int64
CODE_GENDER               int64
FLAG_OWN_CAR              int64
FLAG_OWN_REALTY           int64
CNT_CHILDREN              int32
AMT_INCOME_TOTAL          int32
NAME_INCOME_TYPE       category
NAME_EDUCATION_TYPE    category
NAME_FAMILY_STATUS     category
NAME_HOUSING_TYPE      category
DAYS_BIRTH                int64
DAYS_EMPLOYED             int64
FLAG_MOBIL                int64
FLAG_WORK_PHONE           int64
FLAG_PHONE                int64
FLAG_EMAIL                int64
CNT_FAM_MEMBERS           int32
dtype: object

In [17]:
data.isnull().sum()

ID                     0
CODE_GENDER            0
FLAG_OWN_CAR           0
FLAG_OWN_REALTY        0
CNT_CHILDREN           0
AMT_INCOME_TOTAL       0
NAME_INCOME_TYPE       0
NAME_EDUCATION_TYPE    0
NAME_FAMILY_STATUS     0
NAME_HOUSING_TYPE      0
DAYS_BIRTH             0
DAYS_EMPLOYED          0
FLAG_MOBIL             0
FLAG_WORK_PHONE        0
FLAG_PHONE             0
FLAG_EMAIL             0
CNT_FAM_MEMBERS        0
dtype: int64

In [18]:
## merge CSV that has been cleaned
cr = pd.read_csv('CRUpdate2.csv')
cr.head()

cr.keys()

del cr['Unnamed: 0']
cr.head()
cr1 = cr[['ID', 'APPROVAL']]
cr1.head()

Unnamed: 0,ID,APPROVAL
0,5001711,YES
1,5001712,NO
2,5001713,YES
3,5001714,YES
4,5001715,YES


In [19]:
new = data.merge(cr1, on='ID', how='inner')
new.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,APPROVAL
0,5008804,1,1,1,0,427500,5,3,3,4,-12005,-4542,1,1,0,0,2,YES
1,5008805,1,1,1,0,427500,5,3,3,4,-12005,-4542,1,1,0,0,2,YES
2,5008806,1,1,1,0,112500,5,1,4,5,-21474,-1134,1,0,0,0,2,NO
3,5008808,0,0,1,0,270000,3,1,0,5,-19110,-3051,1,0,1,1,1,YES
4,5008809,0,0,1,0,270000,3,1,0,5,-19110,-3051,1,0,1,1,1,YES


In [20]:
# loan_approved = {'NO' : 0, 'YES' : 1}
# new.replace({'APPROVAL': loan_approved}, inplace=True)

In [21]:
new['APPROVAL'] = new['APPROVAL'].astype('category')

new.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,APPROVAL
0,5008804,1,1,1,0,427500,5,3,3,4,-12005,-4542,1,1,0,0,2,YES
1,5008805,1,1,1,0,427500,5,3,3,4,-12005,-4542,1,1,0,0,2,YES
2,5008806,1,1,1,0,112500,5,1,4,5,-21474,-1134,1,0,0,0,2,NO
3,5008808,0,0,1,0,270000,3,1,0,5,-19110,-3051,1,0,1,1,1,YES
4,5008809,0,0,1,0,270000,3,1,0,5,-19110,-3051,1,0,1,1,1,YES


### TRAIN / TEST SPLIT

In [22]:
#store target variable 
y = new['APPROVAL']
X = new.drop(['APPROVAL'], axis = 1)


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=3)
X_train.shape

(27342, 17)

In [24]:
X_train.isnull().sum()

ID                     0
CODE_GENDER            0
FLAG_OWN_CAR           0
FLAG_OWN_REALTY        0
CNT_CHILDREN           0
AMT_INCOME_TOTAL       0
NAME_INCOME_TYPE       0
NAME_EDUCATION_TYPE    0
NAME_FAMILY_STATUS     0
NAME_HOUSING_TYPE      0
DAYS_BIRTH             0
DAYS_EMPLOYED          0
FLAG_MOBIL             0
FLAG_WORK_PHONE        0
FLAG_PHONE             0
FLAG_EMAIL             0
CNT_FAM_MEMBERS        0
dtype: int64

In [25]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

param_grid = {"n_estimators" : [10, 20, 50, 100],
             "max_depth" : [None, 6, 8, 10],
             "max_leaf_nodes": [None, 5, 10, 20], 
             "min_impurity_split": [0.1, 0.2, 0.3, 0.4]}

In [26]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(model, param_grid=param_grid, cv=3, verbose=2)

In [27]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 256 candidates, totalling 768 fits
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=10 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=10, total=   0.5s
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=10, total=   0.4s
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=10 
[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=10, total=   0.4s
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=20 
[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=20, total=   0.9s
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=20 
[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=20, total=   0.8s
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=20 
[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=20, total=   1.6s
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=50 
[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=50, total=   2.

[Parallel(n_jobs=1)]: Done 768 out of 768 | elapsed:  9.9min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [28]:
print(f"Best parameters: {grid.best_params_}")
print(f"Test set score: {grid.score(X_test, y_test)}")

Best parameters: {'max_depth': None, 'max_leaf_nodes': None, 'min_impurity_split': 0.3, 'n_estimators': 100}
Test set score: 0.7156335710367526


In [29]:
print(f"Train set score: {grid.score(X_train, y_train)}")

Train set score: 0.9128081340062907
