In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()

In [5]:
application_details = pd.read_csv('./data/application_record 2.csv')
credit_records = pd.read_csv('./data/credit_record.csv')

In [6]:
application_details.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,438557.0,6022176.0,571637.023257,5008804.0,5609375.0,6047745.0,6456971.0,7999952.0
CNT_CHILDREN,438557.0,0.4273903,0.724882,0.0,0.0,0.0,1.0,19.0
AMT_INCOME_TOTAL,438557.0,187524.3,110086.853066,26100.0,121500.0,160780.5,225000.0,6750000.0
DAYS_BIRTH,438557.0,-15997.9,4185.030007,-25201.0,-19483.0,-15630.0,-12514.0,-7489.0
DAYS_EMPLOYED,438557.0,60563.68,138767.799647,-17531.0,-3103.0,-1467.0,-371.0,365243.0
FLAG_MOBIL,438557.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
FLAG_WORK_PHONE,438557.0,0.2061328,0.404527,0.0,0.0,0.0,0.0,1.0
FLAG_PHONE,438557.0,0.287771,0.452724,0.0,0.0,0.0,1.0,1.0
FLAG_EMAIL,438557.0,0.1082071,0.310642,0.0,0.0,0.0,0.0,1.0
CNT_FAM_MEMBERS,438557.0,2.194465,0.897207,1.0,2.0,2.0,3.0,20.0


# Inspecting Data

In [7]:
credit_records.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,1048575.0,5068286.0,46150.578505,5001711.0,5023644.0,5062104.0,5113856.0,5150487.0
MONTHS_BALANCE,1048575.0,-19.137,14.023498,-60.0,-29.0,-17.0,-7.0,0.0


In [8]:
application_details.tail()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
438552,6840104,M,N,Y,0,135000.0,Pensioner,Secondary / secondary special,Separated,House / apartment,-22717,365243,1,0,0,0,,1.0
438553,6840222,F,N,N,0,103500.0,Working,Secondary / secondary special,Single / not married,House / apartment,-15939,-3007,1,0,0,0,Laborers,1.0
438554,6841878,F,N,N,0,54000.0,Commercial associate,Higher education,Single / not married,With parents,-8169,-372,1,1,0,0,Sales staff,1.0
438555,6842765,F,N,Y,0,72000.0,Pensioner,Secondary / secondary special,Married,House / apartment,-21673,365243,1,0,0,0,,2.0
438556,6842885,F,N,Y,0,121500.0,Working,Secondary / secondary special,Married,House / apartment,-18858,-1201,1,0,1,0,Sales staff,2.0


In [9]:
credit_records.tail()

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
1048570,5150487,-25,C
1048571,5150487,-26,C
1048572,5150487,-27,C
1048573,5150487,-28,C
1048574,5150487,-29,C


In [10]:
np.unique(credit_records['STATUS'])

array(['0', '1', '2', '3', '4', '5', 'C', 'X'], dtype=object)

- According to source data, credit_records has the following information:
Feature name	Explanation	Remarks
ID	Client number	
-- 0 is the current month, 
-- -1 is the previous month, and so on
STATUS	Status	
- 0: 1-29 days past due 
- 1: 30-59 days past due 
- 2: 60-89 days overdue 
- 3: 90-119 days overdue 
- 4: 120-149 days overdue 
- 5: Overdue or bad debts, write-offs for more than 150 days 
- C: paid off that month 
- X: No loan for the month

# Data Wrangling

 **Converting credit_record status to binary(1/0) data**


In [29]:
convert_to = {'C' : 'Good_Debt', 'X' : 'Good_Debt', '0' : 'Good_Debt', '1' : 'Bad_Debt', '2' : 'Bad_Debt', '3' : 'Bad_Debt', '4' : 'Bad_Debt', '5' : 'Bad_Debt'}
credit_records.replace({'STATUS' : convert_to}, inplace=True)

In [30]:
credit_records[credit_records['STATUS'] == 'Neutral_Debt']

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
150,5001718,-8,Neutral_Debt
154,5001718,-12,Neutral_Debt
224,5001720,0,Neutral_Debt
227,5001720,-3,Neutral_Debt
228,5001720,-4,Neutral_Debt
...,...,...,...
1048241,5150464,-9,Neutral_Debt
1048242,5150464,-10,Neutral_Debt
1048243,5150464,-11,Neutral_Debt
1048244,5150464,-12,Neutral_Debt


- Convert status into approval status

In [31]:
credit_records.loc[credit_records['STATUS'] == 'Good_Debt', 'APPROVAL_STATUS'] = 1
credit_records.loc[credit_records['STATUS'] == 'Bad_Debt', 'APPROVAL_STATUS'] = 0


In [32]:
credit_records

Unnamed: 0,ID,MONTHS_BALANCE,STATUS,APPROVAL_STATUS
0,5001711,0,Good_Debt,1.0
1,5001711,-1,Good_Debt,1.0
2,5001711,-2,Good_Debt,1.0
3,5001711,-3,Good_Debt,1.0
4,5001712,0,Good_Debt,1.0
...,...,...,...,...
1048570,5150487,-25,Good_Debt,1.0
1048571,5150487,-26,Good_Debt,1.0
1048572,5150487,-27,Good_Debt,1.0
1048573,5150487,-28,Good_Debt,1.0


In [33]:
application_details.isnull().sum()

ID                          0
CODE_GENDER                 0
FLAG_OWN_CAR                0
FLAG_OWN_REALTY             0
CNT_CHILDREN                0
AMT_INCOME_TOTAL            0
NAME_INCOME_TYPE            0
NAME_EDUCATION_TYPE         0
NAME_FAMILY_STATUS          0
NAME_HOUSING_TYPE           0
DAYS_BIRTH                  0
DAYS_EMPLOYED               0
FLAG_MOBIL                  0
FLAG_WORK_PHONE             0
FLAG_PHONE                  0
FLAG_EMAIL                  0
OCCUPATION_TYPE        134203
CNT_FAM_MEMBERS             0
dtype: int64

- about one third of the record has null value in OCCUPATION_TYPE column

In [34]:
application_details = application_details.dropna()

In [35]:
application_details.isnull().sum()

ID                     0
CODE_GENDER            0
FLAG_OWN_CAR           0
FLAG_OWN_REALTY        0
CNT_CHILDREN           0
AMT_INCOME_TOTAL       0
NAME_INCOME_TYPE       0
NAME_EDUCATION_TYPE    0
NAME_FAMILY_STATUS     0
NAME_HOUSING_TYPE      0
DAYS_BIRTH             0
DAYS_EMPLOYED          0
FLAG_MOBIL             0
FLAG_WORK_PHONE        0
FLAG_PHONE             0
FLAG_EMAIL             0
OCCUPATION_TYPE        0
CNT_FAM_MEMBERS        0
dtype: int64

In [36]:
application_details

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
5,5008810,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
6,5008811,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438541,6837707,M,N,Y,0,202500.0,Working,Higher education,Civil marriage,House / apartment,-13510,-2309,1,1,0,0,Laborers,2.0
438548,6839936,M,Y,Y,1,135000.0,Working,Secondary / secondary special,Married,House / apartment,-12569,-2095,1,0,0,0,Laborers,3.0
438553,6840222,F,N,N,0,103500.0,Working,Secondary / secondary special,Single / not married,House / apartment,-15939,-3007,1,0,0,0,Laborers,1.0
438554,6841878,F,N,N,0,54000.0,Commercial associate,Higher education,Single / not married,With parents,-8169,-372,1,1,0,0,Sales staff,1.0


**Convert days data to readable years format**

In [42]:
application_details['AGE'] = np.ceil(pd.to_timedelta(application_details['DAYS_BIRTH'], unit='D').dt.days / -365.25)

In [43]:
application_details.drop('DAYS_BIRTH', axis=1, inplace=True)

In [44]:
# Checking Data after converting DAYS_BIRTH to AGES
application_details.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 304354 entries, 2 to 438556
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   304354 non-null  int64  
 1   CODE_GENDER          304354 non-null  object 
 2   FLAG_OWN_CAR         304354 non-null  object 
 3   FLAG_OWN_REALTY      304354 non-null  object 
 4   CNT_CHILDREN         304354 non-null  int64  
 5   AMT_INCOME_TOTAL     304354 non-null  float64
 6   NAME_INCOME_TYPE     304354 non-null  object 
 7   NAME_EDUCATION_TYPE  304354 non-null  object 
 8   NAME_FAMILY_STATUS   304354 non-null  object 
 9   NAME_HOUSING_TYPE    304354 non-null  object 
 10  DAYS_EMPLOYED        304354 non-null  int64  
 11  FLAG_MOBIL           304354 non-null  int64  
 12  FLAG_WORK_PHONE      304354 non-null  int64  
 13  FLAG_PHONE           304354 non-null  int64  
 14  FLAG_EMAIL           304354 non-null  int64  
 15  OCCUPATION_TYPE  

In [45]:
application_details.loc[(application_details['DAYS_EMPLOYED'] >= 0), 'DAYS_EMPLOYED'] = 0
application_details['YEARS_EMPLOYED'] = np.ceil(pd.to_timedelta(application_details['DAYS_EMPLOYED'], unit='D').dt.days / -365.25)


In [46]:
application_details.drop('DAYS_EMPLOYED', axis=1, inplace=True)


In [47]:
# Checking Data after converting DAYS_BIRTH to AGES
application_details.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 304354 entries, 2 to 438556
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   304354 non-null  int64  
 1   CODE_GENDER          304354 non-null  object 
 2   FLAG_OWN_CAR         304354 non-null  object 
 3   FLAG_OWN_REALTY      304354 non-null  object 
 4   CNT_CHILDREN         304354 non-null  int64  
 5   AMT_INCOME_TOTAL     304354 non-null  float64
 6   NAME_INCOME_TYPE     304354 non-null  object 
 7   NAME_EDUCATION_TYPE  304354 non-null  object 
 8   NAME_FAMILY_STATUS   304354 non-null  object 
 9   NAME_HOUSING_TYPE    304354 non-null  object 
 10  FLAG_MOBIL           304354 non-null  int64  
 11  FLAG_WORK_PHONE      304354 non-null  int64  
 12  FLAG_PHONE           304354 non-null  int64  
 13  FLAG_EMAIL           304354 non-null  int64  
 14  OCCUPATION_TYPE      304354 non-null  object 
 15  CNT_FAM_MEMBERS  

In [48]:
def Cat_to_Num(features):
    for feature in features:
        feature_list = list(np.unique(application_details[feature]))
        feature_dict = {}
        for i in range(len(feature_list)):
                       feature_dict[feature_list[i]] = i
        application_details.replace({feature : feature_dict}, inplace=True)
        print(feature, '-->', feature_dict)

In [49]:
categorical_features = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 
                        'NAME_INCOME_TYPE', 'NAME_FAMILY_STATUS', 'OCCUPATION_TYPE',
                        'NAME_HOUSING_TYPE', 'NAME_EDUCATION_TYPE']
Cat_to_Num(categorical_features)

CODE_GENDER --> {'F': 0, 'M': 1}
FLAG_OWN_CAR --> {'N': 0, 'Y': 1}
FLAG_OWN_REALTY --> {'N': 0, 'Y': 1}
NAME_INCOME_TYPE --> {'Commercial associate': 0, 'Pensioner': 1, 'State servant': 2, 'Student': 3, 'Working': 4}
NAME_FAMILY_STATUS --> {'Civil marriage': 0, 'Married': 1, 'Separated': 2, 'Single / not married': 3, 'Widow': 4}
OCCUPATION_TYPE --> {'Accountants': 0, 'Cleaning staff': 1, 'Cooking staff': 2, 'Core staff': 3, 'Drivers': 4, 'HR staff': 5, 'High skill tech staff': 6, 'IT staff': 7, 'Laborers': 8, 'Low-skill Laborers': 9, 'Managers': 10, 'Medicine staff': 11, 'Private service staff': 12, 'Realty agents': 13, 'Sales staff': 14, 'Secretaries': 15, 'Security staff': 16, 'Waiters/barmen staff': 17}
NAME_HOUSING_TYPE --> {'Co-op apartment': 0, 'House / apartment': 1, 'Municipal apartment': 2, 'Office apartment': 3, 'Rented apartment': 4, 'With parents': 5}
NAME_EDUCATION_TYPE --> {'Academic degree': 0, 'Higher education': 1, 'Incomplete higher': 2, 'Lower secondary': 3, 'Seconda

In [50]:
application_details.head(20)

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,AGE,YEARS_EMPLOYED
2,5008806,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0
3,5008808,0,0,1,0,270000.0,0,4,3,1,1,0,1,1,14,1.0,53.0,9.0
4,5008809,0,0,1,0,270000.0,0,4,3,1,1,0,1,1,14,1.0,53.0,9.0
5,5008810,0,0,1,0,270000.0,0,4,3,1,1,0,1,1,14,1.0,53.0,9.0
6,5008811,0,0,1,0,270000.0,0,4,3,1,1,0,1,1,14,1.0,53.0,9.0
10,5008815,1,1,1,0,270000.0,4,1,1,1,1,1,1,1,0,2.0,47.0,3.0
11,5112956,1,1,1,0,270000.0,4,1,1,1,1,1,1,1,0,2.0,47.0,3.0
12,6153651,1,1,1,0,270000.0,4,1,1,1,1,1,1,1,0,2.0,47.0,3.0
13,5008819,1,1,1,0,135000.0,0,4,1,1,1,0,0,0,8,2.0,49.0,4.0
14,5008820,1,1,1,0,135000.0,0,4,1,1,1,0,0,0,8,2.0,49.0,4.0


In [51]:
credit_data_merged = application_details.merge(credit_records, how='inner', on=['ID'])

In [52]:
credit_data_merged.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,537667.0,5079231.0,42001.999788,5008806.0,5044925.0,5079091.0,5115755.0,5150487.0
CODE_GENDER,537667.0,0.3791101,0.485166,0.0,0.0,0.0,1.0,1.0
FLAG_OWN_CAR,537667.0,0.4304895,0.495145,0.0,0.0,0.0,1.0,1.0
FLAG_OWN_REALTY,537667.0,0.6425371,0.479253,0.0,0.0,1.0,1.0,1.0
CNT_CHILDREN,537667.0,0.5066965,0.787285,0.0,0.0,0.0,1.0,19.0
AMT_INCOME_TOTAL,537667.0,197117.1,104138.963465,27000.0,135000.0,180000.0,229500.0,1575000.0
NAME_INCOME_TYPE,537667.0,2.674957,1.774396,0.0,0.0,4.0,4.0,4.0
NAME_EDUCATION_TYPE,537667.0,3.053723,1.361281,0.0,1.0,4.0,4.0,4.0
NAME_FAMILY_STATUS,537667.0,1.290008,0.867066,0.0,1.0,1.0,1.0,4.0
NAME_HOUSING_TYPE,537667.0,1.308801,0.996071,0.0,1.0,1.0,1.0,5.0


In [53]:
credit_data_merged.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,...,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,AGE,YEARS_EMPLOYED,MONTHS_BALANCE,STATUS,APPROVAL_STATUS
0,5008806,1,1,1,0,112500.0,4,4,1,1,...,0,0,0,16,2.0,59.0,4.0,0,Good_Debt,1.0
1,5008806,1,1,1,0,112500.0,4,4,1,1,...,0,0,0,16,2.0,59.0,4.0,-1,Good_Debt,1.0
2,5008806,1,1,1,0,112500.0,4,4,1,1,...,0,0,0,16,2.0,59.0,4.0,-2,Good_Debt,1.0
3,5008806,1,1,1,0,112500.0,4,4,1,1,...,0,0,0,16,2.0,59.0,4.0,-3,Good_Debt,1.0
4,5008806,1,1,1,0,112500.0,4,4,1,1,...,0,0,0,16,2.0,59.0,4.0,-4,Good_Debt,1.0


In [78]:
credit_data_merged.drop('STATUS', axis=1, inplace=True)


In [79]:
credit_data_merged

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,AGE,YEARS_EMPLOYED,MONTHS_BALANCE,APPROVAL_STATUS
0,5008806,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,0,1.0
1,5008806,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,-1,1.0
2,5008806,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,-2,1.0
3,5008806,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,-3,1.0
4,5008806,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,-4,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537662,5150337,1,0,1,0,112500.0,4,4,3,4,1,0,0,0,8,1.0,26.0,4.0,-9,1.0
537663,5150337,1,0,1,0,112500.0,4,4,3,4,1,0,0,0,8,1.0,26.0,4.0,-10,
537664,5150337,1,0,1,0,112500.0,4,4,3,4,1,0,0,0,8,1.0,26.0,4.0,-11,
537665,5150337,1,0,1,0,112500.0,4,4,3,4,1,0,0,0,8,1.0,26.0,4.0,-12,1.0


In [80]:
 credit_approval_data = credit_data_merged.drop('ID', axis=1, inplace=False)

In [81]:
credit_approval_data

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,AGE,YEARS_EMPLOYED,MONTHS_BALANCE,APPROVAL_STATUS
0,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,0,1.0
1,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,-1,1.0
2,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,-2,1.0
3,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,-3,1.0
4,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,-4,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537662,1,0,1,0,112500.0,4,4,3,4,1,0,0,0,8,1.0,26.0,4.0,-9,1.0
537663,1,0,1,0,112500.0,4,4,3,4,1,0,0,0,8,1.0,26.0,4.0,-10,
537664,1,0,1,0,112500.0,4,4,3,4,1,0,0,0,8,1.0,26.0,4.0,-11,
537665,1,0,1,0,112500.0,4,4,3,4,1,0,0,0,8,1.0,26.0,4.0,-12,1.0


# Training , Featuring Credit approval

In [82]:
from sklearn.model_selection import train_test_split

In [83]:
credit_approval_data.isnull().sum()


CODE_GENDER               0
FLAG_OWN_CAR              0
FLAG_OWN_REALTY           0
CNT_CHILDREN              0
AMT_INCOME_TOTAL          0
NAME_INCOME_TYPE          0
NAME_EDUCATION_TYPE       0
NAME_FAMILY_STATUS        0
NAME_HOUSING_TYPE         0
FLAG_MOBIL                0
FLAG_WORK_PHONE           0
FLAG_PHONE                0
FLAG_EMAIL                0
OCCUPATION_TYPE           0
CNT_FAM_MEMBERS           0
AGE                       0
YEARS_EMPLOYED            0
MONTHS_BALANCE            0
APPROVAL_STATUS        6965
dtype: int64

In [84]:
credit_approval_data = credit_approval_data.dropna()

In [85]:
credit_approval_data

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,AGE,YEARS_EMPLOYED,MONTHS_BALANCE,APPROVAL_STATUS
0,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,0,1.0
1,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,-1,1.0
2,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,-2,1.0
3,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,-3,1.0
4,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,-4,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537660,1,0,1,0,112500.0,4,4,3,4,1,0,0,0,8,1.0,26.0,4.0,-7,1.0
537661,1,0,1,0,112500.0,4,4,3,4,1,0,0,0,8,1.0,26.0,4.0,-8,1.0
537662,1,0,1,0,112500.0,4,4,3,4,1,0,0,0,8,1.0,26.0,4.0,-9,1.0
537665,1,0,1,0,112500.0,4,4,3,4,1,0,0,0,8,1.0,26.0,4.0,-12,1.0


- Out of 1048575 records in credit_records, there are 530702 non-null entries that has matching ID in both application_records and credit_records

In [92]:
X = credit_approval_data.drop('APPROVAL_STATUS', axis=1)
y = credit_approval_data['APPROVAL_STATUS']
credit_approval_data['APPROVAL_STATUS'] = credit_approval_data['APPROVAL_STATUS'].astype('int')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  credit_approval_data['APPROVAL_STATUS'] = credit_approval_data['APPROVAL_STATUS'].astype('int')


In [93]:
credit_approval_data

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,AGE,YEARS_EMPLOYED,MONTHS_BALANCE,APPROVAL_STATUS
0,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,0,1
1,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,-1,1
2,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,-2,1
3,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,-3,1
4,1,1,1,0,112500.0,4,4,1,1,1,0,0,0,16,2.0,59.0,4.0,-4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537660,1,0,1,0,112500.0,4,4,3,4,1,0,0,0,8,1.0,26.0,4.0,-7,1
537661,1,0,1,0,112500.0,4,4,3,4,1,0,0,0,8,1.0,26.0,4.0,-8,1
537662,1,0,1,0,112500.0,4,4,3,4,1,0,0,0,8,1.0,26.0,4.0,-9,1
537665,1,0,1,0,112500.0,4,4,3,4,1,0,0,0,8,1.0,26.0,4.0,-12,1


In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [95]:
from sklearn.tree import DecisionTreeClassifier


In [96]:
DT_model = DecisionTreeClassifier()


In [97]:
DT_model.fit(X_train, y_train)


In [98]:
y_pred = DT_model.predict(X_test)


# Accuracy Score

In [99]:
from sklearn.metrics import accuracy_score

In [101]:
accuracy_score(y_test, y_pred)

0.9971798431012933

**0.997**

# LogisticRegressioin Model

In [61]:
# Import LogisticRegression
from sklearn.linear_model import LogisticRegression

# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression()


**Pre-processing data**


In [102]:
# Import MinMaxScaler
from sklearn.preprocessing import  MinMaxScaler

# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(feature_range=(0,1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.fit_transform(X_test)

**Train set from LogisticRegression**

In [103]:
logreg.fit(rescaledX_train,y_train)


**Predict and evaluate performance**

In [104]:
# Import confusion_matrix
from sklearn.metrics import confusion_matrix

# Use logreg to predict instances from the test set and store it
y_pred = logreg.predict(rescaledX_test)

# Get the accuracy score of logreg model and print it
print("Accuracy of logistic regression: ", logreg.score(rescaledX_test, y_test))

# Print the confusion matrix of the logreg model
print(confusion_matrix(y_test, y_pred))


Accuracy of logistic regression classifier:  0.9972112479665349
[[     0    444]
 [     0 158767]]
