In [55]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression, LogisticRegression
import warnings
warnings.filterwarnings('ignore')
import pprint
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

## 1. DATA PREPARATION

In [56]:
# Read the raw application table from the working directory, then print a
# structural summary of it (row count, column count, dtypes, memory usage).
proj_data = pd.read_csv(filepath_or_buffer='application_data.csv')
proj_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB


### DROPPING COLUMNS


In [57]:
# Columns removed before cleaning. The list is kept in its original order and
# is only grouped visually by naming pattern to make it reviewable.
columns_dropped = [
    # Application / contact / region attributes
    'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'CNT_CHILDREN', 'NAME_TYPE_SUITE',
    'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE',
    'FLAG_PHONE', 'FLAG_EMAIL',
    'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START',
    'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
    'LIVE_REGION_NOT_WORK_REGION',
    'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY',
    'LIVE_CITY_NOT_WORK_CITY',
    'ORGANIZATION_TYPE',
    # *_AVG columns
    'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG',
    'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG',
    'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG',
    'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG',
    'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG',
    # *_MODE columns (same stems as above)
    'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE',
    'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE',
    'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE',
    'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE',
    'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE',
    # *_MEDI columns (same stems as above)
    'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI',
    'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI',
    'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI',
    'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI',
    'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI',
    # Remaining *_MODE descriptors and phone-change recency
    'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE',
    'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE',
    'DAYS_LAST_PHONE_CHANGE',
    # FLAG_DOCUMENT_2 .. FLAG_DOCUMENT_21
    'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5',
    'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9',
    'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12',
    'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15',
    'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18',
    'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21',
    # AMT_REQ_CREDIT_BUREAU_* columns
    'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON',
    'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR',
]

# Build the working frame without the columns above; the raw frame is left
# untouched. Then print the schema of what remains.
processed_data = proj_data.drop(columns=columns_dropped)
processed_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 29 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   SK_ID_CURR                   307511 non-null  int64  
 1   TARGET                       307511 non-null  int64  
 2   FLAG_OWN_CAR                 307511 non-null  object 
 3   FLAG_OWN_REALTY              307511 non-null  object 
 4   AMT_INCOME_TOTAL             307511 non-null  float64
 5   AMT_CREDIT                   307511 non-null  float64
 6   AMT_ANNUITY                  307499 non-null  float64
 7   AMT_GOODS_PRICE              307233 non-null  float64
 8   NAME_INCOME_TYPE             307511 non-null  object 
 9   NAME_EDUCATION_TYPE          307511 non-null  object 
 10  NAME_FAMILY_STATUS           307511 non-null  object 
 11  NAME_HOUSING_TYPE            307511 non-null  object 
 12  REGION_POPULATION_RELATIVE   307511 non-null  float64
 13 

### DATA CLEANING 

#### Reformat to AGE

In [58]:
# Derive AGE in whole years from DAYS_BIRTH.
# Dividing by -365 flips the sign of the stored day count and converts it to
# years; the integer cast then truncates the fractional part.
age_in_years = processed_data['DAYS_BIRTH'].div(-365)
processed_data['AGE'] = age_in_years.astype(int)


### FILLING NA DATA

In [64]:
# Impute missing values without mutating the frame in place.
#   * numeric columns     -> that column's median
#   * all other columns   -> the literal placeholder 'Missing'
# 'number' selects every numeric dtype, not only the default-width float/int.
numerical_cols = processed_data.select_dtypes(include='number').columns
column_medians = processed_data[numerical_cols].median()

# fillna(Series) fills each column by matching label, so only the numeric
# columns are touched by the first call; the second call fills what is left.
processed_data = processed_data.fillna(column_medians).fillna('Missing')

# NOTE(review): in the summary below DAYS_EMPLOYED has a maximum of 365243
# while its quartiles are negative -- this looks like a placeholder value
# rather than a real day count; confirm and handle it before modelling.
processed_data.describe()


Unnamed: 0,SK_ID_CURR,TARGET,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,AGE
count,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,...,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0
mean,278180.518577,0.080729,168797.9,599026.0,27108.487841,538316.3,0.020868,-16036.995067,63815.045904,-4986.120328,...,2.052463,2.031521,0.504311,0.5145034,0.515695,1.417523,0.142944,1.400626,0.099717,43.435968
std,102790.175348,0.272419,237123.1,402490.8,14493.461065,369289.0,0.013831,4363.988632,141275.766519,3522.886321,...,0.509034,0.502737,0.139408,0.1908699,0.174736,2.398395,0.446033,2.377224,0.361735,11.954593
min,100002.0,0.0,25650.0,45000.0,1615.5,40500.0,0.00029,-25229.0,-17912.0,-24672.0,...,1.0,1.0,0.014568,8.173617e-08,0.000527,0.0,0.0,0.0,0.0,20.0
25%,189145.5,0.0,112500.0,270000.0,16524.0,238500.0,0.010006,-19682.0,-2760.0,-7479.5,...,2.0,2.0,0.505998,0.3929737,0.4171,0.0,0.0,0.0,0.0,34.0
50%,278202.0,0.0,147150.0,513531.0,24903.0,450000.0,0.01885,-15750.0,-1213.0,-4504.0,...,2.0,2.0,0.505998,0.5659614,0.535276,0.0,0.0,0.0,0.0,43.0
75%,367142.5,0.0,202500.0,808650.0,34596.0,679500.0,0.028663,-12413.0,-289.0,-2010.0,...,2.0,2.0,0.505998,0.6634218,0.636376,2.0,0.0,2.0,0.0,53.0
max,456255.0,1.0,117000000.0,4050000.0,258025.5,4050000.0,0.072508,-7489.0,365243.0,0.0,...,3.0,3.0,0.962693,0.8549997,0.89601,348.0,34.0,344.0,24.0,69.0


In [66]:
# Print the schema again after imputation to confirm that no column still
# reports fewer non-null values than the frame has rows.
processed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 30 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   SK_ID_CURR                   307511 non-null  int64  
 1   TARGET                       307511 non-null  int64  
 2   FLAG_OWN_CAR                 307511 non-null  object 
 3   FLAG_OWN_REALTY              307511 non-null  object 
 4   AMT_INCOME_TOTAL             307511 non-null  float64
 5   AMT_CREDIT                   307511 non-null  float64
 6   AMT_ANNUITY                  307511 non-null  float64
 7   AMT_GOODS_PRICE              307511 non-null  float64
 8   NAME_INCOME_TYPE             307511 non-null  object 
 9   NAME_EDUCATION_TYPE          307511 non-null  object 
 10  NAME_FAMILY_STATUS           307511 non-null  object 
 11  NAME_HOUSING_TYPE            307511 non-null  object 
 12  REGION_POPULATION_RELATIVE   307511 non-null  float64
 13 

### EXPORT TO CSV

In [67]:
# Persist the cleaned frame. index=False keeps the positional RangeIndex out
# of the file; otherwise it is written as an unnamed first column and comes
# back as a spurious 'Unnamed: 0' column when the CSV is read again.
# NOTE(review): any reader that currently expects or drops 'Unnamed: 0' must be
# updated together with this change.
processed_data.to_csv("processed_data.csv", index=False)