## Data Cleaning + Feature Engineering

In [1]:
import pandas as pd

In [2]:
# Load 'cleaned_train.csv' and print its dimensions as (rows, columns).
# NOTE(review): in the visible notebook `df` is reassigned from
# 'application_train_clean.csv' in the Cleaning cell, so this frame only
# feeds the shape printout below — confirm this load is still wanted.
df = pd.read_csv('cleaned_train.csv')
print(df.shape)

(263419, 152)


#### Cleaning

In [3]:
# Reload the application table; this replaces whatever `df` held before.
df = pd.read_csv('application_train_clean.csv')

# Remove the two columns this notebook treats as unnecessary.
df = df.drop(columns=['Unnamed: 0', 'SK_ID_CURR'])

# Show which columns are still object-typed (these are the ones encoded below).
print(df.columns[df.dtypes == object])

# One-hot encode every object column as integer indicators; drop_first=True
# omits the first level of each encoded column.
object_cols = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(
    df,
    columns=object_cols,
    drop_first=True,
    dtype=int,
)

Index(['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
       'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'WEEKDAY_APPR_PROCESS_START',
       'ORGANIZATION_TYPE'],
      dtype='object')


#### Feature Engineering

In [4]:
# Ratio features derived from the amount columns. These only read columns that
# this cell never overwrites, so recomputing them on a re-run is harmless.
df['DEBT_INC_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
df['ANNUITY_INCOME_RATIO'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']

# Flip the sign of the DAYS_* columns in place, but only once per `df`.
# AGE_YEARS is created further down in this cell, so its presence means the
# flip has already been applied. Without this guard, running the cell a second
# time would negate the columns back and produce negative ages.
if 'AGE_YEARS' not in df.columns:
    for day_col in ['DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH']:
        df[day_col] = df[day_col] * -1

# Convert day counts to years (365-day years).
# NOTE(review): if DAYS_EMPLOYED still holds large positive placeholder values
# at this point, they turn into large negative "years" here — confirm the
# upstream cleaning already handled them.
df['AGE_YEARS'] = df['DAYS_BIRTH'] / 365
df['EMPLOYED_YEARS'] = df['DAYS_EMPLOYED'] / 365

# Product of age and employment length, both in years.
df['AGE_EMPLOYMENT_INTERACTION'] = df['AGE_YEARS'] * df['EMPLOYED_YEARS']
# Income x credit interaction is intentionally left disabled:
#df['INC_CREDIT_INTERACTION'] = df['AMT_INCOME_TOTAL'] * df['AMT_CREDIT']

df.head()

Unnamed: 0,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_XNA,DEBT_INC_RATIO,INCOME_PER_PERSON,ANNUITY_INCOME_RATIO,AGE_YEARS,EMPLOYED_YEARS,AGE_EMPLOYMENT_INTERACTION
0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,9461,637,3648,...,0,0,0,0,2.007889,202500.0,0.121978,25.920548,1.745205,45.236682
1,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,16765,1188,1186,...,0,0,0,0,4.79075,135000.0,0.132217,45.931507,3.254795,149.497617
2,0,0,67500.0,135000.0,6750.0,135000.0,0.010032,19046,225,4260,...,0,0,0,0,2.0,67500.0,0.1,52.180822,0.616438,32.16626
3,0,0,121500.0,513000.0,21865.5,513000.0,0.028663,19932,3038,4311,...,0,0,0,0,4.222222,121500.0,0.179963,54.608219,8.323288,454.519917
4,0,0,99000.0,490495.5,27517.5,454500.0,0.035792,16941,1588,4970,...,0,0,0,0,4.9545,49500.0,0.277955,46.413699,4.350685,201.931379


In [5]:
# Write the engineered frame to CSV; index=False leaves the row index out of
# the file.
df.to_csv('cleaned_train_with_features.csv', index=False)