## Data Cleaning + Feature Engineering

In [7]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [8]:
# Quick dimension check of the previously exported training table.
# NOTE(review): `df` is reassigned from a different CSV in the next cell, so
# only the printed shape of this load is ever used — confirm it is still needed.
df = pd.read_csv(filepath_or_buffer='cleaned_train.csv')
print((len(df.index), len(df.columns)))

(263419, 152)


#### Cleaning

In [9]:
# Load the cleaned application table and discard the columns that are not
# wanted downstream: 'Unnamed: 0' (presumably an index column written by an
# earlier export — verify) and the SK_ID_CURR column.
df = pd.read_csv('application_train_clean.csv').drop(
    columns=['Unnamed: 0', 'SK_ID_CURR']
)

# List the columns that are still object-typed before they are encoded.
print(df.columns[df.dtypes == object])

# One-hot encode every object-typed column as integer 0/1 indicators.
# drop_first=True omits the first level of each encoded column.
df = pd.get_dummies(
    df,
    columns=df.select_dtypes(include=['object']).columns,
    drop_first=True,
    dtype=int,
)

Index(['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
       'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'WEEKDAY_APPR_PROCESS_START',
       'ORGANIZATION_TYPE'],
      dtype='object')


#### Feature Engineering

In [10]:
# Build derived features from the unscaled columns, then min-max scale every
# column that has more than two distinct values.
#
# NOTE: this cell rewrites `df` in place. Running it a second time without
# re-running the load cell negates the DAYS_* columns again and recomputes the
# ratios from already-scaled values, so always re-run the load cell first.

# Ratio features.
# NOTE(review): a zero AMT_INCOME_TOTAL or CNT_FAM_MEMBERS would produce inf
# here, which scikit-learn's input validation rejects during scaling — confirm
# the upstream cleaning rules that out.
df['DEBT_INC_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
df['ANNUITY_INCOME_RATIO'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']

# Flip the sign of the day-offset columns.
# Assumes these are stored as negative offsets, so the flip makes them
# positive — TODO confirm against the source data.
for day_col in ('DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH'):
    df[day_col] = -df[day_col]

# Express age and employment length in years (365-day years) and combine them.
df['AGE_YEARS'] = df['DAYS_BIRTH'] / 365
df['EMPLOYED_YEARS'] = df['DAYS_EMPLOYED'] / 365
df['AGE_EMPLOYMENT_INTERACTION'] = df['AGE_YEARS'] * df['EMPLOYED_YEARS']

# Columns with at most two distinct values (e.g. 0/1 indicators) are left
# unscaled; everything else is mapped to [0, 1].
distinct_counts = df.nunique()
non_binary_columns = [name for name, count in distinct_counts.items() if count > 2]

# NOTE(review): the scaler is fit on every row of this frame; if a validation
# split is taken from the saved output later, its values will have influenced
# the min/max used here — confirm that is acceptable.
scaler = MinMaxScaler()
df[non_binary_columns] = scaler.fit_transform(df[non_binary_columns])

In [11]:
# Persist the engineered and scaled frame; index=False keeps the row index out
# of the file.
df.to_csv(path_or_buf='cleaned_train_with_features.csv', index=False)