# Run this to create preprocessed_train.csv and preprocessed_test.csv
### Results will be placed in the same location as application_train.csv and application_test.csv

In [1]:
#imports
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns

from PathFinder import find_path

In [2]:
#read data
PATH = find_path()

#select files
files = ['application_train.csv','application_test.csv',]
"""         
          'bureau.csv', 
         'bureau_balance.csv', 'credit_card_balance.csv',
         'installments_payments.csv', 'POS_CASH_balance.csv',
         'previous_application.csv']
"""
data_files=[]
data=[]
print("files used: {}".format(files))

#actually read files
for file in files:
    stuff = pd.read_csv(PATH+file,header=0)
    #print(stuff)
    data_files.append(file)
    data.append(stuff)

path: data/
all files in directory: ['application_test.csv', 'application_train.csv', 'bureau.csv', 'bureau_balance.csv', 'credit_card_balance.csv', 'HomeCredit_columns_description.csv', 'installments_payments.csv', 'POS_CASH_balance.csv', 'preprocessed_test.csv', 'preprocessed_train.csv', 'previous_application.csv', 'sample_submission.csv']
files used: ['application_train.csv', 'application_test.csv']


In [3]:
#select and convert columns
original_train = data[0] #save the original because we'll need some stuff later
original_test = data[1]
app_train = data[0].copy()
app_test = data[1].copy()

#delete ID cols
del app_train['SK_ID_CURR']
del app_test['SK_ID_CURR']

int64cols = app_train.select_dtypes('int64').columns.values.tolist()
objectcols = app_train.select_dtypes('object').columns.values.tolist()
float64cols = app_train.select_dtypes('float64').columns.values.tolist()
#int64cols.remove("SK_ID_CURR")
#print('int cols: {}\n'.format(int64cols))
#print('dbl cols: {}\n'.format(float64cols))
#print('obj cols: {}\n'.format(objectcols))
#print('obj cols in detail:')
#for col in objectcols:
#    print('{}: {}'.format(col, data[0][col].unique()))
#print('\n')

#label encoding for unique <= 2
le = LabelEncoder()
le_count = 0
for col in app_train:
    if app_train[col].dtype == 'object':
        # If 2 or fewer unique categories
        if len(list(app_train[col].unique())) <= 2:
            # Train on the training data
            le.fit(app_train[col])
            # Transform both training and testing data
            app_train[col] = le.transform(app_train[col])
            #app_test[col] = le.transform(app_test[col])
            
            # Keep track of how many columns were label encoded
            le_count += 1
            
print('%d columns were label encoded.' % le_count)

#one hot encoding
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)
#print('new cols: {}'.format(app_train.columns.values.tolist())) #holy crap thats a lot of bs columns

app_train_y = app_train['TARGET']
new_cols_x = app_train.columns.values.tolist()
new_cols_x.remove('TARGET')
app_train_x = app_train[new_cols_x]

print('Training Features shape: ', app_train_x.shape) #241 cols
print('Testing Features shape: ', app_test.shape) #241 cols

3 columns were label encoded.
Training Features shape:  (307511, 241)
Testing Features shape:  (48744, 241)


In [4]:
# Align the training and testing data, keep only columns present in both dataframes
app_train_x, app_test = app_train_x.align(app_test, join = 'inner', axis = 1)

new_cols_x_2 = app_train_x.columns.values.tolist()
#print('input columns:', new_cols_x_2)

print('Training Features shape: ', app_train_x.shape) #2435 cols
print('Testing Features shape: ', app_test.shape) #235 cols

Training Features shape:  (307511, 235)
Testing Features shape:  (48744, 235)


In [5]:
#impute missing values
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy = 'median')

app_train_x = imputer.fit_transform(app_train_x)
app_test = imputer.transform(app_test)

In [6]:
#scale values
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (0, 1))
scaler.fit(app_train_x)
app_train_x = scaler.transform(app_train_x)
app_test = scaler.transform(app_test)

In [7]:
train = pd.DataFrame(data=app_train_x, columns=new_cols_x_2)
train['TARGET'] = app_train_y
train['SK_ID_CURR'] = original_train['SK_ID_CURR']
train.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,TARGET,SK_ID_CURR
0,0.0,0.001512,0.090287,0.090032,0.077441,0.256321,0.888839,0.045086,0.85214,0.705433,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,100002
1,0.0,0.002089,0.311736,0.132924,0.271605,0.045016,0.477114,0.043648,0.951929,0.959566,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,100003
2,0.0,0.000358,0.022472,0.020025,0.023569,0.134897,0.348534,0.046161,0.827335,0.648326,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,100004
3,0.0,0.000935,0.066837,0.109477,0.063973,0.107023,0.350846,0.038817,0.601451,0.661387,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,100006
4,0.0,0.000819,0.116854,0.078975,0.117845,0.39288,0.298591,0.03882,0.825268,0.519522,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,100007


In [8]:
overwrite = True
if overwrite:
    train.to_csv(PATH+'preprocessed_train.csv', index = False)

In [9]:
test = pd.DataFrame(data=app_test, columns=new_cols_x_2)
test['SK_ID_CURR'] = original_test['SK_ID_CURR']
test.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,SK_ID_CURR
0,0.0,0.000935,0.130787,0.073886,0.102132,0.257,0.337542,0.04067,0.790451,0.887175,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,100001
1,0.0,0.000627,0.044387,0.061443,0.034792,0.491595,0.40389,0.035085,0.630431,0.774489,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100005
2,0.0,0.001512,0.154373,0.26583,0.147026,0.260475,0.292616,0.035114,0.911843,0.513269,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100013
3,0.105263,0.002474,0.382022,0.184872,0.382716,0.361433,0.634329,0.041879,0.918936,0.415312,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,100028
4,0.052632,0.00132,0.144944,0.118761,0.145903,0.134897,0.687091,0.04103,0.837873,0.407809,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100038


In [10]:
overwrite = True
if overwrite:
    test.to_csv(PATH+'preprocessed_test.csv', index = False)