In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from IPython.display import clear_output
import pickle
import subprocess

In [2]:
DATA_DIR = "../input"

ID_COLUMN = 'Id'
TARGET_COLUMN = 'Response'
COLUMN_CHUNKSIZE = 20

TRAIN_NUMERIC = "{0}/train_numeric.csv".format(DATA_DIR)
TRAIN_DATE = "{0}/train_date.csv".format(DATA_DIR)

TEST_NUMERIC = "{0}/test_numeric.csv".format(DATA_DIR)
TEST_DATE = "{0}/test_date.csv".format(DATA_DIR)


In [3]:
#Make groups of columns. Number of columns in a group = COLUMN_CHUNKSIZE
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

In [None]:
#Get column names and column count
df = pd.read_csv(TRAIN_NUMERIC, nrows=1)
df = df.drop([ID_COLUMN,TARGET_COLUMN],axis = 1)
numeric_columns = df.columns.values
total_columns = len(numeric_columns)
column_groups = list(chunks(numeric_columns,COLUMN_CHUNKSIZE))

#Read file columnwise by column groups, impute and scale data, then write to csv.
#The output files will be merged by 'join.sh' shell script as it is more effective.
i = 0
for column_group in column_groups:

    #Adding Id to column group so the files can be easily merged
    column_group = np.insert(column_group,0,ID_COLUMN, axis=0)
    print(column_group)
    print("Reading..")
    tr = pd.read_csv(TRAIN_NUMERIC, header=0,usecols=column_group)
    te = pd.read_csv(TEST_NUMERIC, header=0,usecols=column_group)
    
    #Preserve ids
    tr_id = tr[ID_COLUMN]
    te_id = te[ID_COLUMN]
    print("Imputing..")
        
    #Fill NaN cells with median values
    imp = Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)
    tr[column_group] = imp.fit_transform(tr[column_group])
    te[column_group] = imp.transform(te[column_group])
        
    #Scale column with robust scaler as it is better if the data contains outliers
    scaler = RobustScaler()
    tr[column_group] = scaler.fit_transform(tr[column_group])
    te[column_group] = scaler.transform(te[column_group])
        
    #Set Ids 
    tr[ID_COLUMN] = tr_id
    te[ID_COLUMN] = te_id
        
    #Write the transformed data to csv
    path = '_'+str(i)+'.csv'            
    print("Writing to file: {0}".format(path))
    tr.to_csv('../workdir/train_num'+ path, header=True, index=False)
    te.to_csv('../workdir/test_num'+ path, header = True, index = False)            
    i = i + 1
    clear_output()
    print("{0} percent".format(i/(total_columns/COLUMN_CHUNKSIZE)*100))
print("Finished")

101.2396694214876 percent
Finished


In [None]:
df = pd.read_csv(TRAIN_NUMERIC, header=0,usecols=[ID_COLUMN,TARGET_COLUMN])
df.to_csv('../workdir/train_num_RESPONSE.csv', header=True, index=False)
subprocess.call('./joiner.sh')
subprocess.call('./joiner2.sh')