In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from IPython.display import clear_output
import pickle

#common utilities for transforming data
from commons import Utilities

In [2]:
#Directory the input csv files are read from
DATA_DIR = "../input"

#Names of the id column and the target column
ID_COLUMN, TARGET_COLUMN = 'Id', 'Response'

#Chunk size constant (not referenced in the cells shown here)
CHUNKSIZE = 50000
#Number of columns handled together in one column group
COLUMN_CHUNKSIZE = 40

#Training input files
TRAIN_NUMERIC = "/".join([DATA_DIR, "train_numeric.csv"])
TRAIN_DATE = "/".join([DATA_DIR, "train_date.csv"])

#Test input files
TEST_NUMERIC = "/".join([DATA_DIR, "test_numeric.csv"])
TEST_DATE = "/".join([DATA_DIR, "test_date.csv"])


In [3]:
#Read a single row only to learn the header, then discard the Id column so that
#numeric_columns holds every remaining column name and total_columns their count
df = pd.read_csv(TRAIN_NUMERIC, nrows=1).drop(labels=[ID_COLUMN], axis=1)
numeric_columns = df.columns.values
total_columns = numeric_columns.shape[0]

In [4]:
#Make groups of columns. Number of columns in a group = COLUMN_CHUNKSIZE
def chunks(l, n):
    """Lazily split ``l`` into consecutive slices of at most ``n`` items.

    Every slice except possibly the last has exactly ``n`` items; the last one
    holds whatever remains. Nothing is yielded for an empty ``l``.
    """
    starts = range(0, len(l), n)
    for start in starts:
        stop = start + n
        yield l[start:stop]
        
#Materialise the column groups once so they can be iterated (and counted) later
column_groups = []
column_groups.extend(chunks(numeric_columns, COLUMN_CHUNKSIZE))

In [None]:
#Read the training file one column group at a time, impute and scale the feature
#columns, then write each group to its own csv.
#The output files will be merged by the 'join.sh' shell script as it is more effective,
#so every file carries the Id column as its merge key.
#NOTE: sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; when upgrading,
#replace it with sklearn.impute.SimpleImputer(strategy='mean').

#Columns written through untouched: imputing/scaling the merge key or the target
#would corrupt them.
passthrough_columns = (ID_COLUMN, TARGET_COLUMN)
group_count = len(column_groups)
for i, column_group in enumerate(column_groups):
    #Only these columns are imputed and scaled
    feature_columns = [name for name in column_group if name not in passthrough_columns]
    #Adding Id to column group so the files can be easily merged
    column_group = [ID_COLUMN] + [name for name in column_group if name != ID_COLUMN]
    print("Reading..")
    df = pd.read_csv(TRAIN_NUMERIC, header=0, usecols=column_group)
    #A group may consist solely of pass-through columns; there is nothing to fit then
    if feature_columns:
        print("Imputing..")
        #Fill NaN cells with mean values
        imp = Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
        imputed = imp.fit_transform(df[feature_columns])
        #Scale column with robust scaler as it is better if the data contains outliers
        scaler = RobustScaler()
        df[feature_columns] = scaler.fit_transform(imputed)
    #Write the transformed data to csv
    path = '../train_num_'+str(i)+'.csv'
    print("Writing to file: {0}".format(path))
    df.to_csv(path, header=True, index=False)
    clear_output()
    #Progress is measured against the real number of groups so it ends at exactly 100
    #(100.0 keeps the division floating-point on Python 2 as well)
    print("{0} percent".format(round(100.0 * (i + 1) / group_count, 1)))
print("Finished")

Reading..
Imputing..
Writing to file: ../train_num_0.csv
