In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from IPython.display import clear_output
import pickle


# For pd.read_csv, always pass header=0 when you know row 0 is the header row.
#Selecting first n rows of numerical data

In [2]:
def normalizeNone(val):
    """Return ``val`` unchanged, or 0 when ``val`` is NaN.

    The other helpers in this cell pass a column median/mean through this
    function so that a NaN statistic is replaced by 0 before it is used.
    """
    return 0 if np.isnan(val) else val

def normalize_data(dataFrame):
    """Fill missing values in every column with that column's own median.

    The frame is modified in place and also returned. When a column's median
    is NaN, ``normalizeNone`` turns the fill value into 0.
    """
    for name in dataFrame:
        observed = dataFrame[name].dropna()
        fill_value = normalizeNone(observed.median())
        dataFrame[name] = dataFrame[name].fillna(fill_value)
    return dataFrame

def normalize_data_w_medians(dataFrame,medians):
    """Fill missing values using precomputed medians.

    ``medians`` is indexed by column name. For each column of ``dataFrame``
    the fill value is the mean of that column's non-missing entries in
    ``medians`` (0 if that mean is NaN). The frame is modified in place and
    also returned.
    """
    for name in dataFrame:
        stored_medians = medians[name].dropna()
        fill_value = normalizeNone(stored_medians.mean())
        dataFrame[name] = dataFrame[name].fillna(fill_value)
    return dataFrame

def findNa(data_frame,chunknum):
    """Compute the median of every column in ``data_frame``.

    Returns a dict mapping column name -> median of the non-missing values,
    with 0 substituted when that median is NaN. ``chunknum`` is accepted but
    not used.
    """
    return {
        name: normalizeNone(data_frame[name].dropna().median())
        for name in data_frame
    }
    
def addNanDummies(data_frame):
    """Add a ``<column>_isNan`` indicator column for every existing column.

    Each indicator holds 1 where the source value is missing and 0 elsewhere.
    The frame is modified in place and also returned.

    Uses the vectorized ``isnull()`` instead of a per-element ``np.isnan``
    lambda, which is much faster on large chunks and also works for
    non-numeric (object) columns, where ``np.isnan`` raises ``TypeError``.
    """
    # Snapshot the names first so the indicator columns added below are not
    # themselves visited.
    for column in list(data_frame.columns):
        data_frame[column + "_isNan"] = data_frame[column].isnull().astype('int64')
    return data_frame

def convertToBinary(data_frame):
    """Replace every column except ``'Id'`` with a missing-value indicator.

    After the call each non-Id column holds 1 where the original value was
    missing and 0 where a value was present. The frame is modified in place
    and also returned.

    Uses the vectorized ``isnull()`` instead of a per-element ``np.isnan``
    lambda, which is much faster on large chunks and also works for
    non-numeric (object) columns, where ``np.isnan`` raises ``TypeError``.
    """
    for column in data_frame:
        if column != 'Id':
            data_frame[column] = data_frame[column].isnull().astype('int64')
    return data_frame

def dateColumnToNumerical(column_name):
    """Rewrite the third '_'-separated field from ``D<n>`` to ``F<n-1>``.

    Example: ``'L0_S0_D1'`` -> ``'L0_S0_F0'``. All other fields are kept.
    """
    pieces = column_name.split('_')
    date_index = int(pieces[2].split('D')[1])
    pieces[2] = 'F{}'.format(date_index - 1)
    return '_'.join(pieces)

def numColumnToDate(column_name):
    """Rewrite the third '_'-separated field from ``F<n>`` to ``D<n+1>``.

    Example: ``'L0_S0_F0'`` -> ``'L0_S0_D1'``. All other fields are kept.
    """
    pieces = column_name.split('_')
    feature_index = int(pieces[2].split('F')[1])
    pieces[2] = 'D{}'.format(feature_index + 1)
    return '_'.join(pieces)

In [None]:
# Convert train_date.csv into missing-value indicators, one chunk at a time,
# and write the result to ../train_date_bin.csv.
filename='../input/train_date.csv'
chunk_size = 200000
# Presumably the number of rows in the input; used only for the progress estimate.
total_rows = 1183748
chunknum = 0
# Open the output once in write mode so that re-running this cell replaces the
# file instead of appending a second copy (with a second header row) to it.
with open('../train_date_bin.csv', 'w') as f:
    for chunk_index, chunk in enumerate(pd.read_csv(filename, header=0, chunksize=chunk_size)):
        chunk = convertToBinary(chunk)
        # Only the first chunk carries the header row.
        chunk.to_csv(f, header=(chunk_index == 0), index=False)
        chunknum = chunk_index + 1
        clear_output()
        # Progress is based on the real chunk size and capped at 100, since
        # the last chunk is usually only partly filled.
        print("%d percent" % min(100, int(100.0 * chunknum * chunk_size / total_rows)))

In [None]:
# Compute per-chunk column medians of the numeric training data and save them.
# Only train_numeric.csv is read here; merging it with the date/categorical
# frames was sketched earlier but is not done.

print("Finding medians of columns in chunks...")
chunk_medians = []

filename='../input/train_numeric.csv'
chunk_size = 200000
# Presumably the number of rows in the input; used only for the progress estimate.
total_rows = 1183748
chunknum = 0
for chunk_index, chunk in enumerate(pd.read_csv(filename, header=0, chunksize=chunk_size)):
    # One dict of {column: median} per chunk.
    chunk_medians.append(findNa(chunk, chunk_index))
    chunknum = chunk_index + 1
    clear_output()
    # Capped at 100 because the last chunk is usually only partly filled.
    print("%d percent" % min(100, int(100.0 * chunknum * chunk_size / total_rows)))

# One row per chunk, one column per feature.
medians = pd.DataFrame(chunk_medians)
# Use a context manager so the pickle file is flushed and closed.
with open("../medians.p", "wb") as medians_file:
    pickle.dump(medians, medians_file)

In [None]:
# Disabled exploration, kept inside a string literal so that it does not
# execute (the cell simply evaluates to the string). If run, it would read
# train_categorical.csv in chunks and collect every distinct non-null value
# found in any non-Id column into the set `columns`.
'''
filename='../input/train_categorical.csv'
chunknum = 1
columns = set()
for chunk in pd.read_csv(filename, header=0, chunksize=10000):
    for column in chunk.drop(['Id'],axis=1):
        uniqueVals = chunk[column].dropna().unique()
        for val in uniqueVals:
            columns.add(val)
    print("%d percent" % int((chunknum*10000/1183748)*100))
    chunknum = chunknum +1
columns
'''

In [4]:
# Reload the medians frame written to ../medians.p.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
# The context manager closes the file handle, which the bare open() did not.
with open("../medians.p", "rb") as medians_file:
    medians = pickle.load(medians_file)

In [None]:
#Train
models=[]
def train(df, random_state=0):
    """Fit one random forest on a chunk of training data.

    Missing values in ``df`` are filled via ``normalize_data_w_medians`` using
    the module-level ``medians`` frame, and a copy of ``Id`` is added as the
    ``Id2`` column (both modify ``df`` in place). Features are all columns
    except ``Id`` and ``Response``; the target is ``Response``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training chunk containing at least ``Id`` and ``Response`` columns.
    random_state : int or None, default 0
        Seed passed to ``RandomForestClassifier`` so that re-running the
        notebook produces the same models. Pass ``None`` for an unseeded fit.

    Returns
    -------
    None
        The fitted model is appended to the module-level ``models`` list.
    """
    print("Filling missing data with median values...")
    df = normalize_data_w_medians(df,medians)
    # 'Id' is dropped from the features below; 'Id2' keeps its values as a feature.
    df['Id2'] = df['Id']
    print("Preparing data for training...")
    X = df.drop(['Id','Response'],axis=1)
    y = df['Response']

    # Random Forests
    model = RandomForestClassifier(n_estimators=100, random_state=random_state)
    print("Training...")
    model.fit(X,y)
    models.append(model)

# Train one model per chunk of train_numeric.csv, then save the list of models.
print("Reading train data into memory in chunks...")
chunk_size = 200000
# Presumably the number of rows in the input; used only for the progress estimate.
total_rows = 1183748
chunknum = 0
for chunk_index, chunk in enumerate(pd.read_csv('../input/train_numeric.csv', header=0, chunksize=chunk_size)):
    train(chunk)
    chunknum = chunk_index + 1
    clear_output()
    # Capped at 100 because the last chunk is usually only partly filled.
    print("%d percent" % min(100, int(100.0 * chunknum * chunk_size / total_rows)))
# Use a context manager so the pickle file is flushed and closed.
with open("../models.p", "wb") as models_file:
    pickle.dump(models, models_file)

In [3]:
# Reload the list of trained models written to ../models.p.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
# The context manager closes the file handle, which the bare open() did not.
with open("../models.p", "rb") as models_file:
    models = pickle.load(models_file)

In [5]:
#Predict on the test data; results are collected in `submissions`, one frame per model

submissions = []

def process(df_test,firstChunk,models):
    """Predict one chunk of test data with every model.

    Missing values in ``df_test`` are filled via ``normalize_data_w_medians``
    using the module-level ``medians`` frame, and a copy of ``Id`` is added as
    ``Id2`` (both modify ``df_test`` in place). Features are all columns
    except ``Id``.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Test chunk containing an ``Id`` column.
    firstChunk : bool
        True for the first chunk: a new frame is appended to the module-level
        ``submissions`` list for each model. Otherwise the chunk's predictions
        are concatenated onto the existing frame at the model's position.
    models : list
        Fitted models exposing ``predict``.

    Returns
    -------
    None
    """
    df_test = normalize_data_w_medians(df_test,medians)
    df_test['Id2'] = df_test['Id']
    X_test = df_test.drop(['Id'],axis=1)
    model_count = len(models)
    # enumerate gives each model its own position; models.index(model) would
    # return the first match and so mis-route predictions for duplicate entries.
    for sub_index, model in enumerate(models):
        y_pred = model.predict(X_test)
        submission = pd.DataFrame({
            "Id":df_test['Id'],
            "Response":y_pred
        })
        print(sub_index/model_count)
        if firstChunk:
            submissions.append(submission)
        else:
            submissions[sub_index]=pd.concat([submissions[sub_index],submission])

# Run every model over test_numeric.csv, one chunk at a time.
print("Evaluating test data...")

filename='../input/test_numeric.csv'
chunk_size = 100000
# Presumably the number of rows in the input; used only for the progress estimate.
total_rows = 1183748
chunknum = 0
for chunk_index, chunk in enumerate(pd.read_csv(filename, header=0, chunksize=chunk_size)):
    process(chunk, chunk_index == 0, models)
    chunknum = chunk_index + 1
    clear_output()
    # Capped at 100: the last chunk is only partly filled, so the uncapped
    # estimate overshoots.
    print("%d percent" % min(100, int(100.0 * chunknum * chunk_size / total_rows)))

101 percent


In [6]:
# Save the per-model prediction frames.
# Use a context manager so the pickle file is flushed and closed.
with open("../submissions.p", "wb") as submissions_file:
    pickle.dump(submissions, submissions_file)

In [10]:
# Combine the per-model predictions into a single submission file.

# Give each model's prediction column a unique name so the frames can be merged.
column_names=[]
for i, df in enumerate(submissions):
    column_name = 'Response_'+str(i)
    column_names.append(column_name)
    df.columns=['Id',column_name]

# Outer-merge all per-model frames on Id.
final_submission = submissions[0]
for i in range(1,len(submissions)):
    final_submission = pd.merge(final_submission,submissions[i], on='Id', how='outer')

# A row is marked 1 when the mean of the models' predictions is above 0
# (i.e. at least one model predicted a positive value), otherwise 0.
mean_response = final_submission[column_names].mean(axis=1)
final_submission['Response'] = (mean_response > 0).astype('int64')
final_submission= final_submission.drop(column_names,axis=1)

# Write mode, not append: re-running this cell must replace the file rather
# than add a second header and duplicate rows to it.
with open('../submission.csv', 'w') as f:
    final_submission.to_csv(f, header=True, index=False)