In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np

import csv
import os
import sys

from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.externals import joblib
from sklearn.metrics import log_loss
import xgboost as xgb

import matplotlib.pyplot as plt
import mca

In [2]:
print('Load data...')

# Training set: split the label column off, then discard the row identifier
# so that only feature columns remain in `train`.
train = pd.read_csv('data/train.csv')
target = train.pop('target')
train = train.drop('ID', axis=1)

# Test set: keep the row identifiers as a plain array and leave only the
# feature columns in `test`.
test = pd.read_csv('data/test.csv')
ids = test.pop('ID').values

Load data...


In [3]:
def dummify(name, series):
    """One-hot encode a single column and report where its nulls are.

    Parameters
    ----------
    name : str
        Column name; ``name + '_'`` is handed to ``pd.get_dummies`` as the
        prefix of every generated indicator column.
    series : pandas.Series
        The values to encode.

    Returns
    -------
    tuple
        ``(dummies, dummy_column_names, get_nulls)`` where ``dummies`` is the
        indicator DataFrame, ``dummy_column_names`` is the array of its column
        labels and ``get_nulls`` is a list of the positional row numbers at
        which ``series`` is null.
    """
    encoded = pd.get_dummies(series, prefix=name + '_')

    # Null rows receive no indicator at all (every dummy is zero), so their
    # positions are recorded separately to keep them distinguishable.
    null_positions = series.isnull().values.nonzero()[0].tolist()

    return encoded, encoded.columns.values, null_positions

In [31]:
# Exploratory multiple correspondence analysis on two categorical columns,
# restricted to the first 500 training rows.
columns_mca = ['v107', 'v91']
mca_df = mca.mca(train.iloc[:500][columns_mca], columns_mca)

# NOTE(review): fs_r(1) presumably returns the row factor scores with the
# argument meaning "keep 100% of the variance" - confirm against the mca docs.
print(mca_df.fs_r(1))
mca_df.fs_r(1).shape

[[ 0.71740692 -0.11085994 -0.36322171 -0.5350364   0.58393567  3.39419876]
 [ 0.71740692 -0.7341013  -1.77000934  2.50179662  2.31945524 -1.54817725]
 [ 0.71740692  0.20408484  0.10705424 -2.94478259  0.17094468 -1.8055955 ]
 ..., 
 [ 0.71740692  0.20408484  0.10705424 -2.94478259  0.17094468 -1.8055955 ]
 [ 0.71740692  0.20408484  0.10705424 -2.94478259  0.17094468 -1.8055955 ]
 [ 0.71740692 -0.7341013  -1.77000934  2.50179662  2.31945524 -1.54817725]]


(500, 6)

In [74]:
def select_correlated_drops(frame, corr_matrix, threshold=0.9):
    """Choose which columns to discard from groups of highly correlated columns.

    Columns are visited in the order of ``corr_matrix.columns``. For each
    column that has not already been marked for removal, its "partners" are
    the other columns whose correlation with it is above ``threshold`` and
    which are not already marked. Among the column and its partners, the one
    with the fewest nulls in ``frame`` is kept (ties go to the earliest
    partner, the visited column being considered last) and the rest are
    marked for removal.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data used to count nulls per column.
    corr_matrix : pandas.DataFrame
        Square correlation matrix, e.g. ``frame.corr()``.
    threshold : float, optional
        Correlation strictly above this value counts as redundant. Only
        positive correlation is considered.

    Returns
    -------
    set
        Labels of the columns to drop.
    """
    # Count nulls once instead of once per candidate per group.
    na_counts = frame.isnull().sum()
    dropped = set()

    for col in corr_matrix.columns.values:
        if col in dropped:
            continue

        scores = corr_matrix[col]
        # Exclude the column itself by name, not by "correlation < 1": exact
        # duplicates correlate at 1.0 and must still count as partners.
        # Already-dropped columns are excluded so they can never be "kept".
        partners = [
            other for other in scores[scores > threshold].index.tolist()
            if other != col and other not in dropped
        ]
        if not partners:
            continue

        candidates = partners + [col]
        # min() returns the first minimum, preserving the original tie-break.
        best_col = min(candidates, key=lambda option: na_counts[option])
        losers = [option for option in candidates if option != best_col]

        print('dropping = ' + str(losers))
        print('keeping = ' + str(best_col))
        dropped.update(losers)

    return dropped


corr = train.corr()
to_drop = select_correlated_drops(train, corr)


220
112
v8: ['v25', 'v46', 'v63', 'v105']
v10: ['v12']
v11: ['v53']
v12: ['v10']
v13: ['v104']
v15: ['v32', 'v73']
v17: ['v64', 'v76']
v20: ['v65']
v25: ['v8', 'v46', 'v54', 'v63', 'v89', 'v105']
v26: ['v43', 'v60']
v29: ['v41', 'v67', 'v77', 'v96']
v32: ['v15', 'v73', 'v86']
v33: ['v55', 'v83', 'v111', 'v121']
v34: ['v114']
v41: ['v29', 'v49', 'v67', 'v96']
v43: ['v26', 'v116']
v46: ['v8', 'v25', 'v54', 'v63', 'v89', 'v105']
v49: ['v41']
v53: ['v11']
v54: ['v25', 'v46', 'v63', 'v89', 'v105']
v55: ['v33', 'v83']
v60: ['v26']
v63: ['v8', 'v25', 'v46', 'v54', 'v89', 'v105']
v64: ['v17', 'v76', 'v106']
v65: ['v20']
v67: ['v29', 'v41', 'v77']
v73: ['v15', 'v32']
v76: ['v17', 'v64']
v77: ['v29', 'v67']
v83: ['v33', 'v55', 'v111', 'v121']
v86: ['v32']
v89: ['v25', 'v46', 'v54', 'v63', 'v105']
v92: ['v95']
v95: ['v92']
v96: ['v29', 'v41']
v97: ['v118']
v104: ['v13']
v105: ['v8', 'v25', 'v46', 'v54', 'v63', 'v89']
v106: ['v64']
v108: ['v128']
v109: ['v128']
v111: ['v33', 'v83']
v114: ['v34']
v

None
0


In [5]:
print('Clearing...')

# Set to False to keep every column, including the highly correlated ones
# collected in `to_drop`.
drop_correlated = True

# Bookkeeping for each dummified column, keyed by the original column name:
# the generated dummy column labels and the positional indices of null rows.
train_dummy_columns = {}
train_nulls_dict = {}
test_dummy_columns = {}
test_nulls_dict = {}


cleaned_train = train.copy()
cleaned_test = test.copy()

# `to_drop` is only consulted when the correlation filter is enabled.
skipped_columns = set(to_drop) if drop_correlated else set()
if skipped_columns:
    cleaned_train = cleaned_train.drop(list(skipped_columns), axis=1)
    cleaned_test = cleaned_test.drop(list(skipped_columns), axis=1)

# Train and test columns are paired positionally, so both frames are assumed
# to share the same column order.
# NOTE(review): DataFrame.iteritems was removed in pandas 2.0 (use .items()).
for (train_name, train_series), (test_name, test_series) in zip(train.iteritems(), test.iteritems()):
    if train_name in skipped_columns:
        # Already removed above. Processing it would either fail (object
        # columns) or silently re-create it through .loc/column assignment.
        continue

    if train_name == 'v22':
        # v22 has too many options to dummify, instead: factorize.
        # Test values are mapped with the codes learned on train; nulls and
        # values unseen in train become -1.
        train_codes, tmp_indexer = pd.factorize(train_series)
        cleaned_train[train_name] = train_codes
        cleaned_test[test_name] = tmp_indexer.get_indexer(test_series)
    elif train_series.dtype == 'O':
        print('Dummifying ' + train_name)
        cleaned_train = cleaned_train.drop(train_name, axis=1)
        cleaned_test = cleaned_test.drop(train_name, axis=1)

        # NOTE(review): train and test are encoded independently, so their
        # dummy columns can differ when the category sets differ - confirm
        # this is reconciled before modelling.
        train_dummies, train_dummy_list, train_null_list = dummify(train_name, train_series)
        test_dummies, test_dummy_list, test_null_list = dummify(test_name, test_series)

        cleaned_train = pd.concat([cleaned_train, train_dummies], axis=1)
        cleaned_test = pd.concat([cleaned_test, test_dummies], axis=1)

        train_dummy_columns[train_name] = train_dummy_list
        train_nulls_dict[train_name] = train_null_list
        test_dummy_columns[test_name] = test_dummy_list
        test_nulls_dict[test_name] = test_null_list
    else:
        # Numeric column: impute nulls with the TRAIN mean in both frames so
        # that no statistic computed on the test set leaks into the features.
        train_mean = train_series.mean()
        if train_series.isnull().any():
            cleaned_train[train_name] = train_series.fillna(train_mean)
        if test_series.isnull().any():
            cleaned_test[test_name] = test_series.fillna(train_mean)

Clearing...
Dummifying v3
Dummifying v24
Dummifying v30
Dummifying v31
Dummifying v47
Dummifying v52
Dummifying v56
Dummifying v66
Dummifying v71
Dummifying v74
Dummifying v75
Dummifying v79
Dummifying v91
Dummifying v107
Dummifying v110
Dummifying v112
Dummifying v113
Dummifying v125


[[ 2.50645476  0.9426352   1.13715451 -0.18280678  0.70024693  1.80532343]
 [ 0.71285863 -0.36968589 -0.35129349 -2.04263397  0.70024693 -3.26952334]
 [-1.51327043 -1.67259667 -2.25775578 -0.59779855  0.70024693  1.84863394]
 ..., 
 [-1.51327043 -1.67259667 -2.25775578 -0.59779855  0.70024693  1.84863394]
 [-1.51327043 -1.67259667 -2.25775578 -0.59779855  0.70024693  1.84863394]
 [ 2.50645476  0.9426352   1.13715451 -0.18280678  0.70024693  1.80532343]]


(5000, 6)

['E' 'B' 'C' 'D' 'A' 'F' 'G' nan]
['A' 'B' 'G' 'C' 'F' 'E' 'D' nan]
