In [3]:
%matplotlib notebook
import pandas as pd
import numpy as np
import math
from sklearn.externals import joblib
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import cross_val_score

from sklearn import decomposition, linear_model, grid_search, metrics, ensemble

In [4]:
# Read the training data from data/train.csv into train_df.
# The path is relative, so it is resolved against the notebook's working directory.
train_df = pd.read_csv('data/train.csv')

In [5]:
# Create the training dataset. The features (x) are every column except the target and the
# row ID; the target (y) is the target column on its own.
# (A bare `train_x.head()` used to sit between these two statements. Because it was not the
# last expression of the cell its result was discarded and never displayed, so it was removed.)
train_x = train_df.drop(['target','ID'], axis=1)
train_y = train_df.target

In [6]:
# Find which features hold categorical values (dtype = object) and which hold continuous
# values (dtype = float). Save the names of the categorical feature columns to
# categorical_columns.
g = train_x.columns.to_series().groupby(train_x.dtypes).groups
g_keys = list(g.keys())
# Select the object-dtype columns explicitly. Picking g[g_keys[0]] only worked when the
# object dtype happened to be the first key of the groups dict, and dict key order is not
# something to rely on.
categorical_columns = train_x.select_dtypes(include=['object']).columns.tolist()
# categorical_columns

In [7]:
# Look for class imbalance by taking the mean of the target column. The mean is ~0.76, so
# (for a 0/1 target) roughly 76% of the rows carry target value 1.
train_df['target'].mean()

0.76119872989214576

In [8]:
# Report how many distinct values each categorical feature takes (a missing value counts
# as one distinct value, because unique() keeps NaN).
for variable in categorical_columns:
    distinct_count = len(train_df[variable].unique())
    print(' '.join([variable, str(distinct_count)]))

v3 4
v22 18211
v24 5
v30 8
v31 4
v47 10
v52 13
v56 123
v66 3
v71 9
v74 3
v75 4
v79 18
v91 8
v107 8
v110 3
v112 23
v113 37
v125 91


## Variable 22 does not seem usable. It has 18,211 unique categorical values. I am going to drop it unless I find a better way to work with it.

In [9]:
# remove variable 22
# Rebind train_x instead of dropping in place, and only drop when the column is still there,
# so that re-running this cell does not raise a KeyError once v22 is already gone.
if 'v22' in train_x.columns:
    train_x = train_x.drop('v22', axis=1)
g = train_x.columns.to_series().groupby(train_x.dtypes).groups
g_keys = list(g.keys())
# Recompute the categorical column names by dtype rather than by "first key of the groups
# dict", whose position is not guaranteed.
categorical_columns = train_x.select_dtypes(include=['object']).columns.tolist()


In [10]:
# Define a function, dummify, that replaces a categorical feature with dummy columns. It returns the new dataset,
# the names of the dummy columns, and the positions of the rows where the categorical feature is null
def dummify(column_name,dataset):
    """Replace one categorical column of ``dataset`` with indicator (dummy) columns.

    Parameters
    ----------
    column_name : str
        Name of the categorical column to expand.
    dataset : pandas.DataFrame
        Frame that contains ``column_name``. It is not modified; a new frame
        is returned instead.

    Returns
    -------
    tuple
        ``(new_dataset, dummy_column_names, get_nulls)`` where

        * ``new_dataset`` is ``dataset`` without ``column_name`` and with the
          dummy columns appended on the right,
        * ``dummy_column_names`` is a numpy array holding the dummy column names,
        * ``get_nulls`` is a list of integer row positions at which
          ``column_name`` was null.
    """
    # Read the column from the frame that was passed in. The previous version read it from
    # the notebook-level `train_x`, so the function silently ignored its own argument.
    source_column = dataset[column_name]

    # The prefix already ends in '_' and get_dummies inserts its own '_' separator, so the
    # generated names look like 'v3__A'. That naming is kept so existing column names
    # (and anything keyed on them) stay the same.
    prefix_string = column_name + '_'
    dummies = pd.get_dummies(source_column, prefix=prefix_string)
    dummy_column_names = dummies.columns.values

    # Positions of the rows containing nulls. get_dummies does not create an indicator for
    # missing values, so these rows end up with all-zero dummy columns.
    get_nulls = np.flatnonzero(source_column.isnull().values).tolist()

    # Drop into a new frame rather than in place so the caller's frame is left intact.
    remaining = dataset.drop(column_name, axis=1)

    return pd.concat([remaining, dummies], axis=1), dummy_column_names, get_nulls



In [11]:
# Dummify every categorical feature in turn, starting from a copy of train_x. For each
# feature, keep the generated dummy column names and the list of null row positions in
# dictionaries keyed by the original feature name.
dummified_train_x = train_x.copy()
dummy_columns = {}
nulls_dict = {}
for column in categorical_columns:
    print('Dummifying ' + column)
    dummified_train_x, dummy_columns[column], nulls_dict[column] = dummify(column, dummified_train_x)


# Report the number of missing (null) values per categorical feature and collect the
# features that have at least one, since those will need to be filled, in
# variables_to_predict
variables_to_predict = []
for key, null_rows in nulls_dict.items():
    null_count = len(null_rows)
    print(key + ' ' + str(null_count))
    if null_count:
        variables_to_predict.append(key)

Dummifying v3
Dummifying v24
Dummifying v30
Dummifying v31
Dummifying v47
Dummifying v52
Dummifying v56
Dummifying v66
Dummifying v71
Dummifying v74
Dummifying v75
Dummifying v79
Dummifying v91
Dummifying v107
Dummifying v110
Dummifying v112
Dummifying v113
Dummifying v125
v30 60110
v31 3457
v91 3
v79 0
v24 0
v74 0
v66 0
v110 0
v71 0
v112 382
v113 55304
v56 6882
v107 3
v3 3457
v52 3
v47 0
v75 0
v125 77


In [12]:
# Sanity check: print the first row with the display limits temporarily changed
# (max_rows=999, max_columns=3) to confirm the dummified features are in place
with pd.option_context('display.max_rows', 999, 'display.max_columns', 3):
    print(dummified_train_x.iloc[0])

v1          1.335739e+00
v2          8.727474e+00
v4          3.921026e+00
v5          7.915266e+00
v6          2.599278e+00
v7          3.176895e+00
v8          1.294147e-02
v9          9.999999e+00
v10         5.032815e-01
v11         1.643411e+01
v12         6.085711e+00
v13         2.866830e+00
v14         1.163639e+01
v15         1.355013e+00
v16         8.571429e+00
v17         3.670350e+00
v18         1.067204e-01
v19         1.488831e-01
v20         1.886928e+01
v21         7.730923e+00
v23        -1.716131e-08
v25         1.394116e-01
v26         1.720818e+00
v27         3.393503e+00
v28         5.901219e-01
v29         8.880867e+00
v32         1.083033e+00
v33         1.010829e+00
v34         7.270147e+00
v35         8.375452e+00
v36         1.132659e+01
v37         4.545457e-01
v38         0.000000e+00
v39         4.012088e+00
v40         7.711453e+00
v41         7.653429e+00
v42         1.270758e+01
v43         2.015505e+00
v44         1.049834e+01
v45         9.848672e+00


In [13]:
# Define a function to predict missing categorical values
# Missing continuous values are mean-imputed inside the model pipeline
# May change in the future to also predict continuous variables
def predict_values(column_name,dataset,nulls_list, variable, force=False):
    """Fill the missing values of one categorical feature with model predictions.

    The rows listed in ``nulls_list`` are treated as the prediction set and all
    other rows as the training set. A mean-imputer + random-forest pipeline is
    either trained (and cached on disk) or loaded from the cache, and its
    predictions are written into ``dataset`` in place.

    Parameters
    ----------
    column_name : label or list of labels
        Target column(s) to predict; the caller in this notebook passes the
        list of dummy columns generated for ``variable``.
    dataset : pandas.DataFrame
        Frame holding both the target column(s) and the features. Modified in
        place: the target cells of the null rows are overwritten.
    nulls_list : list of int
        Row positions (0-based, as returned by ``dummify``) whose target
        values are missing.
    variable : str
        Name of the original feature; used in log messages and in the cache
        file name ``data/model/model_<variable>.pkl``.
    force : bool, optional
        When true, retrain even if a cached model file exists.

    Returns
    -------
    None
    """
    # Nothing to fill: skip training/loading altogether instead of failing when asked to
    # predict on an empty set of rows.
    if len(nulls_list) == 0:
        return

    # nulls_list holds row *positions*. Turn it into a boolean row mask once and use that
    # mask for selecting, excluding and assigning, so all three address the same rows even
    # when the frame's index is not the default 0..n-1 (mixing iloc with label-based
    # drop/loc is only correct for that default index).
    is_null_row = np.zeros(len(dataset), dtype=bool)
    is_null_row[nulls_list] = True

    dataset_test_features = dataset[is_null_row].drop(column_name,axis=1)

    dataset_train = dataset[~is_null_row]
    dataset_train_target = dataset_train[column_name]
    dataset_train_features = dataset_train.drop(column_name,axis=1)

    filename = 'data/model/model_' + variable+'.pkl'
    if force or not os.path.exists(filename):
        print('Training ' + variable)
        # Make sure the cache directory exists before the (slow) training starts, so the
        # fitted model is not lost to a failed dump afterwards.
        model_dir = os.path.dirname(filename)
        if not os.path.isdir(model_dir):
            os.makedirs(model_dir)
        estimator = Pipeline([("imputer", Imputer(strategy="mean",axis=0)),
                      ("forest", RandomForestRegressor(random_state=0,n_estimators=10))])
        estimator.fit(dataset_train_features, dataset_train_target)
        score = cross_val_score(estimator, dataset_train_features, dataset_train_target).mean()
        print("Score with the entire dataset = %.2f" % score)
        joblib.dump(estimator, filename)
    else:
        print('Loading ' + variable)
        # NOTE: joblib.load unpickles the file, which can execute arbitrary code. Only load
        # model files that this notebook produced itself.
        estimator = joblib.load(filename)

    dataset.loc[is_null_row,column_name] = estimator.predict(dataset_test_features)


In [None]:
# print dummified_train_x[dummy_columns['v3'].tolist()].iloc[nulls_dict['v3'][0:10]]

In [None]:
# predict all categorical missing values
for variable in variables_to_predict:
    predict_values(dummy_columns[variable].tolist(),dummified_train_x, nulls_dict[variable], variable)

Training v30
Score with the entire dataset = 0.14
          v1        v2        v4        v5        v6        v7        v8  \
2   0.943877  5.310079  4.410969  5.326159  3.979592  3.928571  0.019645   
4        NaN       NaN       NaN       NaN       NaN       NaN       NaN   
5        NaN       NaN       NaN  8.856791       NaN       NaN  0.359993   
8   2.078651  8.462619  3.739030  5.265636  1.573033  2.303371  0.015869   
12       NaN       NaN       NaN       NaN       NaN       NaN       NaN   

           v9       v10        v11   ...     v125__Q  v125__R  v125__S  \
2   12.666667  0.765864  14.756098   ...         0.0      0.0      0.0   
4         NaN  1.050328        NaN   ...         0.0      0.0      0.0   
5         NaN  1.050328        NaN   ...         0.0      0.0      0.0   
8   11.111111  4.463894  16.050955   ...         0.0      0.0      1.0   
12        NaN  1.312911        NaN   ...         0.0      0.0      0.0   

    v125__T  v125__U  v125__V  v125__W  v125__X 