# Mercedes-Benz Kaggle Competition Modeling and Prediction (2017)

## March 3, 2018

## Hiro Miyake

This notebook deals with data provided in the [Mercedes-Benz Kaggle competition](https://www.kaggle.com/c/mercedes-benz-greener-manufacturing) held in 2017. Exploratory data analysis is performed in the companion notebook.

# 1. Load modules and data

In [1]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

import re

import pandas as pd
import numpy as np

from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import SparsePCA
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import r2_score

from xgboost.sklearn import XGBClassifier
from xgboost.sklearn import XGBRegressor



In [2]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# 2. Look at the data and combine the train and test sets

In [3]:
train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [4]:
train.tail()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
4204,8405,107.39,ak,s,as,c,d,aa,d,q,...,1,0,0,0,0,0,0,0,0,0
4205,8406,108.77,j,o,t,d,d,aa,h,h,...,0,1,0,0,0,0,0,0,0,0
4206,8412,109.22,ak,v,r,a,d,aa,g,e,...,0,0,1,0,0,0,0,0,0,0
4207,8415,87.48,al,r,e,f,d,aa,l,u,...,0,0,0,0,0,0,0,0,0,0
4208,8417,110.85,z,r,ae,c,d,aa,g,w,...,1,0,0,0,0,0,0,0,0,0


In [5]:
test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [6]:
data = pd.concat([train.drop('y', axis = 1), test], axis = 0)

## Note that in the above concatenation step, the indices are unchanged
## To reset the indices so that they make sense, take the tip from the following link
## and use the following line of code
## https://stackoverflow.com/questions/35084071/concat-dataframe-reindexing-only-valid-with-uniquely-valued-index-objects
data.reset_index(inplace=True, drop=True)

data.head(10)

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,k,v,at,a,d,u,j,o,0,...,0,0,1,0,0,0,0,0,0,0
1,6,k,t,av,e,d,y,l,o,0,...,1,0,0,0,0,0,0,0,0,0
2,7,az,w,n,c,d,x,j,x,0,...,0,0,0,0,0,0,1,0,0,0
3,9,az,t,n,f,d,x,l,e,0,...,0,0,0,0,0,0,0,0,0,0
4,13,az,v,n,f,d,h,d,n,0,...,0,0,0,0,0,0,0,0,0,0
5,18,t,b,e,c,d,g,h,s,0,...,0,0,1,0,0,0,0,0,0,0
6,24,al,r,e,f,d,f,h,s,0,...,0,0,0,0,0,0,0,0,0,0
7,25,o,l,as,f,d,f,j,a,0,...,0,0,0,0,0,0,0,0,0,0
8,27,w,s,as,e,d,f,i,h,0,...,1,0,0,0,0,0,0,0,0,0
9,30,j,b,aq,c,d,f,a,e,0,...,0,0,1,0,0,0,0,0,0,0


# 3. Get a dataframe with just the numerical columns

In [7]:
col_num = list(data.describe().columns)
data_num = data.copy()
for i in data:
    if i not in col_num:
        data_num.drop(i, axis = 1, inplace = True)
data_num.head()

Unnamed: 0,ID,X10,X11,X12,X13,X14,X15,X16,X17,X18,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
1,6,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
2,7,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
data_num.describe()

Unnamed: 0,ID,X10,X11,X12,X13,X14,X15,X16,X17,X18,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
count,8418.0,8418.0,8418.0,8418.0,8418.0,8418.0,8418.0,8418.0,8418.0,8418.0,...,8418.0,8418.0,8418.0,8418.0,8418.0,8418.0,8418.0,8418.0,8418.0,8418.0
mean,4208.5,0.016156,0.000119,0.074721,0.059515,0.428011,0.000594,0.002613,0.008197,0.009028,...,0.322404,0.053457,0.313376,0.019957,0.010691,0.008078,0.008197,0.001069,0.000594,0.001544
std,2430.211616,0.126082,0.010899,0.262956,0.236601,0.49482,0.024366,0.051058,0.090169,0.094593,...,0.467425,0.224956,0.463893,0.139862,0.102851,0.089519,0.090169,0.032682,0.024366,0.03927
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2104.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4208.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6312.75,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8417.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


The values take on either 0 or 1, so standardization may not gain us much. However, dimensionality reduction could be fruitful.

In [9]:
X = data_num.iloc[:,1:]

#X = StandardScaler().fit_transform(X) ## Subtracts mean and rescales by variance
#X = MaxAbsScaler().fit_transform(X) ## Scales max value to 1.0

#pca = SparsePCA(n_components=500)
pca = PCA(n_components=75, svd_solver = 'randomized')
X = pca.fit(X).transform(X)
#print pca.explained_variance_ratio_
print 'Percent of variance explained: ' + str(100*sum(pca.explained_variance_ratio_)) +'%'

X = pd.DataFrame(X)
X.head()
#X.describe()

Percent of variance explained: 95.3261368413%


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,74
0,0.717951,2.261372,-1.075744,0.942154,1.223198,-0.059235,0.597748,-0.915438,0.079418,-0.666961,...,-0.1846,-0.054248,0.140438,-0.198112,-0.725767,-0.20316,0.335356,0.217429,0.098959,0.024777
1,-0.167951,0.086776,-1.362879,-0.681619,0.113824,-0.029079,1.249609,-0.50613,-0.122325,0.467179,...,0.124288,0.086874,0.099985,-0.040207,0.051182,-0.303477,0.046045,0.206801,0.059059,0.323974
2,-0.838104,2.166248,-2.132233,2.307177,-1.005637,3.47515,-0.848841,0.497725,-1.030428,-0.198819,...,-0.236898,0.116452,-0.568538,0.303333,0.231561,0.209301,-0.484207,-0.446478,-0.120786,0.098627
3,-0.437887,1.12992,-2.239765,2.670459,-1.764832,3.345078,0.15058,-0.030701,-1.107734,0.116922,...,-0.195656,0.090057,-0.244011,-0.194541,-0.226573,-0.20714,-0.229654,-0.230884,0.285179,-0.034904
4,-0.42094,0.880172,-2.147508,2.855561,-2.083166,3.374394,-0.098551,0.122238,-1.870083,-0.425676,...,-0.078499,0.067382,-0.510347,-0.355934,-0.210913,-0.078208,-0.231647,-0.102408,0.339083,-0.046096


Indeed, it appears that 75 of the top principal components is able to explain 95% of the variance. Let's try to predict with these columns.

Now join the `ID` with these columns.

In [10]:
data_num_f = pd.concat([data_num['ID'], X], axis=1)
data_num_f.head()

Unnamed: 0,ID,0,1,2,3,4,5,6,7,8,...,65,66,67,68,69,70,71,72,73,74
0,0,0.717951,2.261372,-1.075744,0.942154,1.223198,-0.059235,0.597748,-0.915438,0.079418,...,-0.1846,-0.054248,0.140438,-0.198112,-0.725767,-0.20316,0.335356,0.217429,0.098959,0.024777
1,6,-0.167951,0.086776,-1.362879,-0.681619,0.113824,-0.029079,1.249609,-0.50613,-0.122325,...,0.124288,0.086874,0.099985,-0.040207,0.051182,-0.303477,0.046045,0.206801,0.059059,0.323974
2,7,-0.838104,2.166248,-2.132233,2.307177,-1.005637,3.47515,-0.848841,0.497725,-1.030428,...,-0.236898,0.116452,-0.568538,0.303333,0.231561,0.209301,-0.484207,-0.446478,-0.120786,0.098627
3,9,-0.437887,1.12992,-2.239765,2.670459,-1.764832,3.345078,0.15058,-0.030701,-1.107734,...,-0.195656,0.090057,-0.244011,-0.194541,-0.226573,-0.20714,-0.229654,-0.230884,0.285179,-0.034904
4,13,-0.42094,0.880172,-2.147508,2.855561,-2.083166,3.374394,-0.098551,0.122238,-1.870083,...,-0.078499,0.067382,-0.510347,-0.355934,-0.210913,-0.078208,-0.231647,-0.102408,0.339083,-0.046096


# 4. One-hot-encode the categorical variables

Make a dataframe with just the categorical variables.

In [11]:
col_num = list(data.describe().columns)
data_cat = data.copy()
for i in col_num:
    if i != 'ID':
        data_cat.drop(i, axis = 1, inplace = True)
data_cat.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8
0,0,k,v,at,a,d,u,j,o
1,6,k,t,av,e,d,y,l,o
2,7,az,w,n,c,d,x,j,x
3,9,az,t,n,f,d,x,l,e
4,13,az,v,n,f,d,h,d,n


In [12]:
cattot = 0
for i in data_cat:
    if i != 'ID':
        cattot += len(data_cat[i].unique())
        print 'Number of unique ' + i + ' values: ' + str(len(data_cat[i].unique()))
    
print 'Total number of categorical levels: ' + str(cattot)

Number of unique X0 values: 53
Number of unique X1 values: 27
Number of unique X2 values: 50
Number of unique X3 values: 7
Number of unique X4 values: 4
Number of unique X5 values: 33
Number of unique X6 values: 12
Number of unique X8 values: 25
Total number of categorical levels: 211


We see that there are 211 categorical levels. We can one-hot-encode all of these for now.

In [13]:
for i in data_cat:
    if i != 'ID':
        j = pd.get_dummies(data_cat[i])
        data_cat = pd.concat([data_cat, j], axis=1)
        data_cat.drop(i, axis = 1, inplace = True)

data_cat.head()

Unnamed: 0,ID,a,aa,ab,ac,ad,ae,af,ag,ai,...,p,q,r,s,t,u,v,w,x,y
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Let's see if principal components may help us reduce the number of relevant dimensions.

In [14]:
X = data_cat.iloc[:,1:]

#X = StandardScaler().fit_transform(X) ## Subtracts mean and rescales by variance
#X = MaxAbsScaler().fit_transform(X) ## Scales max value to 1.0

#pca = SparsePCA(n_components=500)
pca = PCA(n_components=100, svd_solver = 'randomized')
X = pca.fit(X).transform(X)
#print pca.explained_variance_ratio_
print 'Percent of variance explained: ' + str(100*sum(pca.explained_variance_ratio_)) +'%'

X = pd.DataFrame(X)
X.head()
#X.describe()

Percent of variance explained: 96.2225308139%


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.056437,-0.420917,-0.707521,-0.183865,0.744039,-0.297652,-0.614486,0.048536,-0.40394,0.418802,...,0.188229,-0.057291,-0.043668,-0.024078,-0.022372,-0.02859,-0.047094,-0.036827,-0.003193,-0.015682
1,0.156071,-0.30128,-0.189298,0.282425,0.209564,0.388096,0.131926,0.546344,-0.094731,-0.179046,...,0.187302,-0.007001,-0.067578,-0.046349,0.016818,0.041418,0.071536,-0.05912,0.138822,0.16356
2,-0.485156,-0.28217,-0.464145,-0.519812,-0.349703,-0.493769,-0.204883,0.281649,-0.191837,0.123215,...,-0.09428,0.235884,-0.10027,-0.444968,-0.553974,-0.065895,-0.356201,-0.082956,-0.006997,-0.203266
3,0.816601,-0.509905,-0.182747,-0.040588,-0.119046,0.32131,0.292264,0.694768,-0.295718,0.094655,...,0.002011,0.009203,0.052809,0.055141,-0.054615,0.113368,0.165157,0.128665,-0.025774,0.181639
4,0.848282,-0.536222,-0.077314,-0.200791,-0.311704,0.554991,-0.571327,-0.119833,-0.571162,0.193082,...,0.006761,-0.056961,0.091119,0.008375,-0.087203,0.019636,0.04435,0.048591,-0.035425,0.012859


We see that 100 of the top principal components explains 96% of the variance. We can see how this works.

In [15]:
data_cat_f = pd.concat([data_cat['ID'], X], axis=1)
data_cat_f.head()

Unnamed: 0,ID,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,0,0.056437,-0.420917,-0.707521,-0.183865,0.744039,-0.297652,-0.614486,0.048536,-0.40394,...,0.188229,-0.057291,-0.043668,-0.024078,-0.022372,-0.02859,-0.047094,-0.036827,-0.003193,-0.015682
1,6,0.156071,-0.30128,-0.189298,0.282425,0.209564,0.388096,0.131926,0.546344,-0.094731,...,0.187302,-0.007001,-0.067578,-0.046349,0.016818,0.041418,0.071536,-0.05912,0.138822,0.16356
2,7,-0.485156,-0.28217,-0.464145,-0.519812,-0.349703,-0.493769,-0.204883,0.281649,-0.191837,...,-0.09428,0.235884,-0.10027,-0.444968,-0.553974,-0.065895,-0.356201,-0.082956,-0.006997,-0.203266
3,9,0.816601,-0.509905,-0.182747,-0.040588,-0.119046,0.32131,0.292264,0.694768,-0.295718,...,0.002011,0.009203,0.052809,0.055141,-0.054615,0.113368,0.165157,0.128665,-0.025774,0.181639
4,13,0.848282,-0.536222,-0.077314,-0.200791,-0.311704,0.554991,-0.571327,-0.119833,-0.571162,...,0.006761,-0.056961,0.091119,0.008375,-0.087203,0.019636,0.04435,0.048591,-0.035425,0.012859


# 5. Recombine the numerical and categorical variables

In [16]:
data_f = pd.concat([data_num_f, data_cat_f.iloc[:,1:]], axis=1)
data_f.head(10)

Unnamed: 0,ID,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,0,0.717951,2.261372,-1.075744,0.942154,1.223198,-0.059235,0.597748,-0.915438,0.079418,...,0.188229,-0.057291,-0.043668,-0.024078,-0.022372,-0.02859,-0.047094,-0.036827,-0.003193,-0.015682
1,6,-0.167951,0.086776,-1.362879,-0.681619,0.113824,-0.029079,1.249609,-0.50613,-0.122325,...,0.187302,-0.007001,-0.067578,-0.046349,0.016818,0.041418,0.071536,-0.05912,0.138822,0.16356
2,7,-0.838104,2.166248,-2.132233,2.307177,-1.005637,3.47515,-0.848841,0.497725,-1.030428,...,-0.09428,0.235884,-0.10027,-0.444968,-0.553974,-0.065895,-0.356201,-0.082956,-0.006997,-0.203266
3,9,-0.437887,1.12992,-2.239765,2.670459,-1.764832,3.345078,0.15058,-0.030701,-1.107734,...,0.002011,0.009203,0.052809,0.055141,-0.054615,0.113368,0.165157,0.128665,-0.025774,0.181639
4,13,-0.42094,0.880172,-2.147508,2.855561,-2.083166,3.374394,-0.098551,0.122238,-1.870083,...,0.006761,-0.056961,0.091119,0.008375,-0.087203,0.019636,0.04435,0.048591,-0.035425,0.012859
5,18,1.837444,-0.048876,1.757129,-0.053428,-0.965982,-0.449182,0.452631,-1.035907,-0.971496,...,-0.140986,0.171557,0.097124,0.080193,-0.20913,0.065664,0.12027,0.031378,0.10255,0.007594
6,24,0.549866,-2.015723,-1.410809,0.714424,-0.2131,-0.750852,-0.660139,1.017627,0.400373,...,-0.231546,-0.228565,0.082181,-0.23012,-0.063687,0.019958,-0.011272,-0.023914,-0.040013,0.118552
7,25,-2.50036,-0.264696,0.03385,-0.643495,-0.187746,-1.227553,-0.725444,-0.10733,-1.488354,...,-0.00644,-0.033932,0.002029,-0.008816,-0.054814,-0.008364,0.000244,0.034699,0.001626,-0.016372
8,27,-2.955056,0.915396,0.472139,-1.459801,0.937987,0.763746,-0.494152,0.32545,-0.270127,...,-0.052068,-0.013779,0.029205,-0.069185,0.000862,-0.019984,0.000581,-0.030302,0.014976,0.022377
9,30,1.560485,1.073928,2.200145,0.17662,-0.392351,-0.083017,0.110902,0.744905,0.116567,...,-0.180036,-0.213831,0.234515,-0.048079,-0.006026,-0.048633,-0.014961,0.041823,-0.101637,-0.045901


# 6. Split the data back into training and test sets

Rename the column names to consecutive numbers. This seems to solve a weird issue with xgboost.

In [17]:
dfdim = data_f.shape
data_f.columns = range(dfdim[1])
data_f.rename(columns={0: 'ID'}, inplace=True)
data_f.head()

Unnamed: 0,ID,1,2,3,4,5,6,7,8,9,...,166,167,168,169,170,171,172,173,174,175
0,0,0.717951,2.261372,-1.075744,0.942154,1.223198,-0.059235,0.597748,-0.915438,0.079418,...,0.188229,-0.057291,-0.043668,-0.024078,-0.022372,-0.02859,-0.047094,-0.036827,-0.003193,-0.015682
1,6,-0.167951,0.086776,-1.362879,-0.681619,0.113824,-0.029079,1.249609,-0.50613,-0.122325,...,0.187302,-0.007001,-0.067578,-0.046349,0.016818,0.041418,0.071536,-0.05912,0.138822,0.16356
2,7,-0.838104,2.166248,-2.132233,2.307177,-1.005637,3.47515,-0.848841,0.497725,-1.030428,...,-0.09428,0.235884,-0.10027,-0.444968,-0.553974,-0.065895,-0.356201,-0.082956,-0.006997,-0.203266
3,9,-0.437887,1.12992,-2.239765,2.670459,-1.764832,3.345078,0.15058,-0.030701,-1.107734,...,0.002011,0.009203,0.052809,0.055141,-0.054615,0.113368,0.165157,0.128665,-0.025774,0.181639
4,13,-0.42094,0.880172,-2.147508,2.855561,-2.083166,3.374394,-0.098551,0.122238,-1.870083,...,0.006761,-0.056961,0.091119,0.008375,-0.087203,0.019636,0.04435,0.048591,-0.035425,0.012859


Note from near the beginning that 4208 is the last index of the training set.

In [18]:
train_f = data_f.iloc[:4209,:]
test_f = data_f.iloc[4209:,:]

In [19]:
train_f.tail()

Unnamed: 0,ID,1,2,3,4,5,6,7,8,9,...,166,167,168,169,170,171,172,173,174,175
4204,8405,-2.236734,1.229688,0.505092,-1.166356,1.084891,0.683361,0.217062,0.004005,0.294123,...,-0.004997,0.05379,0.040451,-0.007071,0.05478,-0.052729,-0.006518,-0.024538,0.020475,0.002923
4205,8406,0.915265,0.247165,-0.153762,1.818164,1.373741,-0.572551,2.688097,-0.402615,-1.038147,...,0.272692,0.111142,0.135182,0.410525,-0.113444,-0.048761,0.003018,0.05269,0.097559,-0.241159
4206,8412,-1.105689,1.7174,0.093771,-0.167163,-0.986769,0.337386,2.079262,-1.841212,1.940837,...,-0.050375,0.010431,-0.037851,0.000447,0.028442,0.001483,0.020921,0.049098,-0.071751,-0.020702
4207,8415,0.533334,-2.664687,-1.409888,1.419753,-0.093037,-0.789834,-0.138047,-0.787004,-0.394408,...,-0.311256,-0.237852,0.044533,-0.18401,-0.080617,0.030263,-0.004842,-0.036734,-0.045118,0.117541
4208,8417,0.858533,-0.851817,-0.941311,-0.897606,-0.123301,-1.225487,0.615723,1.979975,-0.205655,...,-0.02004,-0.098495,0.003884,0.048855,-0.018704,-0.033705,-0.060223,-0.001187,0.042752,-0.107513


In [20]:
test_f.head()

Unnamed: 0,ID,1,2,3,4,5,6,7,8,9,...,166,167,168,169,170,171,172,173,174,175
4209,1,-0.326163,0.957147,-2.293272,2.974925,-1.998627,3.3806,-0.19861,0.057237,-1.835552,...,-0.03832,-0.052604,0.14167,0.032704,-0.046526,0.014409,0.049753,0.050497,-0.013039,0.031356
4210,2,3.695239,1.551435,0.533166,-0.64253,1.384706,0.138905,-1.548728,-1.313239,-1.098733,...,0.029458,0.08712,-0.037674,0.075705,0.006124,-0.033629,-0.021996,-0.007325,-0.062001,-0.011794
4211,3,-1.150464,0.767044,-0.621437,1.48427,-0.493003,3.146367,-0.553224,0.681075,-1.359664,...,-0.090191,0.100293,-0.109216,0.024992,0.405317,-0.086225,-0.086474,0.00728,-0.080522,-0.151748
4212,4,-0.320946,0.849468,-2.276091,2.873227,-2.133013,3.417245,-0.144051,0.003839,-1.85885,...,-0.016617,-0.14144,0.025018,0.054337,-0.16129,0.027068,0.024606,0.037784,0.038564,0.004283
4213,5,-2.890829,0.356232,0.864142,-1.54797,0.692868,0.294165,-0.110402,0.124403,-0.119714,...,0.011459,0.004775,0.014768,0.013238,0.025566,-0.015125,-0.01442,0.007955,-0.008795,-0.006528


Add back the target variable to the training set.

In [21]:
train_f2 = pd.merge(train[['ID', 'y']], train_f, on = 'ID')
train_f2.head()

Unnamed: 0,ID,y,1,2,3,4,5,6,7,8,...,166,167,168,169,170,171,172,173,174,175
0,0,130.81,0.717951,2.261372,-1.075744,0.942154,1.223198,-0.059235,0.597748,-0.915438,...,0.188229,-0.057291,-0.043668,-0.024078,-0.022372,-0.02859,-0.047094,-0.036827,-0.003193,-0.015682
1,6,88.53,-0.167951,0.086776,-1.362879,-0.681619,0.113824,-0.029079,1.249609,-0.50613,...,0.187302,-0.007001,-0.067578,-0.046349,0.016818,0.041418,0.071536,-0.05912,0.138822,0.16356
2,7,76.26,-0.838104,2.166248,-2.132233,2.307177,-1.005637,3.47515,-0.848841,0.497725,...,-0.09428,0.235884,-0.10027,-0.444968,-0.553974,-0.065895,-0.356201,-0.082956,-0.006997,-0.203266
3,9,80.62,-0.437887,1.12992,-2.239765,2.670459,-1.764832,3.345078,0.15058,-0.030701,...,0.002011,0.009203,0.052809,0.055141,-0.054615,0.113368,0.165157,0.128665,-0.025774,0.181639
4,13,78.02,-0.42094,0.880172,-2.147508,2.855561,-2.083166,3.374394,-0.098551,0.122238,...,0.006761,-0.056961,0.091119,0.008375,-0.087203,0.019636,0.04435,0.048591,-0.035425,0.012859


# 7. Modeling and prediction

In [22]:
## For continuous variable prediction
train_train, train_test = train_test_split(train_f2, train_size=0.7, 
                                                             random_state=0)
## For discrete variable prediction
#train_train, train_test = train_test_split(train_f2, train_size=0.7, 
#                                                             random_state=0, stratify = train_f2['y'])

In [23]:
## http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
## http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
## Smaller C => stronger regularization. 10000 and 1000 makes no difference.
#model = linear_model.LinearRegression()
#model = linear_model.LogisticRegression(C = 10000, solver = 'sag', multi_class = 'multinomial', max_iter = 500)

## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
## max_depth controls for regularization; smaller the more regularization
#model = RandomForestClassifier(max_depth=5, random_state=0)
#model = RandomForestClassifier(max_depth = 30, random_state=0)
#model = RandomForestRegressor(max_depth = 5, random_state = 0)

## http://xgboost.readthedocs.io/en/latest/parameter.html
## http://xgboost.readthedocs.io/en/latest/python/python_api.html
#model = XGBClassifier(max_depth=10, learning_rate=1.0, n_estimators=100,
#                    objective='binary:logistic', subsample=1.0, colsample_bytree=0.6, seed=0)
#model = XGBClassifier(max_depth=10, learning_rate=1.0, n_estimators=100,
#                    objective='binary:logistic', subsample=1.0, colsample_bytree=0.6, seed=0, reg_lambda = 1000)
model = XGBRegressor(max_depth=10, learning_rate=1.0, n_estimators=100,
                    objective='reg:linear', subsample=1.0, colsample_bytree=0.6, seed=0, reg_lambda = 9000)

In [24]:
model.fit(train_train.iloc[:, 2:], train_train['y'])

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.6,
       gamma=0, learning_rate=1.0, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=9000,
       scale_pos_weight=1, seed=0, silent=True, subsample=1.0)

In [25]:
## http://scikit-learn.org/stable/modules/model_evaluation.html

pred_train = model.predict(train_train.iloc[:, 2:])
#pred_train = model.predict_proba(train_train.iloc[:, 2:])
score = r2_score(train_train['y'], pred_train)
print 'Score for the training set: ' + str(score)

pred_train = model.predict(train_test.iloc[:, 2:])
#pred_train = model.predict_proba(train_test.iloc[:, 2:])
score = r2_score(train_test['y'], pred_train)
print 'Score for the validation set: ' + str(score)

Score for the training set: 0.723966881716
Score for the validation set: 0.467844316547


In [26]:
test_f.head()

Unnamed: 0,ID,1,2,3,4,5,6,7,8,9,...,166,167,168,169,170,171,172,173,174,175
4209,1,-0.326163,0.957147,-2.293272,2.974925,-1.998627,3.3806,-0.19861,0.057237,-1.835552,...,-0.03832,-0.052604,0.14167,0.032704,-0.046526,0.014409,0.049753,0.050497,-0.013039,0.031356
4210,2,3.695239,1.551435,0.533166,-0.64253,1.384706,0.138905,-1.548728,-1.313239,-1.098733,...,0.029458,0.08712,-0.037674,0.075705,0.006124,-0.033629,-0.021996,-0.007325,-0.062001,-0.011794
4211,3,-1.150464,0.767044,-0.621437,1.48427,-0.493003,3.146367,-0.553224,0.681075,-1.359664,...,-0.090191,0.100293,-0.109216,0.024992,0.405317,-0.086225,-0.086474,0.00728,-0.080522,-0.151748
4212,4,-0.320946,0.849468,-2.276091,2.873227,-2.133013,3.417245,-0.144051,0.003839,-1.85885,...,-0.016617,-0.14144,0.025018,0.054337,-0.16129,0.027068,0.024606,0.037784,0.038564,0.004283
4213,5,-2.890829,0.356232,0.864142,-1.54797,0.692868,0.294165,-0.110402,0.124403,-0.119714,...,0.011459,0.004775,0.014768,0.013238,0.025566,-0.015125,-0.01442,0.007955,-0.008795,-0.006528


In [27]:
x = model.predict(test_f.iloc[:,1:])
#x = model.predict_proba(test_f.iloc[:,1:])
x = pd.DataFrame(x)

## Copy and paste column names from sample submission file
predcols = ['y']
x.columns = predcols
x = pd.concat([test['ID'], x], axis = 1)
x.head()

Unnamed: 0,ID,y
0,1,83.027206
1,2,90.930641
2,3,75.935303
3,4,80.256821
4,5,112.415184


In [28]:
## Don't keep the indices
## https://stackoverflow.com/questions/16923281/pandas-writing-dataframe-to-csv-file
x.to_csv("submission/submit_1.csv", index=False)