# Liberty Mutual Kaggle Competition Modeling and Prediction (2015)

## March 2, 2018

## Hiro Miyake

This notebook deals with data provided in the [Liberty Mutual Kaggle competition](https://www.kaggle.com/c/liberty-mutual-group-property-inspection-prediction) held in 2015. Exploratory data analysis is performed in the companion notebook.

# 1. Load modules and data

In [1]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

import re

import pandas as pd
import numpy as np

from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import SparsePCA
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

from xgboost.sklearn import XGBClassifier
from xgboost.sklearn import XGBRegressor



In [2]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# 2. Peek at the data and combine the training and test sets

In [3]:
train.head()

Unnamed: 0,Id,Hazard,T1_V1,T1_V2,T1_V3,T1_V4,T1_V5,T1_V6,T1_V7,T1_V8,...,T2_V6,T2_V7,T2_V8,T2_V9,T2_V10,T2_V11,T2_V12,T2_V13,T2_V14,T2_V15
0,1,1,15,3,2,N,B,N,B,B,...,2,37,1,11,6,Y,N,E,2,2
1,2,4,16,14,5,H,B,N,B,B,...,2,22,1,18,5,Y,Y,E,2,1
2,3,1,10,10,5,N,K,N,B,B,...,6,37,2,14,6,Y,Y,E,6,1
3,4,1,18,18,5,N,K,N,B,B,...,2,25,1,1,6,Y,N,C,2,6
4,5,1,13,19,5,N,H,N,B,B,...,1,22,1,2,7,N,N,E,1,1


In [4]:
train.tail()

Unnamed: 0,Id,Hazard,T1_V1,T1_V2,T1_V3,T1_V4,T1_V5,T1_V6,T1_V7,T1_V8,...,T2_V6,T2_V7,T2_V8,T2_V9,T2_V10,T2_V11,T2_V12,T2_V13,T2_V14,T2_V15
50994,101992,7,12,24,1,N,H,Y,B,B,...,4,25,2,9,7,Y,Y,E,4,1
50995,101993,4,12,17,4,N,K,N,B,B,...,2,37,1,4,7,N,N,A,5,2
50996,101994,3,18,7,5,W,B,Y,B,B,...,2,28,1,1,2,N,N,E,1,1
50997,101998,14,18,17,5,B,A,N,B,D,...,1,40,1,7,4,Y,N,C,2,6
50998,101999,9,5,15,3,B,I,Y,B,B,...,2,40,1,16,1,Y,N,E,5,4


In [5]:
test.head()

Unnamed: 0,Id,T1_V1,T1_V2,T1_V3,T1_V4,T1_V5,T1_V6,T1_V7,T1_V8,T1_V9,...,T2_V6,T2_V7,T2_V8,T2_V9,T2_V10,T2_V11,T2_V12,T2_V13,T2_V14,T2_V15
0,6,2,13,4,C,A,Y,B,B,D,...,2,28,1,22,6,Y,N,E,2,7
1,7,10,10,7,N,C,Y,B,B,D,...,3,28,1,4,3,Y,N,E,5,8
2,8,9,20,4,N,H,Y,B,B,E,...,2,22,1,1,7,N,N,C,6,1
3,9,11,18,2,N,H,Y,B,B,D,...,4,40,1,20,6,Y,N,E,5,5
4,10,4,5,4,H,K,Y,B,B,E,...,2,34,1,11,7,Y,Y,E,2,1


In [6]:
data = pd.concat([train.drop('Hazard', axis = 1), test], axis = 0)

## Note that in the above concatenation step, the indices are unchanged
## To reset the indices so that they make sense, take the tip from the following link
## and use the following line of code
## https://stackoverflow.com/questions/35084071/concat-dataframe-reindexing-only-valid-with-uniquely-valued-index-objects
data.reset_index(inplace=True, drop=True)

data.head(10)

Unnamed: 0,Id,T1_V1,T1_V2,T1_V3,T1_V4,T1_V5,T1_V6,T1_V7,T1_V8,T1_V9,...,T2_V6,T2_V7,T2_V8,T2_V9,T2_V10,T2_V11,T2_V12,T2_V13,T2_V14,T2_V15
0,1,15,3,2,N,B,N,B,B,D,...,2,37,1,11,6,Y,N,E,2,2
1,2,16,14,5,H,B,N,B,B,C,...,2,22,1,18,5,Y,Y,E,2,1
2,3,10,10,5,N,K,N,B,B,E,...,6,37,2,14,6,Y,Y,E,6,1
3,4,18,18,5,N,K,N,B,B,E,...,2,25,1,1,6,Y,N,C,2,6
4,5,13,19,5,N,H,N,B,B,E,...,1,22,1,2,7,N,N,E,1,1
5,12,14,12,2,N,K,N,B,B,E,...,1,37,1,5,7,N,N,A,1,9
6,15,8,17,1,E,K,N,B,B,E,...,2,25,1,20,3,Y,N,D,2,11
7,19,14,20,4,E,K,N,B,B,E,...,2,40,1,18,7,Y,N,E,3,2
8,21,8,2,2,W,C,N,D,B,D,...,1,34,1,13,5,N,N,A,2,1
9,22,5,4,3,B,I,N,D,B,F,...,4,40,1,6,3,Y,Y,E,4,1


# 3. Get the numerical columns

In [7]:
col_num = list(data.describe().columns)
data_num = data.copy()
for i in data:
    if i not in col_num:
        data_num.drop(i, axis = 1, inplace = True)
data_num.head()

Unnamed: 0,Id,T1_V1,T1_V2,T1_V3,T1_V10,T1_V13,T1_V14,T2_V1,T2_V2,T2_V4,T2_V6,T2_V7,T2_V8,T2_V9,T2_V10,T2_V14,T2_V15
0,1,15,3,2,7,15,1,36,11,10,2,37,1,11,6,2,2
1,2,16,14,5,12,10,3,78,10,17,2,22,1,18,5,2,1
2,3,10,10,5,12,15,1,71,21,13,6,37,2,14,6,6,1
3,4,18,18,5,3,15,1,71,13,15,2,25,1,1,6,2,6
4,5,13,19,5,7,10,1,75,10,11,1,22,1,2,7,1,1


Let's get summary statistics on the numerical columns.

In [8]:
data_num.describe()

Unnamed: 0,Id,T1_V1,T1_V2,T1_V3,T1_V10,T1_V13,T1_V14,T2_V1,T2_V2,T2_V4,T2_V6,T2_V7,T2_V8,T2_V9,T2_V10,T2_V14,T2_V15
count,101999.0,101999.0,101999.0,101999.0,101999.0,101999.0,101999.0,101999.0,101999.0,101999.0,101999.0,101999.0,101999.0,101999.0,101999.0,101999.0,101999.0
mean,51000.0,9.716693,12.866126,3.188531,7.035147,13.994353,1.581417,57.639849,12.43523,10.240002,1.950117,33.48826,1.032461,12.501387,4.489652,2.453269,3.483544
std,29444.719391,5.169488,6.242991,1.739655,3.593765,4.65434,0.863821,23.494165,4.802507,4.847544,0.796459,5.826087,0.194926,7.321876,1.895475,1.257543,3.071123
min,1.0,1.0,1.0,1.0,2.0,5.0,0.0,1.0,1.0,1.0,1.0,22.0,1.0,1.0,1.0,1.0,1.0
25%,25500.5,6.0,7.0,2.0,3.0,10.0,1.0,40.0,9.0,6.0,2.0,31.0,1.0,6.0,3.0,2.0,1.0
50%,51000.0,9.0,14.0,3.0,8.0,15.0,1.0,56.0,11.0,10.0,2.0,34.0,1.0,14.0,4.0,2.0,2.0
75%,76499.5,14.0,18.0,4.0,12.0,20.0,2.0,77.0,15.0,14.0,2.0,40.0,1.0,18.0,6.0,3.0,5.0
max,101999.0,19.0,24.0,9.0,12.0,20.0,4.0,100.0,39.0,22.0,7.0,40.0,3.0,25.0,7.0,7.0,12.0


Note that the values take on discrete values, so it's possible each value can be taken as a categorical variable. But for now, let's treat them as taking on continuous, numerical values.

We also see that the values take a range from almost 0 to almost 100. One transformation we could apply may be to take the logarithm of the values. Another approach which I will take is to rescale them so that they all lie roughly between 0 and 1.

In [9]:
X = data_num.iloc[:,1:]

#X = StandardScaler().fit_transform(X) ## Subtracts mean and rescales by variance
X = MaxAbsScaler().fit_transform(X) ## Scales max value to 1.0

#pca = SparsePCA(n_components=500)
#pca = PCA(n_components=500, svd_solver = 'randomized')
#X = pca.fit(X).transform(X)
#print pca.explained_variance_ratio_
#print 'Percent of variance explained: ' + str(100*sum(pca.explained_variance_ratio_)) +'%'

X = pd.DataFrame(X)
X.head()
#X.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.789474,0.125,0.222222,0.583333,0.75,0.25,0.36,0.282051,0.454545,0.285714,0.925,0.333333,0.44,0.857143,0.285714,0.166667
1,0.842105,0.583333,0.555556,1.0,0.5,0.75,0.78,0.25641,0.772727,0.285714,0.55,0.333333,0.72,0.714286,0.285714,0.083333
2,0.526316,0.416667,0.555556,1.0,0.75,0.25,0.71,0.538462,0.590909,0.857143,0.925,0.666667,0.56,0.857143,0.857143,0.083333
3,0.947368,0.75,0.555556,0.25,0.75,0.25,0.71,0.333333,0.681818,0.285714,0.625,0.333333,0.04,0.857143,0.285714,0.5
4,0.684211,0.791667,0.555556,0.583333,0.5,0.25,0.75,0.25641,0.5,0.142857,0.55,0.333333,0.08,1.0,0.142857,0.083333


In [10]:
data_num_f = pd.concat([data_num['Id'], X], axis=1)
data_num_f.head()

Unnamed: 0,Id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1,0.789474,0.125,0.222222,0.583333,0.75,0.25,0.36,0.282051,0.454545,0.285714,0.925,0.333333,0.44,0.857143,0.285714,0.166667
1,2,0.842105,0.583333,0.555556,1.0,0.5,0.75,0.78,0.25641,0.772727,0.285714,0.55,0.333333,0.72,0.714286,0.285714,0.083333
2,3,0.526316,0.416667,0.555556,1.0,0.75,0.25,0.71,0.538462,0.590909,0.857143,0.925,0.666667,0.56,0.857143,0.857143,0.083333
3,4,0.947368,0.75,0.555556,0.25,0.75,0.25,0.71,0.333333,0.681818,0.285714,0.625,0.333333,0.04,0.857143,0.285714,0.5
4,5,0.684211,0.791667,0.555556,0.583333,0.5,0.25,0.75,0.25641,0.5,0.142857,0.55,0.333333,0.08,1.0,0.142857,0.083333


# 4. One-hot-encode the categorical columns

First extract the categorical columns and create a dataframe only of categorical variables.

In [11]:
col_num = list(data.describe().columns)
data_cat = data.copy()
for i in col_num:
    if i != 'Id':
        data_cat.drop(i, axis = 1, inplace = True)
data_cat.head()

Unnamed: 0,Id,T1_V4,T1_V5,T1_V6,T1_V7,T1_V8,T1_V9,T1_V11,T1_V12,T1_V15,T1_V16,T1_V17,T2_V3,T2_V5,T2_V11,T2_V12,T2_V13
0,1,N,B,N,B,B,D,B,B,A,B,N,N,B,Y,N,E
1,2,H,B,N,B,B,C,B,B,A,B,Y,Y,C,Y,Y,E
2,3,N,K,N,B,B,E,H,B,A,R,Y,Y,C,Y,Y,E
3,4,N,K,N,B,B,E,H,B,A,R,N,N,A,Y,N,C
4,5,N,H,N,B,B,E,H,B,A,J,N,Y,B,N,N,E


In [12]:
cattot = 0
for i in data_cat:
    if i != 'Id':
        cattot += len(data_cat[i].unique())
        print 'Number of unique ' + i + ' values: ' + str(len(data_cat[i].unique()))
    
print 'Total number of categorical levels: ' + str(cattot)

Number of unique T1_V4 values: 8
Number of unique T1_V5 values: 10
Number of unique T1_V6 values: 2
Number of unique T1_V7 values: 4
Number of unique T1_V8 values: 4
Number of unique T1_V9 values: 6
Number of unique T1_V11 values: 12
Number of unique T1_V12 values: 4
Number of unique T1_V15 values: 8
Number of unique T1_V16 values: 18
Number of unique T1_V17 values: 2
Number of unique T2_V3 values: 2
Number of unique T2_V5 values: 6
Number of unique T2_V11 values: 2
Number of unique T2_V12 values: 2
Number of unique T2_V13 values: 5
Total number of categorical levels: 95


We see that there are 95 categorical levels. This is not so many, so we can try to include all of them for now.

In [13]:
for i in data_cat:
    if i != 'Id':
        j = pd.get_dummies(data_cat[i])
        data_cat = pd.concat([data_cat, j], axis=1)
        data_cat.drop(i, axis = 1, inplace = True)

data_cat.head()

Unnamed: 0,Id,B,C,E,G,H,N,S,W,A,...,F,N.1,Y,N.2,Y.1,A.1,B.1,C.1,D,E.1
0,1,0,0,0,0,0,1,0,0,0,...,0,0,1,1,0,0,0,0,0,1
1,2,0,0,0,0,1,0,0,0,0,...,0,0,1,0,1,0,0,0,0,1
2,3,0,0,0,0,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,1
3,4,0,0,0,0,0,1,0,0,0,...,0,0,1,1,0,0,0,1,0,0
4,5,0,0,0,0,0,1,0,0,0,...,0,1,0,1,0,0,0,0,0,1


In [14]:
data_cat_f = data_cat.copy()

# 5. Recombine the numerical and categorical variables

In [15]:
data_f = pd.concat([data_num_f, data_cat_f.iloc[:,1:]], axis=1)
data_f.head(10)

Unnamed: 0,Id,0,1,2,3,4,5,6,7,8,...,F,N,Y,N.1,Y.1,A,B,C,D,E
0,1,0.789474,0.125,0.222222,0.583333,0.75,0.25,0.36,0.282051,0.454545,...,0,0,1,1,0,0,0,0,0,1
1,2,0.842105,0.583333,0.555556,1.0,0.5,0.75,0.78,0.25641,0.772727,...,0,0,1,0,1,0,0,0,0,1
2,3,0.526316,0.416667,0.555556,1.0,0.75,0.25,0.71,0.538462,0.590909,...,0,0,1,0,1,0,0,0,0,1
3,4,0.947368,0.75,0.555556,0.25,0.75,0.25,0.71,0.333333,0.681818,...,0,0,1,1,0,0,0,1,0,0
4,5,0.684211,0.791667,0.555556,0.583333,0.5,0.25,0.75,0.25641,0.5,...,0,1,0,1,0,0,0,0,0,1
5,12,0.736842,0.5,0.222222,1.0,0.75,0.25,0.65,0.25641,0.636364,...,0,1,0,1,0,1,0,0,0,0
6,15,0.421053,0.708333,0.111111,0.666667,1.0,0.25,1.0,0.358974,0.727273,...,0,0,1,1,0,0,0,0,1,0
7,19,0.736842,0.833333,0.444444,0.25,0.75,0.25,0.83,0.333333,0.227273,...,0,0,1,1,0,0,0,0,0,1
8,21,0.421053,0.083333,0.222222,0.666667,0.25,0.25,0.2,0.307692,0.181818,...,0,1,0,1,0,1,0,0,0,0
9,22,0.263158,0.166667,0.333333,0.666667,1.0,0.75,0.88,0.179487,0.636364,...,0,0,1,0,1,0,0,0,0,1


# 6. Split the data back into training and test sets

In [16]:
dfdim = data_f.shape
data_f.columns = range(dfdim[1])
data_f.rename(columns={0: 'Id'}, inplace=True)
data_f.head()

Unnamed: 0,Id,1,2,3,4,5,6,7,8,9,...,102,103,104,105,106,107,108,109,110,111
0,1,0.789474,0.125,0.222222,0.583333,0.75,0.25,0.36,0.282051,0.454545,...,0,0,1,1,0,0,0,0,0,1
1,2,0.842105,0.583333,0.555556,1.0,0.5,0.75,0.78,0.25641,0.772727,...,0,0,1,0,1,0,0,0,0,1
2,3,0.526316,0.416667,0.555556,1.0,0.75,0.25,0.71,0.538462,0.590909,...,0,0,1,0,1,0,0,0,0,1
3,4,0.947368,0.75,0.555556,0.25,0.75,0.25,0.71,0.333333,0.681818,...,0,0,1,1,0,0,0,1,0,0
4,5,0.684211,0.791667,0.555556,0.583333,0.5,0.25,0.75,0.25641,0.5,...,0,1,0,1,0,0,0,0,0,1


Note from near the beginning that 50998 is the last index of the training set.

In [17]:
train_f = data_f.iloc[:50999,:]
test_f = data_f.iloc[50999:,:]

In [18]:
train_f.tail()

Unnamed: 0,Id,1,2,3,4,5,6,7,8,9,...,102,103,104,105,106,107,108,109,110,111
50994,101992,0.631579,1.0,0.111111,1.0,0.5,0.25,0.64,0.230769,0.272727,...,0,0,1,0,1,0,0,0,0,1
50995,101993,0.631579,0.708333,0.444444,0.25,0.75,0.25,0.75,0.25641,0.454545,...,0,1,0,1,0,1,0,0,0,0
50996,101994,0.947368,0.291667,0.555556,0.666667,1.0,0.5,0.33,0.333333,0.136364,...,0,1,0,1,0,0,0,0,0,1
50997,101998,0.947368,0.708333,0.555556,0.666667,0.5,0.25,0.35,0.282051,0.818182,...,0,0,1,1,0,0,0,1,0,0
50998,101999,0.263158,0.625,0.333333,0.666667,0.75,0.75,0.49,0.25641,0.272727,...,0,0,1,1,0,0,0,0,0,1


In [19]:
test_f.head()

Unnamed: 0,Id,1,2,3,4,5,6,7,8,9,...,102,103,104,105,106,107,108,109,110,111
50999,6,0.105263,0.541667,0.444444,1.0,0.75,0.5,0.48,0.282051,0.318182,...,0,0,1,1,0,0,0,0,0,1
51000,7,0.526316,0.416667,0.777778,0.666667,0.75,0.25,0.14,0.25641,0.636364,...,0,0,1,1,0,0,0,0,0,1
51001,8,0.473684,0.833333,0.444444,0.666667,1.0,0.5,0.19,0.307692,0.272727,...,0,1,0,1,0,0,0,1,0,0
51002,9,0.578947,0.75,0.222222,1.0,0.25,0.25,0.51,0.589744,0.454545,...,0,0,1,1,0,0,0,0,0,1
51003,10,0.210526,0.208333,0.444444,0.666667,0.75,0.75,0.75,0.230769,0.681818,...,0,0,1,0,1,0,0,0,0,1


In [20]:
train_f2 = pd.merge(train[['Id', "Hazard"]], train_f, on = "Id")
train_f2.head()

Unnamed: 0,Id,Hazard,1,2,3,4,5,6,7,8,...,102,103,104,105,106,107,108,109,110,111
0,1,1,0.789474,0.125,0.222222,0.583333,0.75,0.25,0.36,0.282051,...,0,0,1,1,0,0,0,0,0,1
1,2,4,0.842105,0.583333,0.555556,1.0,0.5,0.75,0.78,0.25641,...,0,0,1,0,1,0,0,0,0,1
2,3,1,0.526316,0.416667,0.555556,1.0,0.75,0.25,0.71,0.538462,...,0,0,1,0,1,0,0,0,0,1
3,4,1,0.947368,0.75,0.555556,0.25,0.75,0.25,0.71,0.333333,...,0,0,1,1,0,0,0,1,0,0
4,5,1,0.684211,0.791667,0.555556,0.583333,0.5,0.25,0.75,0.25641,...,0,1,0,1,0,0,0,0,0,1


# 7. Modeling and prediction

As can be seen from the exploratory data analysis, there are a few large `Hazard` values which appear only once in the training set. Apparently, the `train_test_split` module cannot deal with that, so we have to do something about those low frequency events if we want to be able to perform some sort of validation to prevent overfitting.

Here, I set all the `Hazard` values which occur only once to the same value, namely 50.

In [21]:
## Some tips on extracting information from value_counts()
## https://stackoverflow.com/questions/35364601/group-by-and-find-top-n-value-counts-pandas
## https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.nlargest.html
## https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.value_counts.html
## https://codereview.stackexchange.com/questions/149306/select-the-n-most-frequent-items-from-a-pandas-groupby-dataframe
## https://stackoverflow.com/questions/35523635/extract-values-in-pandas-value-counts

#print train_f2['Hazard'].value_counts() ## This shows there are 9 values which only occur once
#train_f2['Hazard'].value_counts().nsmallest(9)
hazvals = set(train_f2['Hazard'].value_counts().nsmallest(9).index)

In [22]:
def set_hazard(x):
    if x in hazvals:
        return 50
    else:
        return x

## https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.apply.html
## https://chrisalbon.com/python/data_wrangling/pandas_apply_operations_to_dataframes/
train_f2['Hazard'] = train_f2['Hazard'].apply(set_hazard)

## Check to see that the Hazard values with frequency 1 are appropriately set to 50
#train_f2['Hazard'].value_counts()

In [23]:
train_train, train_test = train_test_split(train_f2, train_size=0.7, 
                                                             random_state=0, stratify = train_f2['Hazard'])

The evaluation metric for this competition is the normalized Gini coefficient. The best place to find a Python implementation seems to be [here](https://www.kaggle.com/wcukierski/official-gini-metric-code-in-c). The block below also includes a few other sources of information.

In [24]:
## Tips on prediction with the normalized Gini coefficient
## https://en.wikipedia.org/wiki/Gini_coefficient
## https://www.kaggle.com/wcukierski/official-gini-metric-code-in-c
## https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703
## https://www.kaggle.com/c/liberty-mutual-fire-peril/discussion/9880
## https://www.kaggle.com/oxofff/gini-scorer-cv-gridsearch
## https://www.kaggle.com/jpopham91/gini-scoring-simple-and-efficient
## https://www.kaggle.com/rmealey/calculating-normalized-gini-coefficient
##
## https://www.kaggle.com/mathcass/tips-for-using-scikit-learn-for-evaluation
## http://scikit-learn.org/stable/modules/model_evaluation.html

## Gini code from
## https://www.kaggle.com/wcukierski/official-gini-metric-code-in-c
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)

def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

## Test case
#d = [5.1, 3.2, 1.7, 6.2, 8.1]
#p = [3.1, 5.2, 2.7, 5.1, 1.1]
#print gini_normalized(d,p)
#-0.33544303797468322

In [25]:
## http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
## http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
## Smaller C => stronger regularization. 10000 and 1000 makes no difference.
#model = linear_model.LinearRegression()
#model = linear_model.LogisticRegression(C = 10000, solver = 'sag', multi_class = 'multinomial', max_iter = 500)

## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
## max_depth controls for regularization; smaller the more regularization
#model = RandomForestClassifier(max_depth=5, random_state=0)
#model = RandomForestClassifier(max_depth = 30, random_state=0)
#model = RandomForestRegressor(max_depth = 5, random_state = 0)

## http://xgboost.readthedocs.io/en/latest/parameter.html
## http://xgboost.readthedocs.io/en/latest/python/python_api.html
#model = XGBClassifier(max_depth=10, learning_rate=1.0, n_estimators=100,
#                    objective='binary:logistic', subsample=1.0, colsample_bytree=0.6, seed=0)
#model = XGBClassifier(max_depth=10, learning_rate=1.0, n_estimators=100,
#                    objective='binary:logistic', subsample=1.0, colsample_bytree=0.6, seed=0, reg_lambda = 1000)
model = XGBRegressor(max_depth=10, learning_rate=1.0, n_estimators=100,
                    objective='reg:linear', subsample=1.0, colsample_bytree=0.6, seed=0, reg_lambda = 50000)

In [26]:
model.fit(train_train.iloc[:, 2:], train_train["Hazard"])

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.6,
       gamma=0, learning_rate=1.0, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=50000,
       scale_pos_weight=1, seed=0, silent=True, subsample=1.0)

In [27]:
pred_train = model.predict(train_train.iloc[:, 2:])
#pred_train = model.predict_proba(train_train.iloc[:, 2:])
score = gini_normalized(train_train["Hazard"], pred_train)
print 'Score for the training set: ' + str(score)

pred_train = model.predict(train_test.iloc[:, 2:])
#pred_train = model.predict_proba(train_test.iloc[:, 2:])
score = gini_normalized(train_test["Hazard"], pred_train)
print 'Score for the validation set: ' + str(score)

Score for the training set: 0.430335267262
Score for the validation set: 0.380274393322


In [28]:
test_f.head()

Unnamed: 0,Id,1,2,3,4,5,6,7,8,9,...,102,103,104,105,106,107,108,109,110,111
50999,6,0.105263,0.541667,0.444444,1.0,0.75,0.5,0.48,0.282051,0.318182,...,0,0,1,1,0,0,0,0,0,1
51000,7,0.526316,0.416667,0.777778,0.666667,0.75,0.25,0.14,0.25641,0.636364,...,0,0,1,1,0,0,0,0,0,1
51001,8,0.473684,0.833333,0.444444,0.666667,1.0,0.5,0.19,0.307692,0.272727,...,0,1,0,1,0,0,0,1,0,0
51002,9,0.578947,0.75,0.222222,1.0,0.25,0.25,0.51,0.589744,0.454545,...,0,0,1,1,0,0,0,0,0,1
51003,10,0.210526,0.208333,0.444444,0.666667,0.75,0.75,0.75,0.230769,0.681818,...,0,0,1,0,1,0,0,0,0,1


In [29]:
x = model.predict(test_f.iloc[:,1:])
#x = model.predict_proba(test_f.iloc[:,1:])
x = pd.DataFrame(x)

## Copy and paste column names from sample submission file
predcols = ["Hazard"]
x.columns = predcols
x = pd.concat([test['Id'], x], axis = 1)
x.head()

Unnamed: 0,Id,Hazard
0,6,2.91065
1,7,5.972525
2,8,6.440152
3,9,3.500306
4,10,3.284669


In [30]:
## Don't keep the indices
## https://stackoverflow.com/questions/16923281/pandas-writing-dataframe-to-csv-file
x.to_csv("submission/submit_1.csv", index=False)