# Otto Kaggle Competition Modeling and Prediction (2015)

## March 4, 2018

## Hiro Miyake

This notebook deals with data provided in the [Otto Kaggle competition](https://www.kaggle.com/c/otto-group-product-classification-challenge) held in 2015. Exploratory data analysis is performed in the companion notebook.

# 1. Load modules and data

In [1]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

import re

import pandas as pd
import numpy as np

from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import SparsePCA
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import r2_score

from xgboost.sklearn import XGBClassifier
from xgboost.sklearn import XGBRegressor



In [2]:
## Read the raw competition files: the labelled training set and the unlabelled test set.
train, test = (pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv"))

# 2. Look at the data and combine the training and test sets

In [3]:
## Preview the first five rows of the training set.
train.head(n=5)

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [4]:
## Preview the last five rows of the training set (also shows its final row index).
train.tail(n=5)

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
61873,61874,1,0,0,1,1,0,0,0,0,...,1,0,0,0,0,0,0,2,0,Class_9
61874,61875,4,0,0,0,0,0,0,0,0,...,0,2,0,0,2,0,0,1,0,Class_9
61875,61876,0,0,0,0,0,0,0,3,1,...,0,3,1,0,0,0,0,0,0,Class_9
61876,61877,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,3,10,0,Class_9
61877,61878,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,Class_9


In [5]:
## Preview the first five rows of the test set.
test.head(n=5)

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_84,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93
0,1,0,0,0,0,0,0,0,0,0,...,0,0,11,1,20,0,0,0,0,0
1,2,2,2,14,16,0,0,0,0,0,...,0,0,0,0,0,4,0,0,2,0
2,3,0,1,12,1,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,0,...,0,3,1,0,0,0,0,0,0,0
4,5,1,0,0,1,0,0,1,2,0,...,0,0,0,0,0,0,0,9,0,0


In [6]:
## Stack the training features (without the label column) on top of the test set.
train_features = train.drop('target', axis = 1)

## A plain concatenation keeps each frame's original row labels, so the labels
## would repeat. ignore_index=True renumbers the rows 0..N-1 in one step.
## Background on the duplicate-index problem:
## https://stackoverflow.com/questions/35084071/concat-dataframe-reindexing-only-valid-with-uniquely-valued-index-objects
data = pd.concat([train_features, test], axis = 0, ignore_index = True)

data.head(10)

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_84,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93
0,1,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1,0,0,1,6,1,5,0,0,...,22,0,1,2,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
5,6,2,1,0,0,7,0,0,0,0,...,1,0,3,0,0,0,0,2,0,0
6,7,2,0,0,0,0,0,0,2,0,...,0,1,1,0,0,0,0,0,0,1
7,8,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
8,9,0,0,0,0,0,0,0,4,0,...,0,0,2,0,0,0,0,0,0,1
9,10,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0


# 3. Dimensionality reduction via principal component analysis

At this point, we could feed the training data directly into a classification model, but that probably won't produce a competitive prediction score. One thing we can do first is apply principal component analysis (PCA) to reduce the dimensionality of the data.

In [7]:
## Every column after the leading `id` column is a feature.
X = data.iloc[:, 1:]

#X = StandardScaler().fit_transform(X) ## Subtracts mean and rescales by variance
X = MaxAbsScaler().fit_transform(X) ## Scales each feature so its max absolute value is 1.0

#pca = SparsePCA(n_components=500)
## The randomized SVD solver is stochastic, so pin random_state: otherwise the
## components (and every downstream score) change on each Restart & Run All.
pca = PCA(n_components=62, svd_solver='randomized', random_state=0)
X = pca.fit_transform(X)
#print(pca.explained_variance_ratio_)
## Parenthesised single-argument print works on both Python 2 and Python 3.
print('Percent of variance explained: ' + str(100*sum(pca.explained_variance_ratio_)) + '%')

X = pd.DataFrame(X)
X.head()
#X.describe()

Percent of variance explained: 95.233173125%


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,52,53,54,55,56,57,58,59,60,61
0,-0.080388,-0.054888,0.000263,0.116711,-0.138912,-0.157098,-0.071782,-0.010453,-0.055839,0.052705,...,-0.003398,-0.006944,-0.001317,0.02032,0.007817,-0.015948,-0.02623,0.00423,-0.012476,0.000916
1,-0.042854,-0.085779,-0.044636,-0.031031,-0.04456,0.023347,0.018084,0.011597,0.020912,-0.010579,...,-0.002228,-0.002333,-0.004145,0.00241,-0.003947,0.001274,0.001632,0.001707,-0.006285,-0.007666
2,-0.034233,-0.081152,-0.049383,-0.021483,-0.038846,0.03264,0.025734,-0.001056,0.01159,-0.011953,...,0.000983,0.001529,-0.004728,0.009682,0.002837,0.00086,-0.005345,0.00034,-0.016025,-0.004769
3,-0.057572,-0.015964,0.012541,0.0385,0.017188,0.060975,0.036503,-0.053254,0.010033,0.051078,...,-0.000334,0.034753,0.046969,-0.001832,-0.097993,-0.045172,0.043929,-0.052665,-0.007156,-0.011898
4,-0.040299,-0.082717,-0.046721,-0.017084,-0.046673,0.033739,0.045862,-0.01143,0.003178,-0.003637,...,0.011387,0.003588,-0.022563,0.040924,0.003033,-0.021935,-0.006573,-0.002763,-0.010454,0.01116


We see that the top 62 principal components explain about 95% of the variance of the data.

In [8]:
## Put the `id` column back in front of the principal-component columns.
## Both pieces share the same 0..N-1 row index, so they line up row by row.
id_column = data['id']
data_f = pd.concat([id_column, X], axis=1)
data_f.head()

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,52,53,54,55,56,57,58,59,60,61
0,1,-0.080388,-0.054888,0.000263,0.116711,-0.138912,-0.157098,-0.071782,-0.010453,-0.055839,...,-0.003398,-0.006944,-0.001317,0.02032,0.007817,-0.015948,-0.02623,0.00423,-0.012476,0.000916
1,2,-0.042854,-0.085779,-0.044636,-0.031031,-0.04456,0.023347,0.018084,0.011597,0.020912,...,-0.002228,-0.002333,-0.004145,0.00241,-0.003947,0.001274,0.001632,0.001707,-0.006285,-0.007666
2,3,-0.034233,-0.081152,-0.049383,-0.021483,-0.038846,0.03264,0.025734,-0.001056,0.01159,...,0.000983,0.001529,-0.004728,0.009682,0.002837,0.00086,-0.005345,0.00034,-0.016025,-0.004769
3,4,-0.057572,-0.015964,0.012541,0.0385,0.017188,0.060975,0.036503,-0.053254,0.010033,...,-0.000334,0.034753,0.046969,-0.001832,-0.097993,-0.045172,0.043929,-0.052665,-0.007156,-0.011898
4,5,-0.040299,-0.082717,-0.046721,-0.017084,-0.046673,0.033739,0.045862,-0.01143,0.003178,...,0.011387,0.003588,-0.022563,0.040924,0.003033,-0.021935,-0.006573,-0.002763,-0.010454,0.01116


# 4. Split the data back into training and test sets

Rename the columns to consecutive integers, keeping `id` as the name of the first column. This seems to work around a weird issue with xgboost.

In [9]:
## Label the first column 'id' and number the remaining columns 1..N-1 in a single assignment.
dfdim = data_f.shape
data_f.columns = ['id'] + list(range(1, dfdim[1]))
data_f.head()

Unnamed: 0,id,1,2,3,4,5,6,7,8,9,...,53,54,55,56,57,58,59,60,61,62
0,1,-0.080388,-0.054888,0.000263,0.116711,-0.138912,-0.157098,-0.071782,-0.010453,-0.055839,...,-0.003398,-0.006944,-0.001317,0.02032,0.007817,-0.015948,-0.02623,0.00423,-0.012476,0.000916
1,2,-0.042854,-0.085779,-0.044636,-0.031031,-0.04456,0.023347,0.018084,0.011597,0.020912,...,-0.002228,-0.002333,-0.004145,0.00241,-0.003947,0.001274,0.001632,0.001707,-0.006285,-0.007666
2,3,-0.034233,-0.081152,-0.049383,-0.021483,-0.038846,0.03264,0.025734,-0.001056,0.01159,...,0.000983,0.001529,-0.004728,0.009682,0.002837,0.00086,-0.005345,0.00034,-0.016025,-0.004769
3,4,-0.057572,-0.015964,0.012541,0.0385,0.017188,0.060975,0.036503,-0.053254,0.010033,...,-0.000334,0.034753,0.046969,-0.001832,-0.097993,-0.045172,0.043929,-0.052665,-0.007156,-0.011898
4,5,-0.040299,-0.082717,-0.046721,-0.017084,-0.046673,0.033739,0.045862,-0.01143,0.003178,...,0.011387,0.003588,-0.022563,0.040924,0.003033,-0.021935,-0.006573,-0.002763,-0.010454,0.01116


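Recall from the `train.tail()` output in Section 2 that 61877 is the last index of the training set, so the first 61878 rows of the combined frame are training rows and the remaining rows are test rows.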

In [10]:
## The combined frame was built as [training rows, test rows], so the first
## len(train) rows are the training set and everything after is the test set.
## Taking the boundary from `train` avoids a hard-coded row count that would
## silently go wrong if the input files changed.
n_train = len(train)
train_f = data_f.iloc[:n_train, :]
test_f = data_f.iloc[n_train:, :]

## Sanity check: the two pieces must match the sizes of the original files.
assert len(train_f) == len(train) and len(test_f) == len(test)

In [11]:
## The last training rows should end with the final training id.
train_f.tail(n=5)

Unnamed: 0,id,1,2,3,4,5,6,7,8,9,...,53,54,55,56,57,58,59,60,61,62
61873,61874,-0.000156,0.285013,0.103947,0.408947,0.105172,-0.064181,0.51453,-0.102922,-0.084646,...,-0.028118,0.011719,0.012257,0.019661,-0.02889,-0.054981,-0.007047,0.015395,-0.02526,-0.046039
61874,61875,0.027126,-0.05445,-0.003212,0.036023,-0.021657,0.027721,0.108835,-0.07944,-0.070588,...,0.009891,0.000336,-0.008338,-0.002843,0.017462,0.007525,0.009587,0.005233,0.010318,-0.006377
61875,61876,-0.005908,-0.046798,-0.068329,0.021501,0.002643,0.030634,0.052818,-0.024701,-0.049909,...,0.00145,-0.027726,0.019047,-0.028492,0.000814,-0.044026,-0.016314,0.005915,-0.008943,0.022589
61876,61877,-0.044961,-0.011202,-0.007306,0.048706,-0.055011,-0.047318,0.174984,0.004296,-0.04022,...,-0.035482,0.030898,0.002228,-0.014247,0.012011,-0.001542,-0.030457,0.016318,0.010199,0.003899
61877,61878,-0.063456,-0.007543,-0.017685,0.086339,-0.047817,-0.043247,0.078775,-0.014699,-0.037719,...,0.001321,-0.024996,-0.008932,0.040444,-0.000226,0.004508,-0.005932,-0.00319,0.029146,-0.029206


In [12]:
## The first test rows should start again at id 1.
test_f.head(n=5)

Unnamed: 0,id,1,2,3,4,5,6,7,8,9,...,53,54,55,56,57,58,59,60,61,62
61878,1,0.169923,-0.018631,0.010947,-0.026279,0.043012,0.030702,-0.128572,-0.208779,0.278977,...,0.004137,0.008532,-0.005558,0.002596,-0.007318,-0.008223,0.012361,-0.002858,-0.000681,-0.001773
61879,2,-0.164093,0.109485,0.088914,-0.034438,0.047012,-0.0522,0.043509,-0.010193,0.046293,...,-0.047771,0.001986,0.013338,0.001199,0.037975,0.001855,0.033542,0.004787,-0.007739,0.020693
61880,3,-0.208948,0.161301,0.149074,-0.061266,0.038401,-0.014625,-0.014028,-0.013389,0.025226,...,-0.044916,-0.033706,-0.00423,-0.002491,-0.044439,-0.006116,-0.003605,-0.029677,-0.007204,0.029819
61881,4,0.087903,0.341569,-0.317367,-0.06577,-0.024906,0.01405,0.007999,-0.090497,-0.081737,...,-0.005367,-0.01461,-0.012741,0.036775,-0.011929,0.024673,-0.004937,-0.025169,-0.004078,-0.014594
61882,5,-0.044932,-0.093634,-0.048604,-0.002066,-0.029681,0.062849,0.017379,0.012296,-0.022797,...,-0.014766,0.003768,-0.009235,-0.012583,0.00836,0.016857,0.007381,0.023707,-0.031871,-0.000624


Add the `target` variable back to the training set.

In [13]:
## Join the class label back onto the reduced training features, matching rows by `id`.
id_and_target = train[['id', 'target']]
train_f2 = id_and_target.merge(train_f, on = 'id')
train_f2.head()

Unnamed: 0,id,target,1,2,3,4,5,6,7,8,...,53,54,55,56,57,58,59,60,61,62
0,1,Class_1,-0.080388,-0.054888,0.000263,0.116711,-0.138912,-0.157098,-0.071782,-0.010453,...,-0.003398,-0.006944,-0.001317,0.02032,0.007817,-0.015948,-0.02623,0.00423,-0.012476,0.000916
1,2,Class_1,-0.042854,-0.085779,-0.044636,-0.031031,-0.04456,0.023347,0.018084,0.011597,...,-0.002228,-0.002333,-0.004145,0.00241,-0.003947,0.001274,0.001632,0.001707,-0.006285,-0.007666
2,3,Class_1,-0.034233,-0.081152,-0.049383,-0.021483,-0.038846,0.03264,0.025734,-0.001056,...,0.000983,0.001529,-0.004728,0.009682,0.002837,0.00086,-0.005345,0.00034,-0.016025,-0.004769
3,4,Class_1,-0.057572,-0.015964,0.012541,0.0385,0.017188,0.060975,0.036503,-0.053254,...,-0.000334,0.034753,0.046969,-0.001832,-0.097993,-0.045172,0.043929,-0.052665,-0.007156,-0.011898
4,5,Class_1,-0.040299,-0.082717,-0.046721,-0.017084,-0.046673,0.033739,0.045862,-0.01143,...,0.011387,0.003588,-0.022563,0.040924,0.003033,-0.021935,-0.006573,-0.002763,-0.010454,0.01116


The target variable is currently a string such as `Class_1`; convert it to the integer class number so it can be used as a label for the model.

In [14]:
def class_label_to_int(label):
    """Return the integer after the underscore in a label such as 'Class_3'."""
    pieces = label.split('_')
    return int(pieces[1])

## NOTE: this cell expects string labels; re-running it after the conversion
## fails because the column then holds integers.
train_f2['target'] = train_f2['target'].map(class_label_to_int)
train_f2.head()

Unnamed: 0,id,target,1,2,3,4,5,6,7,8,...,53,54,55,56,57,58,59,60,61,62
0,1,1,-0.080388,-0.054888,0.000263,0.116711,-0.138912,-0.157098,-0.071782,-0.010453,...,-0.003398,-0.006944,-0.001317,0.02032,0.007817,-0.015948,-0.02623,0.00423,-0.012476,0.000916
1,2,1,-0.042854,-0.085779,-0.044636,-0.031031,-0.04456,0.023347,0.018084,0.011597,...,-0.002228,-0.002333,-0.004145,0.00241,-0.003947,0.001274,0.001632,0.001707,-0.006285,-0.007666
2,3,1,-0.034233,-0.081152,-0.049383,-0.021483,-0.038846,0.03264,0.025734,-0.001056,...,0.000983,0.001529,-0.004728,0.009682,0.002837,0.00086,-0.005345,0.00034,-0.016025,-0.004769
3,4,1,-0.057572,-0.015964,0.012541,0.0385,0.017188,0.060975,0.036503,-0.053254,...,-0.000334,0.034753,0.046969,-0.001832,-0.097993,-0.045172,0.043929,-0.052665,-0.007156,-0.011898
4,5,1,-0.040299,-0.082717,-0.046721,-0.017084,-0.046673,0.033739,0.045862,-0.01143,...,0.011387,0.003588,-0.022563,0.040924,0.003033,-0.021935,-0.006573,-0.002763,-0.010454,0.01116


# 5. Modeling and prediction

In [15]:
## For continuous variable prediction
#train_train, train_test = train_test_split(train_f2, train_size=0.7, test_size=0.3,
#                                           random_state=0)

## For discrete variable prediction
## Hold out 30% of the labelled data for validation, stratified on the class
## label so both parts keep the same class proportions.
## test_size is given explicitly: older scikit-learn releases (before 0.21)
## kept the 0.25 default even when train_size was set, which left part of the
## data unused and raised a FutureWarning.
train_train, train_test = train_test_split(train_f2, train_size=0.7, test_size=0.3,
                                           random_state=0, stratify=train_f2['target'])

In [16]:
## Candidate models. Exactly one `model = ...` line should be active;
## the commented-out lines are alternatives that were tried.

## http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
## http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
## Smaller C => stronger regularization. 10000 and 1000 makes no difference.
#model = linear_model.LinearRegression()
#model = linear_model.LogisticRegression(C = 10000, solver = 'sag', multi_class = 'multinomial', max_iter = 500)

## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
## max_depth controls for regularization; smaller the more regularization
#model = RandomForestClassifier(max_depth=5, random_state=0)
#model = RandomForestClassifier(max_depth = 30, random_state=0)
#model = RandomForestRegressor(max_depth = 5, random_state = 0)

## http://xgboost.readthedocs.io/en/latest/parameter.html
## http://xgboost.readthedocs.io/en/latest/python/python_api.html
## The target has nine classes, so the objective is stated as 'multi:softprob'.
## The sklearn wrapper replaces a 'binary:logistic' objective with
## 'multi:softprob' for multi-class targets anyway (visible in the fitted
## model's repr), so naming it here only makes the configuration honest.
## reg_lambda is the L2 regularization weight; larger means more regularization.
#model = XGBClassifier(max_depth=10, learning_rate=1.0, n_estimators=100,
#                    objective='multi:softprob', subsample=1.0, colsample_bytree=0.6, seed=0)
model = XGBClassifier(max_depth=10, learning_rate=1.0, n_estimators=100,
                      objective='multi:softprob', subsample=1.0, colsample_bytree=0.6,
                      seed=0, reg_lambda=1000)
#model = XGBRegressor(max_depth=10, learning_rate=1.0, n_estimators=100,
#                    objective='reg:linear', subsample=1.0, colsample_bytree=0.6, seed=0, reg_lambda = 9000)

In [17]:
## Columns 0 and 1 of the split frame are `id` and `target`; the features start at column 2.
fit_features = train_train.iloc[:, 2:]
fit_labels = train_train['target']
model.fit(fit_features, fit_labels)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.6,
       gamma=0, learning_rate=1.0, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1000,
       scale_pos_weight=1, seed=0, silent=True, subsample=1.0)

In [18]:
## http://scikit-learn.org/stable/modules/model_evaluation.html

## Evaluate with multi-class log loss on predicted class probabilities.
## Feature columns start at position 2 (after `id` and `target`).
## Training and validation results get their own names so neither overwrites
## the other, and print() with one argument runs on both Python 2 and Python 3.

#pred_train = model.predict(train_train.iloc[:, 2:])
pred_train = model.predict_proba(train_train.iloc[:, 2:])
score_train = log_loss(train_train['target'], pred_train)
print('Score for the training set: ' + str(score_train))

#pred_valid = model.predict(train_test.iloc[:, 2:])
pred_valid = model.predict_proba(train_test.iloc[:, 2:])
score_valid = log_loss(train_test['target'], pred_valid)
print('Score for the validation set: ' + str(score_valid))

Score for the training set: 0.23896514743
Score for the validation set: 0.565067104309


In [19]:
## Predict class probabilities for the test set; column 0 of test_f is `id`,
## so the features are everything from column 1 onwards.
test_features = test_f.iloc[:, 1:]
class_probabilities = model.predict_proba(test_features)

## Column names follow the sample submission file: Class_1 ... Class_9
predcols = ['Class_' + str(class_number) for class_number in range(1, 10)]
x = pd.DataFrame(class_probabilities, columns=predcols)

## Prepend the original test ids; both frames share the same 0..N-1 row index.
x = pd.concat([test['id'], x], axis = 1)
x.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
0,1,0.003744,0.445139,0.277068,0.227117,0.000644,0.00221,0.041726,0.000964,0.001388
1,2,0.001565,0.002464,0.002902,0.000941,0.000227,0.872308,0.003254,0.11378,0.002559
2,3,0.000104,0.000187,0.000377,3.3e-05,5.2e-05,0.99831,0.000486,0.000327,0.000125
3,4,0.003777,0.599721,0.318939,0.061567,0.000924,0.003029,0.003591,0.006382,0.00207
4,5,0.310919,0.00099,0.00082,0.000589,0.000265,0.005746,0.002929,0.161685,0.516056


In [20]:
## Write the submission file without the DataFrame's row index column.
## https://stackoverflow.com/questions/16923281/pandas-writing-dataframe-to-csv-file
submission_path = "submission/submit_1.csv"
x.to_csv(submission_path, index=False)