# Otto Group Product Classification Challenge

This project is based on the Kaggle competition of the same name.
Link: https://www.kaggle.com/c/otto-group-product-classification-challenge

The project uses stacking, a popular ensembling technique.
The first-layer models in the stack are: Support Vector Classifier, Random Forest, Extra Trees, Neural Network, AdaBoost, Gradient Boosting and Logistic Regression.
The second-layer model is XGBoost.

The submission achieved a score of 0.43921 (multi-class log loss, the competition metric; lower is better).

### Performing necessary imports

In [1]:
import xgboost as xgb

In [39]:
import pandas as pd
import os
import numpy as np

In [3]:
import matplotlib.pyplot as plt 

In [4]:
from sklearn import preprocessing 

In [32]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

### Reading the data 

In [5]:
# Load the three competition files (paths are relative to the working directory),
# in the same order as the names they are bound to.
train, test, sample = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sampleSubmission.csv')
)

In [6]:
# Preview the first rows of the training frame (id, feat_* columns and the target).
train.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [7]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the training frame.
train.describe()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_84,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93
count,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,...,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0
mean,30939.5,0.38668,0.263066,0.901467,0.779081,0.071043,0.025696,0.193704,0.662433,1.011296,...,0.070752,0.532306,1.128576,0.393549,0.874915,0.457772,0.812421,0.264941,0.380119,0.126135
std,17862.784315,1.52533,1.252073,2.934818,2.788005,0.438902,0.215333,1.030102,2.25577,3.474822,...,1.15146,1.900438,2.681554,1.575455,2.115466,1.527385,4.597804,2.045646,0.982385,1.20172
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,15470.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,30939.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,46408.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,61878.0,61.0,51.0,64.0,70.0,19.0,10.0,38.0,76.0,43.0,...,76.0,55.0,65.0,67.0,30.0,61.0,130.0,52.0,19.0,87.0


In [8]:
# Turn the class names in `target` into integer codes for the classifiers.
labels = preprocessing.LabelEncoder().fit_transform(train["target"].values)

# Keep only the feature columns: `id` is dropped from both frames and `target` from train.
train = train.drop(["id", "target"], axis="columns")
test = test.drop("id", axis="columns")

### Defining the parameters of models

In [103]:
# Keyword arguments for the models, unpacked with ** where the estimators are built.

# XGBoost (second layer): per-class probabilities over 9 classes, evaluated with mlogloss.
xgb_params = {"objective": "multi:softprob", "eval_metric": "mlogloss", "num_class": 9}

# probability=True makes SVC expose predict_proba, which the stacking function calls.
svm_params = {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf', 'probability': True}

# Shared by the Random Forest and Extra Trees classifiers.
tree_params = {'n_estimators': 100, 'n_jobs': -1, 'criterion': 'gini'}

# One hidden layer of 3 units. The trailing comma matters: `(3)` is just the int 3,
# whereas `(3,)` is the one-element tuple that hidden_layer_sizes is documented to take.
nn_params = {'solver': 'lbfgs', 'alpha': 5e-4, 'hidden_layer_sizes': (3,), 'random_state': 1, 'max_iter': 500}

# AdaBoost and Gradient Boosting are built with library defaults; the candidate
# settings below are kept for reference only and are not used.
#ada_params = {'n_estimators': 500, 'learning_rate' : 0.75}
#gb_params = {'n_estimators': 500,'max_depth': 5, 'min_samples_leaf': 2}

In [58]:
# Global settings shared by the stacking cells below.
ntrain = train.shape[0]  # number of training rows
ntest = test.shape[0]  # number of test rows
seed = 0  # for reproducibility
nfolds = 10  # set folds for out-of-fold prediction
nclasses = 9

# shuffle=True is required for random_state to have any effect: without it the seed
# is ignored, and newer scikit-learn releases raise a ValueError for that combination.
# The (train_index, test_index) pairs are materialised once so that every first-layer
# model is trained and evaluated on exactly the same folds.
skf = list(
    StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=seed).split(train, labels)
)

### Stacking function

In [66]:
#Code inspired from https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python
#Stacking Function for multiclass classification
def get_oof(clf, x_train, y_train, x_test, folds=None, n_classes=None):
    """Build stacking features (class probabilities) from one first-layer model.

    For every fold the classifier is refitted on the fold's training rows and
    used to predict (a) the held-out rows, which fill the out-of-fold training
    matrix, and (b) the whole test set. The test-set predictions of all folds
    are averaged, so the returned test features reflect every fold's model
    rather than only the one fitted last.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
    x_train : 2-D NumPy array of training features (indexed with integer arrays).
    y_train : 1-D NumPy array of training labels aligned with ``x_train``.
    x_test : 2-D NumPy array of test features.
    folds : iterable of ``(train_index, test_index)`` pairs, optional.
        Defaults to the notebook-level ``skf`` list.
    n_classes : int, optional
        Number of probability columns. Defaults to the notebook-level ``nclasses``.

    Returns
    -------
    (oof_train, oof_test) : tuple of arrays with shapes
        ``(len(x_train), n_classes)`` and ``(len(x_test), n_classes)``.

    Raises
    ------
    ValueError
        If ``folds`` is empty (there would be nothing to fit or average).
    """
    # Fall back to the notebook globals so existing four-argument calls keep working.
    if folds is None:
        folds = skf
    if n_classes is None:
        n_classes = nclasses

    folds = list(folds)
    if not folds:
        raise ValueError("get_oof needs at least one (train_index, test_index) fold")

    # Row counts come from the inputs themselves rather than from global counters.
    oof_train = np.zeros((x_train.shape[0], n_classes))
    oof_test = np.zeros((x_test.shape[0], n_classes))

    for train_index, test_index in folds:
        clf.fit(x_train[train_index], y_train[train_index])

        # Held-out rows are predicted by a model that never saw them.
        oof_train[test_index, :] = clf.predict_proba(x_train[test_index])
        # Accumulate this fold's test predictions; averaged after the loop.
        oof_test += clf.predict_proba(x_test)

    oof_test /= len(folds)
    return oof_train, oof_test

In [91]:
# First-layer models. random_state=seed is passed to the estimators that have
# stochastic components, so a rerun produces the same stacking features.
svm = SVC(random_state=seed, **svm_params)
rf = RandomForestClassifier(random_state=seed, **tree_params)
xt = ExtraTreesClassifier(random_state=seed, **tree_params)
nn = MLPClassifier(**nn_params)  # nn_params already carries its own random_state
ada = AdaBoostClassifier(random_state=seed)
gb = GradientBoostingClassifier(random_state=seed)
lr = LogisticRegression()

# Second-layer model, tuned later by the grid search.
gbm = xgb.XGBClassifier(**xgb_params)

In [64]:
# Replace both DataFrames with their underlying NumPy arrays: the stacking function
# selects rows with integer index arrays (x_train[train_index]).
train, test = train.values, test.values

### Training the first layer models in stacking

In [None]:
# Support Vector Classifier: stacking features for the train and test sets.
svm_oof_train, svm_oof_test = get_oof(clf=svm, x_train=train, y_train=labels, x_test=test)

In [None]:
# Random Forest: stacking features for the train and test sets.
rf_oof_train, rf_oof_test = get_oof(clf=rf, x_train=train, y_train=labels, x_test=test)

In [101]:
# Extra Trees: stacking features for the train and test sets.
# Note the mixed prefixes (xt_ for train, et_ for test); the later concatenation
# cell reads exactly these two names.
xt_oof_train, et_oof_test = get_oof(clf=xt, x_train=train, y_train=labels, x_test=test)

In [85]:
# Neural Network (MLP): stacking features for the train and test sets.
nn_oof_train, nn_oof_test = get_oof(clf=nn, x_train=train, y_train=labels, x_test=test)

In [94]:
# AdaBoost: stacking features for the train and test sets.
ada_oof_train, ada_oof_test = get_oof(clf=ada, x_train=train, y_train=labels, x_test=test)

In [102]:
# Gradient Boosting: stacking features for the train and test sets.
gb_oof_train, gb_oof_test = get_oof(clf=gb, x_train=train, y_train=labels, x_test=test)

In [96]:
# Logistic Regression: stacking features for the train and test sets.
# The test-set result must be bound to `lr_oof_test`, the name the later
# np.concatenate cell reads; the previous `lr_oof_tes` typo left it undefined.
lr_oof_train, lr_oof_test = get_oof(lr, train, labels, test)

In [104]:
# Second-layer inputs: the per-model probability blocks placed side by side (axis=1).
# Both matrices list the models in the same order so their columns line up.
x_train = np.concatenate(
    (
        lr_oof_train,
        svm_oof_train,
        rf_oof_train,
        xt_oof_train,
        nn_oof_train,
        ada_oof_train,
        gb_oof_train,
    ),
    axis=1,
)
x_test = np.concatenate(
    (
        lr_oof_test,
        svm_oof_test,
        rf_oof_test,
        et_oof_test,
        nn_oof_test,
        ada_oof_test,
        gb_oof_test,
    ),
    axis=1,
)

### Performing grid search for the second-layer model, i.e. XGBoost

In [121]:
from sklearn.model_selection import GridSearchCV
# Search space for the second-layer XGBoost model: 3 x 3 x 2 = 18 candidates.
# n_estimators, learning_rate and subsample are deliberately left out of the search;
# their candidate values are kept below for reference.
param_grid = dict(
    # n_estimators=[100, 250, 500],
    # learning_rate=[0.05, 0.1, 0.3],
    max_depth=[6, 9, 12],
    min_child_weight=[3, 6, 9],
    # subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 1.0],
)

In [122]:
# Exhaustive search over param_grid, ranked by negative log loss (higher is better).
grid = GridSearchCV(
    estimator=gbm,
    param_grid=param_grid,
    scoring='neg_log_loss',
    verbose=2,
)

In [123]:
# Run the search on the stacked first-layer features.
# This is the slow step: the logged run below reports about 538 minutes for 54 fits.
grid.fit(X=x_train, y=labels)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] colsample_bytree=0.8, max_depth=6, min_child_weight=3 ...........
[CV]  colsample_bytree=0.8, max_depth=6, min_child_weight=3, total= 7.1min
[CV] colsample_bytree=0.8, max_depth=6, min_child_weight=3 ...........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  7.2min remaining:    0.0s


[CV]  colsample_bytree=0.8, max_depth=6, min_child_weight=3, total= 6.9min
[CV] colsample_bytree=0.8, max_depth=6, min_child_weight=3 ...........
[CV]  colsample_bytree=0.8, max_depth=6, min_child_weight=3, total= 6.9min
[CV] colsample_bytree=0.8, max_depth=6, min_child_weight=6 ...........
[CV]  colsample_bytree=0.8, max_depth=6, min_child_weight=6, total= 6.7min
[CV] colsample_bytree=0.8, max_depth=6, min_child_weight=6 ...........
[CV]  colsample_bytree=0.8, max_depth=6, min_child_weight=6, total= 6.8min
[CV] colsample_bytree=0.8, max_depth=6, min_child_weight=6 ...........
[CV]  colsample_bytree=0.8, max_depth=6, min_child_weight=6, total= 6.8min
[CV] colsample_bytree=0.8, max_depth=6, min_child_weight=9 ...........
[CV]  colsample_bytree=0.8, max_depth=6, min_child_weight=9, total= 6.6min
[CV] colsample_bytree=0.8, max_depth=6, min_child_weight=9 ...........
[CV]  colsample_bytree=0.8, max_depth=6, min_child_weight=9, total= 6.6min
[CV] colsample_bytree=0.8, max_depth=6, min_child

[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed: 537.9min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, eval_metric='mlogloss', gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=None, num_class=9, objective='multi:softprob',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [6, 9, 12], 'min_child_weight': [3, 6, 9], 'colsample_bytree': [0.8, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_log_loss', verbose=2)

In [124]:
# Parameter combination that scored best in the grid search.
grid.best_params_

{'colsample_bytree': 0.8, 'max_depth': 6, 'min_child_weight': 6}

### Predictions on test data

In [125]:
# Class probabilities for the stacked test features from the fitted grid search.
predictions = grid.predict_proba(X=x_test)

In [126]:
# Shape the predictions like the sample submission: rows are indexed by the sample's
# ids and columns are named after every sample column except the first.
submit = pd.DataFrame(
    predictions,
    index=sample["id"].values,
    columns=sample.columns[1:],
)
# The index is written out as the leading `id` column.
submit.to_csv(path_or_buf='submission.csv', index_label='id')