## Initialize notebook

In [1]:
import numpy as np
import pandas as pd
import datetime

from time import time
import matplotlib.pyplot as plt

from xgboost.sklearn import XGBClassifier
from scipy.sparse import coo_matrix, hstack, csr_matrix, csc_matrix
from scipy.stats import randint as sp_randint
from operator import itemgetter
import random

from sklearn import cluster, mixture, metrics, cross_validation
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

## Load data

In [2]:
# Read the three competition CSV files from the working directory, in the
# same order as before: train, test, sample submission.
df_train, df_test, df_sample_sub = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)
# Cell output: preview of the first training rows.
df_train.head()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000
1,30,7,Friday,60538815980,1,SHOES,8931
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017


## Create feature and label matrices

In [3]:
# Label side: the first TripType seen for each training visit, with visits
# kept in order of first appearance (sort=False).
df_triptype_grouped = (
    df_train[['TripType', 'VisitNumber']]
    .groupby('VisitNumber', sort=False)
    .first()
)
le = LabelEncoder()
y_train = pd.Series(le.fit_transform(df_triptype_grouped['TripType']))

# Number of distinct training visits.
idx_train = len(df_triptype_grouped)

# Feature side: the training rows without the label column, followed by the
# test rows, with every missing value replaced by -1.
df_all = pd.concat(
    [df_train.drop('TripType', axis=1), df_test],
    ignore_index=True
).fillna(value=-1)
df_all_grouped = df_all.groupby('VisitNumber', sort=False)

In [4]:
def create_dict_vect(df, f, v):
  """Count the values of column ``f`` inside each group and vectorize the counts.

  Args:
    df: iterable of ``(key, group)`` pairs (the notebook passes a pandas
      ``GroupBy``); only ``group[f]`` is read from each pair.
    f: name of the column whose values are counted within each group.
    v: object exposing ``fit_transform`` over a list of dicts (the notebook
      passes a ``DictVectorizer``). It is re-fitted on every call.

  Returns:
    Whatever ``v.fit_transform`` returns for the list of ``{value: count}``
    dicts, one dict per group in iteration order.
  """
  f_dict = []
  for g in df:
    counts = {}
    for k in g[1][f]:
      # dict.get instead of dict.has_key: has_key no longer exists in
      # Python 3, while this form behaves the same on Python 2 and 3.
      counts[k] = counts.get(k, 0) + 1
    f_dict.append(counts)
  return v.fit_transform(f_dict)

In [6]:
# Seed the matrix with the first Upc of every visit as one numeric column
# (VisitNumber becomes the group index, so Upc is the only column left).
X_all = coo_matrix(df_all[['VisitNumber', 'Upc']].groupby('VisitNumber', sort=False).first())
v = DictVectorizer()
ohe_feats = ['Weekday', 'ScanCount', 'DepartmentDescription', 'FinelineNumber', 'Upc']

# One block of per-visit value counts per feature; the blocks are collected
# first and stacked in a single hstack call instead of re-stacking the growing
# matrix on every iteration.
feature_blocks = [X_all]
for f in ohe_feats:
  feature_blocks.append(create_dict_vect(df_all_grouped, f, v))
X_all = hstack(feature_blocks)

# Drop columns whose variance is below p*(1-p) with p = 0.9999. For a 0/1
# column that is the variance of a value present in 99.99% (or 0.01%) of the
# rows; the count columns here are not strictly 0/1, so treat it as a
# near-constant-column filter.
p_sel = 0.9999
selector = VarianceThreshold(threshold=p_sel*(1. - p_sel))
X_all = selector.fit_transform(X_all)

# Convert to CSR once (row slicing needs it), then split at the number of
# training visits: the training visits come first in X_all.
X_all_csr = X_all.tocsr()
X_train = X_all_csr[:idx_train, :]
X_test = X_all_csr[idx_train:, :]

# print(...) with a single pre-formatted string gives the same output on
# Python 2 and Python 3.
print('X_train shape: (%i, %i)' % X_train.shape)
print('X_test shape: (%i, %i)' % X_test.shape)

X_train shape: (95674, 17638)
X_test shape: (95674, 17638)


## Hyperparameter search (randomized)

In [50]:
# Utility function to report best scores
def report(grid_scores, n_top=3):
    """Print the ``n_top`` best entries of a search's score list.

    Entries are ranked by their element at index 1, highest first. For each
    ranked entry the mean validation score, the standard deviation of its
    per-fold scores and its parameters are printed, followed by a blank line.

    Args:
        grid_scores: sequence of entries that are indexable (index 1 is the
            sort key) and expose ``mean_validation_score``,
            ``cv_validation_scores`` and ``parameters`` attributes.
        n_top: maximum number of entries to print.
    """
    ranked = sorted(grid_scores, key=lambda entry: entry[1], reverse=True)
    for rank, entry in enumerate(ranked[:n_top], 1):
        fold_std = np.std(entry.cv_validation_scores)
        print("Model with rank: {0}".format(rank))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            entry.mean_validation_score, fold_std))
        print("Parameters: {0}".format(entry.parameters))
        print("")

clf = XGBClassifier(nthread=4, objective='multi:softprob', seed=42)

# Candidate values to sample from: 100..900 trees, depth 3..9, and
# 0.1..0.9 for both the row and the column subsampling ratios.
param_dist = {"n_estimators": np.arange(1, 10) * 100,
              "max_depth": np.arange(3, 10),
              "learning_rate": [0.01, 0.02, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35],
              "subsample": np.arange(1, 10) / 10.,
              "colsample_bytree": np.arange(1, 10) / 10.}

# Run the randomized search. random_state pins which candidates are sampled,
# so a re-run evaluates the same 30 settings; the model and the CV splitter
# used elsewhere in this notebook are seeded with 42 as well.
# NOTE(review): n_jobs=4 combined with nthread=4 may start more threads than
# there are cores -- confirm against the machine this runs on.
n_iter_search = 30
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, scoring='log_loss',
                                   n_jobs=4, random_state=42)

# Time the whole search and print the best candidates.
start = time()
random_search.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

RandomizedSearchCV took 43258.27 seconds for 30 candidates parameter settings.
Model with rank: 1
Mean validation score: -0.738 (std: 0.022)
Parameters: {'n_estimators': 700, 'subsample': 0.90000000000000002, 'learning_rate': 0.05, 'colsample_bytree': 0.5, 'max_depth': 9}

Model with rank: 2
Mean validation score: -0.740 (std: 0.023)
Parameters: {'n_estimators': 700, 'subsample': 0.80000000000000004, 'learning_rate': 0.15, 'colsample_bytree': 0.59999999999999998, 'max_depth': 4}

Model with rank: 3
Mean validation score: -0.745 (std: 0.024)
Parameters: {'n_estimators': 600, 'subsample': 0.90000000000000002, 'learning_rate': 0.1, 'colsample_bytree': 0.5, 'max_depth': 8}



## CV, train model, and predict

In [19]:
def compute_error(clf, X, y, n_iter=20):
  """Return the mean log loss of ``clf`` over repeated stratified 50/50 splits.

  Args:
    clf: estimator handed to ``cross_val_score``.
    X: feature matrix.
    y: labels; also used to stratify the splits.
    n_iter: number of shuffle-split repetitions.

  Returns:
    The negated mean of the ``'log_loss'`` scores across the splits.
  """
  splitter = cross_validation.StratifiedShuffleSplit(
      y, n_iter=n_iter, train_size=0.5, test_size=0.5, random_state=42)
  fold_scores = cross_validation.cross_val_score(
      clf, X, y, cv=splitter, scoring='log_loss')
  # The scores come back negated (see the negative values printed by the
  # search report), so flip the sign to report a positive loss.
  return -fold_scores.mean()

In [56]:
# Configuration of the rank-1 model reported by the randomized search.
clf = XGBClassifier(
    objective='multi:softprob',
    n_estimators=700,
    learning_rate=0.05,
    max_depth=9,
    subsample=0.9,
    colsample_bytree=0.5,
    nthread=4,
    seed=42)

# Calibration did not help the LB (presumably leaderboard) score, so the
# wrapper stays disabled:
# clf = CalibratedClassifierCV(clf, method='isotonic', cv=3)

# Cell output: mean log loss over 3 stratified 50/50 splits.
compute_error(clf, X_train, y_train, n_iter=3)

In [23]:
# Baseline: check how well a random forest does under the same CV protocol.
# random_state is pinned so the baseline figure is reproducible across runs,
# like the seeded XGBoost model and the seeded CV splitter.
clf_rf = RandomForestClassifier(n_estimators=200, n_jobs=4, random_state=42)
compute_error(clf_rf, X_train, y_train, n_iter=3)

1.1467961166451774

In [57]:
# Fit the selected classifier on the full training matrix ...
clf_fit = clf.fit(X_train, y_train)
# ... and get the per-class probabilities for every row of the test matrix.
y_pred_proba = clf_fit.predict_proba(X_test)

## Convert to submission

In [58]:
# Map the encoded class ids back to the original TripType values to name the
# probability columns.
col_names = ['TripType_%i' % val for val in le.inverse_transform(clf_fit.classes_)]
df_pred = pd.DataFrame(y_pred_proba, columns=col_names)

# sort=False keeps the visits in order of first appearance, which is the order
# in which the X_test rows were built (the feature groupby uses sort=False).
# The default sort=True only lines up with the predictions when test.csv is
# already sorted by VisitNumber.
df_test_visits = df_test[['VisitNumber', 'Weekday']].groupby('VisitNumber', sort=False).first()
df_pred.insert(0, 'VisitNumber', df_test_visits.index)
df_pred.head()

Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
0,1,5e-06,4.465438e-06,8.8e-05,1e-05,0.001075,0.005318,0.007061,0.0001754568,2.198787e-06,...,0.000205,0.000404,0.211743,0.023643,0.0001200963,0.004206564,0.00425,0.0016,0.0007226085,0.001359
1,2,1.7e-05,3.848299e-05,0.000837,0.000248,0.024174,0.099505,0.025481,0.001419016,0.0001317934,...,0.004234,0.001293,0.021121,0.257103,0.0003874413,0.002061556,0.011507,0.017898,0.001232277,0.032326
2,3,1e-06,1.036443e-07,8e-06,4e-06,8.5e-05,0.002931,0.000526,6.809674e-07,5.236754e-08,...,0.000482,2e-06,1.2e-05,3.1e-05,1.772846e-06,9.473791e-07,6e-06,2e-06,9.304792e-07,0.995844
3,4,9e-06,3.836252e-06,0.000293,0.000251,0.004038,0.074584,0.891093,2.097186e-05,6.797906e-06,...,0.000219,6.3e-05,0.000366,0.000391,3.079999e-05,3.487444e-05,0.000927,6.4e-05,2.618969e-05,0.013701
4,6,1e-06,7.722124e-08,4e-06,4e-06,0.000103,0.000305,0.003318,7.276828e-07,4.027261e-08,...,4e-06,1e-06,8e-06,9e-06,7.525238e-07,2.101111e-06,1.4e-05,2e-06,1.198439e-06,0.995787


In [59]:
# Write the submission as comma-separated values, without the row index.
df_pred.to_csv('my_submission.csv', index=False, sep=',')