## Initialize notebook

In [142]:
import numpy as np
import pandas as pd
import datetime
from time import time
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.sparse import coo_matrix, hstack, csr_matrix, csc_matrix
from scipy.stats import randint, norm, lognorm, truncnorm
from scipy.stats.mstats import gmean
from operator import itemgetter
import random

from sklearn import cluster, mixture, metrics, cross_validation
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.calibration import CalibratedClassifierCV

## Define custom functions

In [103]:
def correct_age(val):
  if val > 1900:
    val = 2014 - val
  if val < 18 or val > 100:
    val = -1
  return val

def process_df(df):
  age = np.array(df['age'])
  age = np.array(map(correct_age, age))
  df['age'] = age
  df['date_account_created'] = pd.to_datetime(df['date_account_created'])
  df['date_first_booking'] = pd.to_datetime(df['date_first_booking'])
  df['timestamp_first_active'] = pd.to_datetime(df['timestamp_first_active'], format='%Y%m%d%H%M%S')
  return df

def build_feature_matrix(df_train, df_test, df_sessions_feats):
  idx_train = len(df_train)
  df_all = pd.concat([process_df(df_train), process_df(df_test)], ignore_index=True)
  X = df_all
  X = X.drop(['date_account_created','timestamp_first_active','date_first_booking','country_destination'], axis=1)
  # date_account_created
  X['c_year'] = pd.DatetimeIndex(df_all['date_account_created']).year
  X['c_month'] = pd.DatetimeIndex(df_all['date_account_created']).month
  X['c_day'] = pd.DatetimeIndex(df_all['date_account_created']).day
  # timestamp_first_active
  X['a_year'] = pd.DatetimeIndex(df_all['timestamp_first_active']).year
  X['a_month'] = pd.DatetimeIndex(df_all['timestamp_first_active']).month
  X['a_day'] = pd.DatetimeIndex(df_all['timestamp_first_active']).day
  X['a_hour'] = pd.DatetimeIndex(df_all['timestamp_first_active']).hour
  # days difference
  X['days_diff'] = (df_all['date_account_created'] - df_all['timestamp_first_active']).astype('timedelta64[D]')
  # merge with secs_elapsed
  X = X.set_index('id')
  X = pd.merge(X, df_sessions_feats, how='left', left_index=True, right_index=True)
  X = X.fillna(value=-1)

  # One-hot-encoding features
  ohe_feats = ['gender', 'language', 'signup_method', 'signup_flow', 'signup_app', \
               'affiliate_provider', 'affiliate_channel', 'first_affiliate_tracked', \
               'first_device_type', 'first_browser']
  for f in ohe_feats:
    X_dummy = pd.get_dummies(X[f], prefix=f)
    X = X.drop([f], axis=1)
    X = pd.concat((X, X_dummy), axis=1)
  
  X_mat = coo_matrix(X)
  
  p_sel = 0.99
  selector = VarianceThreshold(threshold=p_sel*(1 - p_sel))
  X_mat = selector.fit_transform(X_mat)  

  X_train = X_mat.tocsr()[:idx_train, :]
  X_test = X_mat.tocsr()[idx_train:len(X), :]
  return X_train, X_test

def build_label_matrix(df):
  y = pd.DataFrame(df['id'])
  le = LabelEncoder()
  y['country_destination'] = le.fit_transform(df['country_destination'])
  y = y.set_index('id')
  return y, le

In [66]:
def compute_error(clf, X, y, n_iter=20, scoring='log_loss', subsample_factor=1.):
#   cv = cross_validation.ShuffleSplit(len(y), n_iter=n_iter, train_size=0.5,
#                                      test_size=0.5, random_state=42)
  cv = cross_validation.StratifiedShuffleSplit(y, n_iter=n_iter, train_size=0.8/subsample_factor,
                                               test_size=0.2/subsample_factor, random_state=42)
  return -cross_validation.cross_val_score(clf, X, y, cv=cv, scoring='log_loss').mean()

def dcg_at_k(r, k, method=1):
  r = np.asfarray(r)[:k]
  if r.size:
    if method == 0:
      return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
    elif method == 1:
      return np.sum(r / np.log2(np.arange(2, r.size + 2)))
    else:
      raise ValueError('method must be 0 or 1.')
  return 0.

def ndcg_at_k(r, k=5, method=1):
  dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
  if not dcg_max:
    return 0.
  return dcg_at_k(r, k, method) / dcg_max

def score_predictions(preds, truth, n_modes=5):
  """
  preds: pd.DataFrame
  truth: pd.Series
  """
  assert(len(preds)==len(truth))
  r = pd.DataFrame(0, index=preds.index, columns=preds.columns, dtype=np.float64)
  for col in preds.columns:
    r[col] = (preds[col] == truth) * 1.0

  score = pd.Series(r.apply(ndcg_at_k, axis=1, reduce=True), name='score')
  return np.mean(score)

def compute_ncdg_score(clf, X, y, n_iter=20, subsample_factor=1.):
  score_array = []
  for ii in range(n_iter):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2/subsample_factor,
                                                                         train_size=0.8/subsample_factor,
                                                                         random_state=ii)
    clf_fit = clf.fit(X_train,y_train)
    y_pred_proba = clf_fit.predict_proba(X_test)
    y_pred_proba = np.fliplr(np.argsort(y_pred_proba))
    preds = pd.DataFrame(y_pred_proba[:,:5])
    truth = pd.Series(y_test)
    score_array.append(score_predictions(preds,truth))
  return np.mean(score_array)

def ncdg_score_fun(truth, preds):
  truth = pd.Series(truth)
  preds = pd.DataFrame(preds)
  
  assert(len(preds)==len(truth))
  r = pd.DataFrame(0, index=preds.index, columns=preds.columns, dtype=np.float64)
  for col in preds.columns:
    r[col] = (preds[col] == truth) * 1.0

  return np.mean(pd.Series(r.apply(ndcg_at_k, axis=1, reduce=True), name='score'))

# Utility function to report best scores
def grid_search_report(grid_scores, n_top=3):
  top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
  for i, score in enumerate(top_scores):
    print('Model with rank: {0}'.format(i + 1))
    print('Mean validation score: {0:.3f} (std: {1:.3f})'.format(
          score.mean_validation_score,
          np.std(score.cv_validation_scores)))
    print('Parameters: {0}'.format(score.parameters))
    print('')

## Load data

In [98]:
df_train = pd.read_csv('train_users.csv')
df_test = pd.read_csv('test_users.csv')
df_sessions = pd.read_csv('sessions.csv')
df_train.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [5]:
pd.value_counts(df_train['country_destination'])/len(df_train)

NDF      0.583461
US       0.292225
other    0.047291
FR       0.023531
IT       0.013285
GB       0.010892
ES       0.010536
CA       0.006690
DE       0.004970
NL       0.003574
AU       0.002525
PT       0.001021
dtype: float64

In [99]:
df_sessions_feats = pd.get_dummies(df_sessions['action_type'], prefix='action_type')
df_sessions_feats['id'] = df_sessions['user_id']
df_sessions_feats = df_sessions_feats.groupby('id').sum()
df_sessions_feats['secs_elapsed_sum'] = df_sessions.groupby('user_id').sum()
df_sessions_feats['secs_elapsed_mean'] = df_sessions.groupby('user_id').mean()
df_sessions_feats['secs_elapsed_std'] = df_sessions.groupby('user_id').std()

ohe_feats = ['action', 'action_detail', 'device_type']
for f in ohe_feats:
  tmp = pd.get_dummies(df_sessions[f], prefix=f)
  tmp['id'] = df_sessions['user_id']
  tmp = tmp.groupby('id').sum()
  df_sessions_feats = pd.merge(tmp, df_sessions_feats, how='left', left_index=True, right_index=True)

df_sessions_feats.head()

Unnamed: 0_level_0,device_type_-unknown-,device_type_Android App Unknown Phone/Tablet,device_type_Android Phone,device_type_Blackberry,device_type_Chromebook,device_type_Linux Desktop,device_type_Mac Desktop,device_type_Opera Phone,device_type_Tablet,device_type_Windows Desktop,...,action_type_click,action_type_data,action_type_message_post,action_type_modify,action_type_partner_callback,action_type_submit,action_type_view,secs_elapsed_sum,secs_elapsed_mean,secs_elapsed_std
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00023iyk9l,0,0,0,0,0,0,36,0,0,0,...,4,9,1,0,1,0,21,867896,22253.74359,92242.5619
0010k6l0om,0,0,0,0,0,0,63,0,0,0,...,16,9,0,0,1,0,17,586543,9460.370968,22751.227918
001wyh0pz8,0,90,0,0,0,0,0,0,0,0,...,66,2,0,0,0,3,8,282965,3179.382022,6569.648182
0028jgx1x1,30,0,1,0,0,0,0,0,0,0,...,9,5,0,0,0,1,15,297010,9900.333333,18004.964337
002qnbzfs5,14,0,0,0,0,0,0,0,0,0,...,140,140,16,0,0,15,216,6487080,8232.335025,58110.64617


## Create training and testing matrices

In [104]:
X_train, X_test = build_feature_matrix(df_train, df_test, df_sessions_feats)
df_y_train, le = build_label_matrix(df_train)
y_train = np.ravel(np.array(df_y_train))

print 'X_train shape: (%i, %i)' % (np.shape(X_train)[0], np.shape(X_train)[1])
print 'X_test shape: (%i, %i)' % (np.shape(X_test)[0], np.shape(X_test)[1])

X_train shape: (213466, 598)
X_test shape: (62096, 598)


## Grid search

In [128]:
clf = XGBClassifier(nthread=4, objective='multi:softprob', seed=42)
my_scorer = metrics.make_scorer(ncdg_score_fun, greater_is_better=True)

# specify parameters and distributions to sample from
param_dist = {'n_estimators': randint(25, 500),
              'max_depth': randint(3, 11),
              'learning_rate': truncnorm(0.001, 0.3),
              'subsample': truncnorm(0.01, 1.0),
              'colsample_bytree': truncnorm(0.01, 1.0)}

# run randomized search
n_iter_search = 100
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, 
                                   n_iter=n_iter_search, scoring=my_scorer,
                                   n_jobs=4, random_state=42, cv=3)

subsample_factor = 50
X_train_s, X_test_s, y_train_s, y_test_s = cross_validation.train_test_split(X_train, y_train, 
                                                                             test_size=0.2/subsample_factor,
                                                                             train_size=0.8/subsample_factor, 
                                                                             random_state=42)

start = time()
random_search.fit(X_train_s, y_train_s)
print('RandomizedSearchCV took %.2f seconds for %d candidates'
      ' parameter settings.' % ((time() - start), n_iter_search))
grid_search_report(random_search.grid_scores_)

RandomizedSearchCV took 4409.51 seconds for 100 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.650 (std: 0.010)
Parameters: {'n_estimators': 115, 'subsample': 0.33574029503278985, 'learning_rate': 0.0094177749627677252, 'colsample_bytree': 0.9538601586818044, 'max_depth': 3}

Model with rank: 2
Mean validation score: 0.646 (std: 0.007)
Parameters: {'n_estimators': 70, 'subsample': 0.7892882146912904, 'learning_rate': 0.030520673447458131, 'colsample_bytree': 0.63226230004617046, 'max_depth': 4}

Model with rank: 3
Mean validation score: 0.646 (std: 0.007)
Parameters: {'n_estimators': 156, 'subsample': 0.48777515588825809, 'learning_rate': 0.017215698811119687, 'colsample_bytree': 0.54157310838962647, 'max_depth': 4}



## Train model, compute error, and predict

In [None]:
0.01: 0.796190
0.08: 0.796746
0.06: 0.799226
0.04: 0.803207
0.02: 0.804160
  
0.830044

In [149]:
clf = XGBClassifier(max_depth=8, learning_rate=0.1, n_estimators=100,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=42)
# clf = CalibratedClassifierCV(clf, method='isotonic', cv=5)

# top_scores = sorted(random_search.grid_scores_, key=itemgetter(1), reverse=True)
# clf = XGBClassifier(nthread=4, objective='multi:softprob', seed=42, **top_scores[2].parameters)

# start = time()
# # score = compute_error(clf, X_train, y_train, n_iter=5, scoring='log_loss', subsample_factor=50)
# score = compute_ncdg_score(clf, X_train, y_train, n_iter=5, subsample_factor=20)
# print 'CV took %.2f seconds' % (time() - start)
# print 'NCDG score: %.6f' % score

In [150]:
start = time()

clf_fit = clf.fit(X_train, y_train)
# y_pred = clf_fit.predict(X_test)
y_pred_proba = clf_fit.predict_proba(X_test)
y_pred_proba = np.fliplr(np.argsort(y_pred_proba))

print 'Train/predict took %.2f seconds' % (time() - start)

Train/predict took 21819.34 seconds


## Convert to submission

In [151]:
# convert top 5 probabilities to dataframe
id_test = list(df_test['id'])
id_array = []
country_array = []
for ii in range(len(id_test)):
  idx = id_test[ii]
  id_array += [idx]*5
  country_array += list(le.inverse_transform(y_pred_proba[ii,:5]))
  
df_pred = pd.DataFrame(np.column_stack((id_array, country_array)), columns=['id', 'country'])
df_pred.head()

Unnamed: 0,id,country
0,5uwns89zht,NDF
1,5uwns89zht,US
2,5uwns89zht,other
3,5uwns89zht,FR
4,5uwns89zht,IT


In [152]:
pd.value_counts(df_pred['country'])/len(df_pred)

other    0.200000
FR       0.200000
NDF      0.200000
US       0.200000
IT       0.161550
GB       0.024771
ES       0.011257
PT       0.002351
CA       0.000061
NL       0.000010
dtype: float64

In [153]:
df_pred.to_csv('my_submission.csv', sep=',', index=False)