In [1]:
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import ml_metrics as metrics

# Load datasets

In [115]:
# 24 raw features in train set, and 22 raw features in test set(is_booking, cnt, hotel clusters)
#'user_location_country' : np.str_, # The ID of the country the customer is located
#'user_location_region' : np.str_, # The ID of the region the customer is located
#'user_location_city' : np.str_, # The ID of the city the customer is located
#'orig_destination_distance':np.float64, # Physical distance between a hotel and a customer at the time of search. A null means the distance could not be calculated
#'channel' : np.str_, #	ID of a marketing channel
#'srch_ci' : np.str_, # Checkin date
#'srch_co' : np.str_, # Checkout date
#'srch_adults_cnt' : np.int32, # The number of adults specified in the hotel room
#'srch_children_cnt' : np.int32, # The number of (extra occupancy) children specified in the hotel room
#'srch_rm_cnt' : np.int32, # The number of hotel rooms specified in the search
#'srch_destination_id' : np.str_, # ID of the destination where the hotel search was performed
#'srch_destination_type_id' : np.str_, # Type of destination
#'hotel_continent' : np.str_, # Hotel continent
#'hotel_country' : np.str_, # Hotel country
#'hotel_market' : np.str_, # Hotel market
#'is_booking': bool, # 1 if a booking, 0 if a click not included in the test set
#'hotel_cluster' : np.str_, not included in test set
#'date_time': np.str_, # ID of a hotel cluster
# site_name : ID of the Expedia point of sale (i.e. Expedia.com, Expedia.co.uk, Expedia.co.jp, ...) : int
# posa_continent : ID of continent associated with site_name : int
# user_id : ID of users : int
# is_mobile: 1 when a user connected from a mobile device, 0 otherwise : tinyint
# is_package : 1 if the click/booking was generated as a part of a package (i.e. combined with a flight), 0 otherwise : int
# cnt : Numer of similar events in the context of the same user session : bigint not included in test set
# id : only shown in test set

In [116]:
train = pd.read_csv("./train.csv")

In [117]:
# remove cnt not shown in test set
train.drop(['cnt'], inplace=True, axis=1)

In [119]:
train.shape

(37670293, 23)

In [120]:
test = pd.read_csv("./test.csv")

In [121]:
# remove id in test set
test.drop(['id'], inplace=True, axis=1)

In [122]:
test.shape

(2528243, 21)

In [320]:
train["hotel_cluster"].value_counts()

91    1043720
41     772743
48     754033
64     704734
65     670960
5      620194
98     589178
59     570291
42     551605
21     550092
70     545572
18     545284
83     534132
46     534038
25     530591
62     518809
95     509266
28     507016
68     503797
82     503755
37     496061
50     489892
30     489287
9      488328
58     483253
97     479446
16     477868
72     457463
1      452694
99     444887
       ...   
19     282893
84     278264
66     273505
38     269246
87     260398
23     259233
12     259022
31     257587
67     255946
43     253578
7      252447
54     250745
92     244343
89     243560
45     241408
49     240124
3      225250
80     220218
60     217919
71     216054
93     214293
86     209054
14     192299
75     165226
24     164127
35     139122
53     134812
88     107784
27     105040
74      48355
Name: hotel_cluster, dtype: int64

In [321]:
# They are the same users in both train and test datasets
test_ids = set(test.user_id.unique())
train_ids = set(train.user_id.unique())
intersection_count = len(test_ids & train_ids)
intersection_count == len(test_ids)

True

In [3]:
# load destination
destination = pd.read_csv("./destinations.csv")

In [323]:
destination.shape

(62106, 150)

In [324]:
destination.head()

Unnamed: 0,srch_destination_id,d1,d2,d3,d4,d5,d6,d7,d8,d9,...,d140,d141,d142,d143,d144,d145,d146,d147,d148,d149
0,0,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-1.897627,-2.198657,-2.198657,-1.897627,...,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657
1,1,-2.18169,-2.18169,-2.18169,-2.082564,-2.18169,-2.165028,-2.18169,-2.18169,-2.031597,...,-2.165028,-2.18169,-2.165028,-2.18169,-2.18169,-2.165028,-2.18169,-2.18169,-2.18169,-2.18169
2,2,-2.18349,-2.224164,-2.224164,-2.189562,-2.105819,-2.075407,-2.224164,-2.118483,-2.140393,...,-2.224164,-2.224164,-2.196379,-2.224164,-2.192009,-2.224164,-2.224164,-2.224164,-2.224164,-2.057548
3,3,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.115485,-2.177409,-2.177409,-2.177409,...,-2.161081,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409
4,4,-2.189562,-2.187783,-2.194008,-2.171153,-2.152303,-2.056618,-2.194008,-2.194008,-2.145911,...,-2.187356,-2.194008,-2.191779,-2.194008,-2.194008,-2.185161,-2.194008,-2.194008,-2.194008,-2.188037


# Sampling

In [123]:
import random
# sample by unique id
unique_users = train.user_id.unique()
sel_user_ids = [unique_users[i] for i in sorted(random.sample(range(len(unique_users)), 10000)) ]
sel_train = train[train.user_id.isin(sel_user_ids)]

In [125]:
# add year and month for each row
sel_train["date_time"] = pd.to_datetime(sel_train["date_time"])
sel_train["year"] = sel_train["date_time"].dt.year
sel_train["month"] = sel_train["date_time"].dt.month

In [126]:
# choose rows after July, 2014
t1 = sel_train[((sel_train.year == 2013) | ((sel_train.year == 2014) & (sel_train.month < 8)))]
t2 = sel_train[((sel_train.year == 2014) & (sel_train.month >= 8))]

In [127]:
# remove click events
t2 = t2[t2.is_booking == True]

In [330]:
# most frequent clusters
most_common_clusters = list(train.hotel_cluster.value_counts().head().index)

In [331]:
# apply most frequent clusters as the predictions
predictions = [most_common_clusters for i in range(t2.shape[0])]

In [332]:
# evaluate the performance
target = [[l] for l in t2["hotel_cluster"]]
metrics.mapk(target, predictions, k=5)

0.059464461994076998

In [333]:
# find correlation in training data
train.corr()["hotel_cluster"]

site_name                   -0.022408
posa_continent               0.014938
user_location_country       -0.010477
user_location_region         0.007453
user_location_city           0.000831
orig_destination_distance    0.007260
user_id                      0.001052
is_mobile                    0.008412
is_package                   0.038733
channel                      0.000707
srch_adults_cnt              0.012309
srch_children_cnt            0.016261
srch_rm_cnt                 -0.005954
srch_destination_id         -0.011712
srch_destination_type_id    -0.032850
is_booking                  -0.021548
cnt                          0.002944
hotel_continent             -0.013963
hotel_country               -0.024289
hotel_market                 0.034205
hotel_cluster                1.000000
Name: hotel_cluster, dtype: float64

In [134]:
# no columns correlate linearly with hotel_cluster. This makes sense, because there is no linear ordering to hotel_cluster. 
# this means that techniques like linear regression and logistic regression won’t work well on our data, 
# because they rely on linear correlations between predictors and targets.

In [128]:
# include the destination description
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
dest_desc = pca.fit_transform(destination[["d{0}".format(i) for i in range(1, 150)]])
dest_desc = pd.DataFrame(dest_desc)
dest_desc["srch_destination_id"] = destination["srch_destination_id"]

In [335]:
dest_desc.shape

(62106, 4)

# Feature Engineering

In [129]:
def feature_process(df):
    df["date_time"] = pd.to_datetime(df["date_time"])
    df["srch_ci"] = pd.to_datetime(df["srch_ci"], format='%Y-%m-%d', errors="coerce")
    df["srch_co"] = pd.to_datetime(df["srch_co"], format='%Y-%m-%d', errors="coerce")
    
    df_out = {}
    for seq in ["month", "day", "hour", "minute", "dayofweek", "quarter"]:
        df_out[seq] = getattr(df["date_time"].dt, seq)
    
    others = [p for p in df.columns if p not in ["date_time", "srch_ci", "srch_co"]]
    for seq in others:
        df_out[seq] = df[seq]
    
    for seq in ["month", "day", "dayofweek", "quarter"]:
        df_out["ci_{0}".format(seq)] = getattr(df["srch_ci"].dt, seq)
        df_out["co_{0}".format(seq)] = getattr(df["srch_co"].dt, seq)
        
    df_out["stay_span"] = (df["srch_co"] - df["srch_ci"]).astype('timedelta64[h]')
        
    df_out = pd.DataFrame(df_out)
    
    df_out = df_out.join(dest_desc, on="srch_destination_id", how='left', rsuffix="dest")
    df_out = df_out.drop("srch_destination_iddest", axis=1)
    return df_out

In [130]:
df = feature_process(t1)
df.fillna(-1, inplace=True)

In [132]:
df.shape

(196412, 39)

# Modeling

In [133]:
# function to return 5 prediction clusters
def predict_clusters(model, predictors):
    # save the cluster index in the model for later ranking
    dict_clusters = {}
    for (k,v) in enumerate(model.classes_):
        dict_clusters[k] = v
    
    # validate the modeling
    vali_predictions = model.predict_proba(predictors, cv)
    # take largest 5 probablities' index
    clusters_index = vali_predictions.argsort(axis=1)[:, -5:]
    
    # get cluster name by 5 index
    clusters = []
    for index in clusters_index.flatten():
        clusters.append(dict_clusters.get(index))
    
    # get the cluster predictions
    cluster_predictions = np.array(clusters).reshape(clusters_index.shape)
    
    return cluster_predictions

In [139]:
# predictors used for training
predictors = [c for c in df.columns if c not in ['hotel_cluster']]

In [245]:
import sklearn.cross_validation
# split the train and test sets
train_predictors, vali_predictors, train_labels, vali_labels = train_test_split(df[predictors], df['hotel_cluster'], test_size = 0.3)

## Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [247]:
rf_clf = RandomForestClassifier(n_estimators=10, min_weight_fraction_leaf=0.1)

In [None]:
rf = rf_clf.fit(train_predictors, train_labels.astype(str))

In [None]:
pred = predict_clusters(rf, vali_predictors)

In [None]:
print("score:", metrics.mapk(vali_labels.astype(str), pred, k=5))

In [33]:
# classifiers tend to have lower accuracy when there is a high cluster count. 
# Instead try training 100 binary classifiers by one versus all

In [None]:
from sklearn.multiclass import OneVsRestClassifier

In [None]:
ovr = OneVsRestClassifier(rf_clf).fit(train_predictors, train_labels.astype(str))

In [35]:
pred = predict_clusters(ovr, vali_predictors)

In [36]:
print("score:", metrics.mapk(vali_labels.astype(str), pred, k=5))

('score:', 0.071582962465710304)


## ExtraTree

In [218]:
from sklearn.ensemble import ExtraTreesClassifier

In [219]:
et_clf = ExtraTreesClassifier(n_estimators=10)

In [220]:
et = et_clf.fit(train_predictors, train_labels.astype(str))

In [221]:
pred = predict_clusters(et, vali_predictors)

In [222]:
print("score:", metrics.mapk(vali_labels.astype(str), pred, k=5))

('score:', 0.093532771027085737)


## KNN

In [193]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn_clf = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto')

In [194]:
knn = knn_clf.fit(train_predictors, train_labels.astype(str))

In [195]:
pred = predict_clusters(knn, vali_predictors)

In [196]:
print("score:", metrics.mapk(vali_labels.astype(str), pred, k=5))

('score:', 0.1025143970764601)


## Naive Bayes

In [197]:
from sklearn.naive_bayes import GaussianNB

In [None]:
gnb_clf = GaussianNB()

In [None]:
gnb = gnb_clf.fit(train_predictors, train_labels.astype(str))

In [198]:
pred = predict_clusters(gnb, vali_predictors)

In [199]:
print("score:", metrics.mapk(vali_labels.astype(str), pred, k=5))

('score:', 0.064674693956056392)


## SVM

In [18]:
from sklearn.svm import SVC

In [None]:
svm_clf = SVC(probability=True)

In [None]:
svm = svm_clf.fit(train_predictors[:10000], train_labels[:10000].astype(str))

In [20]:
pred = predict_clusters(svm, vali_predictors)

In [22]:
print("score:", metrics.mapk(vali_labels.astype(str), pred, k=5))

('score:', 0.073794892058659065)


## Logistic Regression

In [212]:
from sklearn.linear_model import LogisticRegression

In [214]:
lg_clf = LogisticRegression()

In [215]:
lg = lg_clf.fit(train_predictors, train_labels.astype(str))

In [216]:
pred = predict_clusters(lg, vali_predictors)

In [217]:
print("score:", metrics.mapk(vali_labels.astype(str), pred, k=5))

('score:', 0.05661699816713054)


## Ensemble methods

In [140]:
from sklearn.ensemble import AdaBoostClassifier

In [241]:
estimator = gnb_clf

In [242]:
# ensemble methods
from sklearn.ensemble import AdaBoostClassifier
adb_clf = AdaBoostClassifier(base_estimator=estimator, n_estimators=20)

In [243]:
adb = adb_clf.fit(train_predictors, train_labels.astype(str))

In [38]:
pred = predict_clusters(adb, vali_predictors)

In [211]:
print("score:", metrics.mapk(vali_labels.astype(str), pred, k=5))

('score:', 0.064674693956056392)


## Bagging

In [236]:
# set the estimator
estimator = knn_clf

In [237]:
from sklearn.ensemble import BaggingClassifier
bagclf = BaggingClassifier(base_estimator=estimator, n_estimators=10)

In [238]:
bag = bagclf.fit(train_predictors, train_labels.astype(str))

In [239]:
predictions = predict_clusters(bag, vali_predictors)

In [240]:
print("score:", metrics.mapk(vali_labels.astype(str), pred, k=5))

('score:', 0.093532771027085737)


# Test Modeling

In [52]:
train2 = train.copy()

In [53]:
train2["date_time"] = pd.to_datetime(train2["date_time"])
train2["year"] = train2["date_time"].dt.year
train2["month"] = train2["date_time"].dt.month

In [54]:
train2 = train2[train2.is_booking == True]

In [55]:
df2 = feature_process(train2)
df2.fillna(-1, inplace=True)

In [82]:
df2.drop(['is_booking', 'cnt'], inplace=True, axis=1)

In [96]:
df2.columns

Index([                  u'channel',                    u'ci_day',
                    u'ci_dayofweek',                  u'ci_month',
                      u'ci_quarter',                    u'co_day',
                    u'co_dayofweek',                  u'co_month',
                      u'co_quarter',                       u'day',
                       u'dayofweek',             u'hotel_cluster',
                 u'hotel_continent',             u'hotel_country',
                    u'hotel_market',                      u'hour',
                       u'is_mobile',                u'is_package',
                          u'minute',                     u'month',
       u'orig_destination_distance',            u'posa_continent',
                         u'quarter',                 u'site_name',
                 u'srch_adults_cnt',         u'srch_children_cnt',
             u'srch_destination_id',  u'srch_destination_type_id',
                     u'srch_rm_cnt',                 u'stay_sp

In [97]:
knn_clf = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto')

In [99]:
predictors = [c for c in df2.columns if c not in ['user_id', 'hotel_cluster']]

In [100]:
neighbors = knn_clf.fit(df2[predictors], df2['hotel_cluster'].astype(str))

In [114]:
print("score:", metrics.mapk(df2['hotel_cluster'].astype(str), predictions, k=5))

('score:', 0.036996865675754539)


In [61]:
test["date_time"] = pd.to_datetime(test["date_time"])
test["year"] = test["date_time"].dt.year
test["month"] = test["date_time"].dt.month

In [62]:
test2 = feature_process(test)
test2.fillna(-1, inplace=True)

In [87]:
test2.drop(['id'], inplace=True, axis=1)

In [88]:
test2.shape

(2528243, 37)

In [101]:
predictions = predict_clusters(neighbors, test2[predictors])

In [102]:
predictions.shape

(2528243, 5)

# Output test results

In [105]:
predictions_output = map(lambda x: ' '.join(map(str,x)), predictions)

In [106]:
predictions_output[0:10]

['98 43 36 11 99',
 '42 62 18 72 2',
 '40 90 26 0 77',
 '40 99 88 79 1',
 '99 19 18 41 77',
 '19 48 28 77 32',
 '99 18 70 95 72',
 '41 9 55 98 37',
 '99 45 49 1 24',
 '40 7 34 3 32']

In [107]:
# load test data
submission = pd.read_csv('sample_submission.csv', sep = ',')

In [109]:
submission.shape

(2528243, 2)

In [110]:
submission['hotel_cluster'] = predictions_output

In [111]:
submission[['id', 'hotel_cluster']].to_csv('out.csv', index = False)

In [112]:
submission[:10]

Unnamed: 0,id,hotel_cluster
0,0,98 43 36 11 99
1,1,42 62 18 72 2
2,2,40 90 26 0 77
3,3,40 99 88 79 1
4,4,99 19 18 41 77
5,5,19 48 28 77 32
6,6,99 18 70 95 72
7,7,41 9 55 98 37
8,8,99 45 49 1 24
9,9,40 7 34 3 32
