In [1]:
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn.cross_validation
import ml_metrics as metrics

# Load datasets

In [115]:
# 24 raw features in train set, and 22 raw features in test set(is_booking, cnt, hotel clusters)
#'user_location_country' : np.str_, # The ID of the country the customer is located
#'user_location_region' : np.str_, # The ID of the region the customer is located
#'user_location_city' : np.str_, # The ID of the city the customer is located
#'orig_destination_distance':np.float64, # Physical distance between a hotel and a customer at the time of search. A null means the distance could not be calculated
#'channel' : np.str_, #	ID of a marketing channel
#'srch_ci' : np.str_, # Checkin date
#'srch_co' : np.str_, # Checkout date
#'srch_adults_cnt' : np.int32, # The number of adults specified in the hotel room
#'srch_children_cnt' : np.int32, # The number of (extra occupancy) children specified in the hotel room
#'srch_rm_cnt' : np.int32, # The number of hotel rooms specified in the search
#'srch_destination_id' : np.str_, # ID of the destination where the hotel search was performed
#'srch_destination_type_id' : np.str_, # Type of destination
#'hotel_continent' : np.str_, # Hotel continent
#'hotel_country' : np.str_, # Hotel country
#'hotel_market' : np.str_, # Hotel market
#'is_booking': bool, # 1 if a booking, 0 if a click not included in the test set
#'hotel_cluster' : np.str_, not included in test set
#'date_time': np.str_, # ID of a hotel cluster
# site_name : ID of the Expedia point of sale (i.e. Expedia.com, Expedia.co.uk, Expedia.co.jp, ...) : int
# posa_continent : ID of continent associated with site_name : int
# user_id : ID of users : int
# is_mobile: 1 when a user connected from a mobile device, 0 otherwise : tinyint
# is_package : 1 if the click/booking was generated as a part of a package (i.e. combined with a flight), 0 otherwise : int
# cnt : Numer of similar events in the context of the same user session : bigint not included in test set
# id : only shown in test set

In [9]:
train = pd.read_csv("./train.csv", nrows=10)

In [10]:
train.columns

Index([u'date_time', u'site_name', u'posa_continent', u'user_location_country',
       u'user_location_region', u'user_location_city',
       u'orig_destination_distance', u'user_id', u'is_mobile', u'is_package',
       u'channel', u'srch_ci', u'srch_co', u'srch_adults_cnt',
       u'srch_children_cnt', u'srch_rm_cnt', u'srch_destination_id',
       u'srch_destination_type_id', u'is_booking', u'cnt', u'hotel_continent',
       u'hotel_country', u'hotel_market', u'hotel_cluster'],
      dtype='object')

In [3]:
# remove cnt not shown in test set
train.drop(['cnt'], inplace=True, axis=1)

In [119]:
train.shape

(37670293, 23)

In [6]:
test = pd.read_csv("./test.csv")

In [7]:
test.columns

Index([u'id', u'date_time', u'site_name', u'posa_continent',
       u'user_location_country', u'user_location_region',
       u'user_location_city', u'orig_destination_distance', u'user_id',
       u'is_mobile', u'is_package', u'channel', u'srch_ci', u'srch_co',
       u'srch_adults_cnt', u'srch_children_cnt', u'srch_rm_cnt',
       u'srch_destination_id', u'srch_destination_type_id', u'hotel_continent',
       u'hotel_country', u'hotel_market'],
      dtype='object')

In [5]:
# remove id in test set
test.drop(['id'], inplace=True, axis=1)

In [122]:
test.shape

(2528243, 21)

In [6]:
# load destination
destination = pd.read_csv("./destinations.csv")

# Sampling

In [189]:
# add year and month for each row
train["date_time"] = pd.to_datetime(train["date_time"])
train["year"] = train["date_time"].dt.year
train["month"] = train["date_time"].dt.month

In [190]:
train_2014 = train[train['year'] == 2014]

In [191]:
train_2014 = train_2014[train_2014.is_booking == True]

In [224]:
sample_index = np.random.choice(train_2014.index.tolist(), 100000, replace=False)

In [234]:
train_2014_sam = train_2014.loc[sample_index]

In [12]:
# include the destination description
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
dest_desc = pca.fit_transform(destination[["d{0}".format(i) for i in range(1, 150)]])
dest_desc = pd.DataFrame(dest_desc)
dest_desc["srch_destination_id"] = destination["srch_destination_id"]

In [335]:
dest_desc.shape

(62106, 4)

# Feature Engineering

In [13]:
def feature_process(df):
    df["date_time"] = pd.to_datetime(df["date_time"])
    df["srch_ci"] = pd.to_datetime(df["srch_ci"], format='%Y-%m-%d', errors="coerce")
    df["srch_co"] = pd.to_datetime(df["srch_co"], format='%Y-%m-%d', errors="coerce")
    
    df_out = {}
    for seq in ["month", "day", "hour", "minute", "dayofweek", "quarter"]:
        df_out[seq] = getattr(df["date_time"].dt, seq)
    
    others = [p for p in df.columns if p not in ["date_time", "srch_ci", "srch_co"]]
    for seq in others:
        df_out[seq] = df[seq]
    
    for seq in ["month", "day", "dayofweek", "quarter"]:
        df_out["ci_{0}".format(seq)] = getattr(df["srch_ci"].dt, seq)
        df_out["co_{0}".format(seq)] = getattr(df["srch_co"].dt, seq)
        
    df_out["stay_span"] = (df["srch_co"] - df["srch_ci"]).astype('timedelta64[h]')
        
    df_out = pd.DataFrame(df_out)
    
    df_out = df_out.join(dest_desc, on="srch_destination_id", how='left', rsuffix="dest")
    df_out = df_out.drop("srch_destination_iddest", axis=1)
    return df_out

In [237]:
df = feature_process(train_2014_sam)
df.fillna(-1, inplace=True)

In [238]:
df.shape

(100000, 39)

# Modeling

In [245]:
# function to return 5 prediction clusters
def predict_clusters(model, predictors):
    # save the cluster index in the model for later ranking
    dict_clusters = {}
    for (k,v) in enumerate(model.classes_):
        dict_clusters[k] = v
    
    # validate the modeling
    vali_predictions = model.predict_proba(predictors)
    # take largest 5 probablities' index
    #clusters_index = vali_predictions.argsort(axis=1)[:, -5:]
    clusters_index = vali_predictions.argsort(axis=1)[:,-np.arange(1,6)]
    # get cluster name by 5 index
    clusters = []
    for index in clusters_index.flatten():
        clusters.append(dict_clusters.get(index))
    
    # get the cluster predictions
    cluster_predictions = np.array(clusters).reshape(clusters_index.shape)
    
    return cluster_predictions

In [246]:
df.columns

Index([                  u'channel',                    u'ci_day',
                    u'ci_dayofweek',                  u'ci_month',
                      u'ci_quarter',                    u'co_day',
                    u'co_dayofweek',                  u'co_month',
                      u'co_quarter',                       u'day',
                       u'dayofweek',             u'hotel_cluster',
                 u'hotel_continent',             u'hotel_country',
                    u'hotel_market',                      u'hour',
                      u'is_booking',                 u'is_mobile',
                      u'is_package',                    u'minute',
                           u'month', u'orig_destination_distance',
                  u'posa_continent',                   u'quarter',
                       u'site_name',           u'srch_adults_cnt',
               u'srch_children_cnt',       u'srch_destination_id',
        u'srch_destination_type_id',               u'srch_rm_c

In [304]:
# predictors used for training
predictors = [c for c in df.columns if c not in ['user_id', 'is_booking', 'hotel_cluster']]

In [305]:
from sklearn.cross_validation import train_test_split
# split the train and test sets
train_predictors, vali_predictors, train_labels, vali_labels = train_test_split(df[predictors], df['hotel_cluster'], test_size = 0.3)

## Random Forest

In [306]:
from sklearn.ensemble import RandomForestClassifier

In [307]:
rf_clf = RandomForestClassifier(n_estimators=200)

In [308]:
rf = rf_clf.fit(train_predictors, train_labels.astype(str))

In [309]:
pred = predict_clusters(rf, vali_predictors)

In [310]:
print("score:", metrics.mapk(vali_labels.astype(str), pred, k=5))

('score:', 0.1547361111111111)


## ExtraTree

In [258]:
from sklearn.ensemble import ExtraTreesClassifier

In [311]:
et_clf = ExtraTreesClassifier(n_estimators=200)

In [312]:
et = et_clf.fit(train_predictors, train_labels.astype(str))

In [261]:
pred = predict_clusters(et, vali_predictors)

In [262]:
print("score:", metrics.mapk(vali_labels.astype(str), pred, k=5))

('score:', 0.11077138888888888)


## KNN

In [263]:
from sklearn.neighbors import KNeighborsClassifier

In [264]:
knn_clf = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto')

In [265]:
knn = knn_clf.fit(train_predictors, train_labels.astype(str))

In [266]:
pred = predict_clusters(knn, vali_predictors)

In [267]:
print("score:", metrics.mapk(vali_labels.astype(str), pred, k=5))

('score:', 0.074383888888888874)


## Naive Bayes

In [268]:
from sklearn.naive_bayes import GaussianNB

In [269]:
gnb_clf = GaussianNB()

In [270]:
gnb = gnb_clf.fit(train_predictors, train_labels.astype(str))

In [271]:
pred = predict_clusters(gnb, vali_predictors)

In [272]:
print("score:", metrics.mapk(vali_labels.astype(str), pred, k=5))

('score:', 0.065816666666666662)


## Ensemble methods

In [273]:
from sklearn.ensemble import AdaBoostClassifier

In [274]:
estimator = gnb_clf

In [275]:
# ensemble methods
from sklearn.ensemble import AdaBoostClassifier
adb_clf = AdaBoostClassifier(base_estimator=estimator, n_estimators=20)

In [276]:
adb = adb_clf.fit(train_predictors, train_labels.astype(str))

In [277]:
pred = predict_clusters(adb, vali_predictors)

In [278]:
print("score:", metrics.mapk(vali_labels.astype(str), pred, k=5))

('score:', 0.028490277777777778)


## Bagging

In [279]:
# set the estimator
estimator = knn_clf

In [280]:
from sklearn.ensemble import BaggingClassifier
bagclf = BaggingClassifier(base_estimator=estimator, n_estimators=10)

In [281]:
bag = bagclf.fit(train_predictors, train_labels.astype(str))

In [282]:
pred = predict_clusters(bag, vali_predictors)

In [283]:
print("score:", metrics.mapk(vali_labels.astype(str), pred, k=5))

('score:', 0.076938055555555551)


## VotingClassifier

In [284]:
from sklearn.ensemble import VotingClassifier

In [285]:
estimators=[('knn', knn_clf), ('gnb', gnb_clf), ('rf', rf_clf)]

In [286]:
eclf = VotingClassifier(estimators=estimators, voting='soft')

In [287]:
vc = eclf.fit(train_predictors, train_labels.astype(str))

In [288]:
pred = predict_clusters(vc, vali_predictors)

In [289]:
print("score:", metrics.mapk(vali_labels.astype(str), pred, k=5))

('score:', 0.087216388888888899)


In [290]:
from sklearn.grid_search import GridSearchCV

In [291]:
params = {'knn__n_neighbors': [3, 10], 'rf__n_estimators': [10, 100]}

In [292]:
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)

In [296]:
grid = grid.fit(train_predictors, train_labels.astype(str))

In [299]:
pred = predict_clusters(grid, vali_predictors)

AttributeError: 'GridSearchCV' object has no attribute 'classes_'

In [298]:
print("score:", metrics.mapk(vali_labels.astype(str), pred, k=5))

('score:', 0.087216388888888899)


# Test Modeling

In [65]:
train2 = train.copy()

In [66]:
train2["date_time"] = pd.to_datetime(train2["date_time"])
train2["year"] = train2["date_time"].dt.year
train2["month"] = train2["date_time"].dt.month

In [67]:
train2 = train2[train2.is_booking == True]

In [68]:
df2 = feature_process(train2)
df2.fillna(-1, inplace=True)

In [70]:
df2.drop(['is_booking'], inplace=True, axis=1)

In [97]:
vc = eclf.fit(train_predictors, train_labels.astype(str))

In [72]:
predictors2 = [c for c in df2.columns if c not in ['hotel_cluster']]

In [73]:
vc = eclf.fit(df2[predictors2], df2['hotel_cluster'].astype(str))

In [74]:
pred = predict_clusters(vc, df2[predictors2])

In [75]:
print("score:", metrics.mapk(df2['hotel_cluster'].astype(str), pred, k=5))

('score:', 0.2202684013326254)


In [76]:
test["date_time"] = pd.to_datetime(test["date_time"])
test["year"] = test["date_time"].dt.year
test["month"] = test["date_time"].dt.month

In [77]:
test2 = feature_process(test)
test2.fillna(-1, inplace=True)

In [79]:
test2.shape

(2528243, 37)

In [82]:
pred = predict_clusters(vc, test2[predictors2])

In [84]:
pred.shape

(2528243, 5)

# Output test results

In [86]:
predictions_output = map(lambda x: ' '.join(map(str,x)), pred)

In [87]:
predictions_output[0:10]

['4 70 21 97 59',
 '70 4 97 21 59',
 '65 96 64 13 98',
 '77 16 54 79 1',
 '91 15 13 18 21',
 '65 91 83 40 19',
 '91 54 19 95 50',
 '85 23 28 48 91',
 '45 0 54 55 1',
 '91 54 1 0 55']

In [88]:
# load test data
submission = pd.read_csv('sample_submission.csv', sep = ',')

In [89]:
submission.shape

(2528243, 2)

In [90]:
submission['hotel_cluster'] = predictions_output

In [91]:
submission[['id', 'hotel_cluster']].to_csv('out.csv', index = False)

In [92]:
submission[:10]

Unnamed: 0,id,hotel_cluster
0,0,4 70 21 97 59
1,1,70 4 97 21 59
2,2,65 96 64 13 98
3,3,77 16 54 79 1
4,4,91 15 13 18 21
5,5,65 91 83 40 19
6,6,91 54 19 95 50
7,7,85 23 28 48 91
8,8,45 0 54 55 1
9,9,91 54 1 0 55


In [2]:
sub = pd.read_csv('./submission_2016-05-27-12-19.csv')

In [3]:
sub.shape

(2528243, 2)

In [5]:
sub.tail()

Unnamed: 0,id,hotel_cluster
2528238,2528238,34 26 73 0 84
2528239,2528239,57 91 48 41 64
2528240,2528240,54 1 45 79 24
2528241,2528241,50 47 43 15 32
2528242,2528242,36 57 12 85 46
