# Expedia Hotel Recommendations

![Expedia](https://ppt.cc/fldtFx@.jpg)

<table>
  <tr>
    <td>Column name</td>
    <td>Description</td>
  </tr>
  <tr>
    <td>date_time</td>
    <td>Timestamp</td>
  </tr>
  <tr>
    <td>site_name</td>
    <td>ID of the Expedia point of sale</td>
  </tr>
  <tr>
    <td>posa_continent</td>
    <td>ID of continent associated with site_name</td>
  </tr>
  <tr>
    <td>user_location_country</td>
    <td>The ID of the country the customer is located</td>
  </tr>
  <tr>
    <td>user_location_region</td>
    <td>The ID of the region the customer is located</td>
  </tr>
  <tr>
    <td>user_location_city</td>
    <td>The ID of the city the customer is located</td>
  </tr>
  <tr>
    <td>orig_destination_distance</td>
    <td>Physical distance between a hotel and a customer at the time of search</td>
  </tr>
  <tr>
    <td>user_id</td>
    <td>ID of user</td>
  </tr>
  <tr>
    <td>is_mobile</td>
    <td>1 when a user connected from a mobile device, 0 otherwise</td>
  </tr>
  <tr>
    <td>is_package</td>
    <td>1 if the click/booking was generated as a part of a package</td>
  </tr>
  <tr>
    <td>channel</td>
    <td>ID of a marketing channel</td>
  </tr>
  <tr>
    <td>srch_ci</td>
    <td>Checkin date</td>
  </tr>
  <tr>
    <td>srch_co</td>
    <td>Checkout date</td>
  </tr>
  <tr>
    <td>srch_adults_cnt</td>
    <td>The number of adults specified in the hotel room</td>
  </tr>
  <tr>
    <td>srch_children_cnt</td>
    <td>The number of (extra occupancy) children specified in the hotel room</td>
  </tr>
  <tr>
    <td>srch_rm_cnt</td>
    <td>The number of hotel rooms specified in the search</td>
  </tr>
  <tr>
    <td>srch_destination_id</td>
    <td>ID of the destination where the hotel search was performed</td>
  </tr>
  <tr>
    <td>srch_destination_type_id</td>
    <td>Type of destination</td>
  </tr>
  <tr>
    <td>hotel_continent</td>
    <td>Hotel continent</td>
  </tr>
  <tr>
    <td>hotel_country</td>
    <td>Hotel country</td>
  </tr>
  <tr>
    <td>hotel_market</td>
    <td>Hotel market</td>
  </tr>
  <tr>
    <td>is_booking</td>
    <td>1 if a booking, 0 if a click</td>
  </tr>
  <tr>
    <td>cnt</td>
    <td>Number of similar events in the context of the same user session</td>
  </tr>
  <tr>
    <td>hotel_cluster</td>
    <td>ID of a hotel cluster</td>
  </tr>
    
</table>

In [1]:
import os

mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-7.3.0-posix-seh-rt_v5-rev0\\mingw64\\bin'

os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']

In [2]:
import numpy as np
import pandas as pd
import sys
from sklearn.model_selection import cross_val_score

## 匯入data

In [3]:
destinations = pd.read_csv("destinations.csv")

In [4]:
test=pd.read_csv('test.csv')

In [5]:
train = pd.read_csv("train.csv")

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37670293 entries, 0 to 37670292
Data columns (total 24 columns):
date_time                    object
site_name                    int64
posa_continent               int64
user_location_country        int64
user_location_region         int64
user_location_city           int64
orig_destination_distance    float64
user_id                      int64
is_mobile                    int64
is_package                   int64
channel                      int64
srch_ci                      object
srch_co                      object
srch_adults_cnt              int64
srch_children_cnt            int64
srch_rm_cnt                  int64
srch_destination_id          int64
srch_destination_type_id     int64
is_booking                   int64
cnt                          int64
hotel_continent              int64
hotel_country                int64
hotel_market                 int64
hotel_cluster                int64
dtypes: float64(1), int64(20), object(3)

## 轉換日期格式

In [7]:
train["date_time"] = pd.to_datetime(train["date_time"])
train["year"] = train["date_time"].dt.year
train["month"] = train["date_time"].dt.month

## 隨機抽取10000個用戶，並將原訓練集分為新訓練集t1及新測試集t2

In [8]:
import  random

unique_users= train.user_id.unique()

sel_user_ids = [unique_users[i] for i in sorted(random.sample(range(len(unique_users)), 10000)) ]
sel_train = train[train.user_id.isin(sel_user_ids)]

In [9]:
t1 = sel_train[((sel_train.year == 2013) | ((sel_train.year == 2014) & (sel_train.month < 8)))]
t2 = sel_train[((sel_train.year == 2014) & (sel_train.month >= 8))]

## 選擇feature

In [41]:
select_feature =  ['site_name','posa_continent','user_location_country','user_location_region','user_location_city','is_mobile', 'is_package', 'srch_adults_cnt', 'srch_children_cnt', 'srch_destination_id','srch_destination_type_id','hotel_market', 'hotel_country']

In [42]:
x_train = t1[select_feature]
y_train = t1['hotel_cluster']

In [43]:
x_test = test[select_feature]

## 使用RandomForest進行訓練
[RandomForest](http://scikit-learn.org/stable/modules/ensemble.html#forest)

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
clf = RandomForestClassifier(n_estimators=10, min_weight_fraction_leaf=0.1)

In [16]:
clf.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.1, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [17]:
from sklearn.model_selection import cross_val_score
cross_val_score(clf, x_train, y_train, cv=10).mean()

0.056719566440129

## 使用XGboost進行訓練
[XGboost](http://xgboost.readthedocs.io/en/latest/python/python_intro.html)

In [51]:
import xgboost as xgb
from xgboost import XGBClassifier

In [52]:
xgbc = XGBClassifier()

In [20]:
xgbc.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [21]:
xgbc.score(x_train,y_train)

  if diff:


0.163010023286423

## 使用nearest neighbors進行訓練
[nearest neighbors](http://scikit-learn.org/stable/modules/neighbors.html#classification)

In [16]:
from sklearn.neighbors.nearest_centroid import NearestCentroid

In [17]:
clf_nearest = NearestCentroid()

In [18]:
clf_nearest.fit(x_train,y_train)

NearestCentroid(metric='euclidean', shrink_threshold=None)

In [21]:
cross_val_score(clf_nearest, x_train, y_train, cv=10).mean()

0.006991130677529817

## keras

In [13]:
import keras
from keras.layers import Dense , Dropout
from keras.models import Sequential

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [14]:
keras_model = keras.models.Sequential()

In [15]:
keras_model.add(keras.layers.Dense(30,kernel_initializer='uniform',input_dim=13 , activation=('relu')))

In [16]:
keras_model.add(keras.layers.Dense(5,kernel_initializer='uniform',activation=('relu'))) 

In [17]:
keras_model.add(keras.layers.Dense(1,kernel_initializer='uniform',activation=('sigmoid'))) 

In [18]:
keras_model.compile(loss='mse', optimizer='adam',metrics=['accuracy'])

In [19]:
keras_model.fit(x_train, y_train,epochs=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1d2b103b828>

In [32]:
acc_trainacc_trai = keras_model.evaluate(x_train, y_train)
acc_trainacc_trai[1]



0.013256656518264673