In [1]:
from __future__ import print_function, absolute_import, division

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import matplotlib.pyplot as plt
import pickle
import csv
%matplotlib inline

from average_precision import apk, mapk

### Data

In [2]:
# Data
# Feature columns followed by the target ('hotel_cluster') as the last entry.
train_cols = ['site_name', 'user_location_country', 'user_location_region', 'user_location_city',
              'is_mobile', 'is_package', 'channel', 'srch_adults_cnt',
              'srch_children_cnt', 'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id',
              'hotel_continent','hotel_country', 'hotel_market', 'hotel_cluster']

# Read the first 100,000 rows, parsing only the columns that are actually used
# (the user id plus features and target) instead of the whole file.
expedia_train = pd.read_csv('Data/train.csv', nrows=100000,
                            usecols=['user_id'] + train_cols)

# Save ID
# Series.ravel() is deprecated in recent pandas; .values yields the same ndarray.
user_id = expedia_train['user_id'].values

# Pull out features: select by name so the split does not depend on column position.
expedia_train = expedia_train[train_cols]
X_train = expedia_train[train_cols[:-1]]
y_train = expedia_train[train_cols[-1]]

### Random Forest

In [3]:
# Fix the seed so the fitted forest, and every number derived from it, is
# reproducible across kernel restarts.
rf_clf = RandomForestClassifier(n_estimators=100, max_features=4, random_state=42)
rf_clf.fit(X_train, y_train)
# Class probabilities for the same rows the model was fitted on (in-sample).
# Column j of the result corresponds to rf_clf.classes_[j].
probs = rf_clf.predict_proba(X_train)

In [4]:
def top_5(x):
    """Return the positions of the five largest entries of ``x``, largest first."""
    ascending = np.argsort(x)
    # Walk backwards from the last position and stop after five entries.
    return ascending[:-6:-1]

def change_format(ls):
    """Render the items of ``ls`` as a single space-separated string."""
    return ' '.join(map(str, ls))

def pred(probs, user_id, classes=None):
    """Pair each id with its five highest-scoring classes, best first.

    Parameters
    ----------
    probs : array-like, shape (n_rows, n_classes)
        Per-row class scores, e.g. the output of ``predict_proba``.
    user_id : sequence
        One identifier per row of ``probs``.
    classes : array-like, optional
        Label belonging to each column of ``probs`` (e.g. ``clf.classes_``).
        When omitted, the column index itself is reported, which is only the
        class label if the labels happen to be exactly 0..n_classes-1.

    Returns
    -------
    list of [id, str]
        One ``[id, 'c1 c2 c3 c4 c5']`` pair per entry of ``user_id``.
    """
    probs = np.asarray(probs)
    # Rank all rows in one vectorised call: sort each row ascending, then step
    # back from the end to keep up to five columns in descending score order.
    # Unlike np.apply_along_axis this also accepts an array with zero rows.
    top5 = np.argsort(probs, axis=1)[:, :-6:-1]
    if classes is not None:
        # Translate column positions into the labels they stand for.
        top5 = np.asarray(classes)[top5]

    return [[uid, ' '.join(str(c) for c in top5[i])] for i, uid in enumerate(user_id)]

# Top-5 predictions for the rows the model was fitted on, keyed by user id.
predictions = pred(probs=probs, user_id=user_id)

### Evaluation

In [5]:
# Load the user_hotel which can be used for evaluation.
# The context manager closes the file handle as soon as the object is read.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("user_hotel.p", "rb") as user_hotel_file:
    user_hotel = pickle.load(user_hotel_file)

In [12]:
def evaluation(true_dict, prediction, k=5):
    """Score predictions against ground truth with ``mapk``.

    Parameters
    ----------
    true_dict : mapping
        Ground-truth values looked up by the id in the first field of each
        prediction.
    prediction : iterable of [id, str]
        Pairs whose second field is a space-separated string of predicted
        values, as produced by ``pred``.
    k : int, default 5
        Cut-off passed through to ``mapk``.

    Returns
    -------
    The value returned by ``mapk(actuals, preds, k)``.
    """
    actuals = []
    preds = []
    for p in prediction:
        try:
            true_value = true_dict[p[0]]
        except KeyError:
            # An id without ground truth is scored against an empty list.
            # Only the missing-key case is handled; other errors propagate
            # instead of being silently swallowed.
            true_value = []
        # Splitting on a single space can yield empty tokens; skip them.
        pred = [float(h) for h in p[1].split(' ') if h]
        actuals.append(true_value)
        preds.append(pred)

    return mapk(actuals, preds, k)

# Score the current predictions against the loaded ground truth (top 5).
evaluation(true_dict=user_hotel, prediction=predictions, k=5)

0.012117738704529412

### Test

In [7]:
# Parse only the id and the feature columns of the test file.
expedia_test = pd.read_csv('Data/test.csv', usecols=['id'] + train_cols[:-1])
# Series.ravel() is deprecated in recent pandas; .values yields the same ndarray.
test_id = expedia_test['id'].values
# Keep the features in the same order that was used for training.
expedia_test = expedia_test[train_cols[:-1]]

In [9]:
# Predict the test set in fixed-size row chunks and collect the results.
chunk_size = 50000
predictions = []
for start in range(0, len(expedia_test), chunk_size):
    # Report progress once every 500,000 rows.
    if start % 500000 == 0:
        print("Starting " + str(start) + "th prediction")
    stop = start + chunk_size
    chunk_probs = rf_clf.predict_proba(expedia_test[start:stop])
    predictions.extend(pred(chunk_probs, test_id[start:stop]))

Starting 0th prediction
Starting 500000th prediction
Starting 1000000th prediction
Starting 1500000th prediction
Starting 2000000th prediction
Starting 2500000th prediction


In [10]:
# Each entry of `predictions` is an [id, 'c1 c2 c3 c4 c5'] pair. Write its two
# fields as the two CSV columns; writing the pair itself into one cell would
# emit the list's repr in the hotel_cluster column.
with open('prediction_RF.csv', 'w') as outfile:
    csv_out = csv.writer(outfile)
    csv_out.writerow(['id', 'hotel_cluster'])
    for row_id, clusters in predictions:
        csv_out.writerow([row_id, clusters])