In [1]:
from __future__ import print_function, absolute_import, division

import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import pickle
import csv
%matplotlib inline

from average_precision import apk, mapk

### Data

In [2]:
# Data: only the first 500,000 rows of the training file are read
expedia_train = pd.read_csv('Data/train.csv', nrows=500000)

# Pull out features.
# The target ('hotel_cluster') is deliberately the LAST entry: it is split off
# positionally below, and the test cell reuses train_cols[:-1] as the feature list.
train_cols = ['site_name', 'user_location_country', 'user_location_region', 'user_location_city',
              'is_mobile', 'is_package', 'channel', 'srch_adults_cnt',
              'srch_children_cnt', 'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id',
              'hotel_continent','hotel_country', 'hotel_market', 'hotel_cluster']

# Save ID (.values rather than Series.ravel(), which newer pandas deprecates)
user_id = expedia_train['user_id'].values
expedia_train = expedia_train[train_cols]
X_train = expedia_train.iloc[:, :-1]
y_train = expedia_train.iloc[:, -1]

# Per-feature training statistics, reused later to normalize the test set.
# Computed with NumPy directly so the population std (ddof=0) is explicit --
# the same convention preprocessing.scale uses for X_norm below.
X_train_values = X_train.values.astype(float)
means = X_train_values.mean(axis=0)
stds = X_train_values.std(axis=0)

# Standardize the data
X_norm = preprocessing.scale(X_train)

# Transform y_train into a one-hot matrix: one row per class, one column per sample
n_classes = 100
y = np.zeros((n_classes, len(y_train)))

# Single vectorized assignment instead of building np.eye(100) once per sample
y[y_train.values, np.arange(len(y_train))] = 1

### Neural Network

In [3]:
# Logistic sigmoid
def s(r):
    """Return the element-wise logistic sigmoid ``1 / (1 + exp(-r))``.

    ``r`` may be a scalar or a NumPy array; the result has the same shape.
    """
    denominator = 1 + np.exp(-r)
    return 1 / denominator

# Hyperbolic tangent, expressed through the sigmoid s()
def tanh(z):
    """Return ``2 * s(2 * z) - 1`` element-wise, i.e. tanh written via the sigmoid."""
    doubled = 2 * z
    return 2 * s(doubled) - 1

# Squared-error risk of the network over a whole data set
def risk_f1(X, y, V, W, x_size, h_size):
    """Return the total squared error between the network output and ``y``.

    The error is summed over all outputs and samples (it is not averaged).

    X      : 2-D array, one sample per row.
    y      : target matrix, compared element-wise with the network output
             (one column per sample).
    V, W   : hidden-layer and output-layer weight matrices; each is applied to
             an input that has had a constant 1 inserted.
    x_size : column index at which the constant-1 input is inserted into X.
    h_size : row index at which the constant-1 unit is inserted into the
             hidden activations.
    """
    # Constant-1 column for the hidden-layer intercept
    inputs_with_bias = np.insert(X, x_size, 1, axis=1)
    hidden = tanh(V.dot(inputs_with_bias.T))

    # Constant-1 row for the output-layer intercept (hidden is units x samples)
    hidden_with_bias = np.insert(hidden, h_size, 1, axis=0)
    output = s(W.dot(hidden_with_bias))

    residual = output - y
    return np.sum(residual**2)

In [4]:
def trainNeuralNetwork(X, labels, epsilon, change_point, max_iter,
                       x_size=15, h_size=50, z_size=100):
    """Train a one-hidden-layer network (tanh hidden, sigmoid output) with SGD.

    One randomly chosen sample is used per iteration and the squared-error
    gradient is back-propagated through both layers.

    Parameters
    ----------
    X            : 2-D array, one sample per row, ``x_size`` features each.
    labels       : target matrix with ``z_size`` rows and one column per sample.
    epsilon      : initial learning rate.
    change_point : container of iteration numbers; at each of them the
                   learning rate is divided by 10.
    max_iter     : last iteration index (the loop runs for 0..max_iter
                   inclusive, so the final progress report is printed).
    x_size, h_size, z_size : input, hidden and output layer sizes.

    Returns
    -------
    (V, W) : weight matrices of shape ``(h_size, x_size + 1)`` and
             ``(z_size, h_size + 1)``; the last column of each is the intercept.
    """
    # Initialize weight
    V = np.random.uniform(-1, 1, (h_size, x_size))
    W = np.random.uniform(-1, 1, (z_size, h_size))
    # Intercept: an extra last column initialised to 1
    V = np.insert(V, x_size, 1, axis=1)
    W = np.insert(W, h_size, 1, axis=1)

    n_samples = len(X)
    # Progress is reported every tenth of the run; fall back to 1 so that
    # max_iter == 0 does not trigger a modulo-by-zero.
    report_every = (max_iter/10) or 1

    for i in range(max_iter + 1):
        if i in change_point:
            epsilon = epsilon/10
            print("After ", str(i), "th iterations", "\n"
                  "The learning rate changes to ",
                  str(epsilon), sep='')
        if i % report_every == 0:
            print('Finishing iteration ', str(i), '\n',
                  'Risk at this iteration is ',
                  str(risk_f1(X, labels, V, W, x_size, h_size)), sep='')

        # pick one data point randomly (an int argument samples from
        # range(n_samples) without rebuilding an index array every iteration)
        index = np.random.choice(n_samples)
        x = X[index]

        # Forward pass
        h = tanh(V.dot(np.insert(x, x_size, 1)))
        z = s(W.dot(np.insert(h, h_size, 1)))

        # Backward pass
        z_grad = z - labels[:, index]
        # Error signal at the output pre-activation (sigmoid derivative z*(1 - z))
        z_delta = z_grad*z*(1 - z)
        W_grad = z_delta.reshape(z_size, 1)*h.reshape(1, h_size)
        h_grad = z_delta.reshape(1, z_size).dot(W[:, :-1]).ravel()
        # tanh derivative is (1 - h**2); the input is the sample just used in
        # the forward pass (the original referenced an undefined `images`).
        V_grad = (h_grad*(1 - h**2)).reshape(h_size, 1)*x

        # stochastic gradient descent update
        # NOTE(review): only the non-intercept columns are updated, so the
        # intercept columns keep their initial value of 1 -- confirm intended.
        V[:, :-1] = V[:, :-1] - epsilon*V_grad
        W[:, :-1] = W[:, :-1] - epsilon*W_grad

    return(V, W)

In [5]:
# Seed NumPy's global RNG: the weight initialisation and the per-iteration
# sample choice inside trainNeuralNetwork both draw from it.
np.random.seed(0)
# Iterations at which the learning rate is divided by 10
change_point = (50000, 150000, 300000)
V, W = trainNeuralNetwork(X_norm, y, epsilon=0.1,
                          change_point=change_point, max_iter=500000)

Finishing iteration 0
Risk at this iteration is 24824585.5736
After 50000th iterations
The learning rate changes to 0.01
Finishing iteration 50000
Risk at this iteration is 498185.753702
Finishing iteration 100000
Risk at this iteration is 497346.892196
After 150000th iterations
The learning rate changes to 0.001
Finishing iteration 150000
Risk at this iteration is 496470.425532
Finishing iteration 200000
Risk at this iteration is 496406.351742
Finishing iteration 250000
Risk at this iteration is 496343.37726
After 300000th iterations
The learning rate changes to 0.0001
Finishing iteration 300000
Risk at this iteration is 496294.292905
Finishing iteration 350000
Risk at this iteration is 496288.785152
Finishing iteration 400000
Risk at this iteration is 496283.863711
Finishing iteration 450000
Risk at this iteration is 496278.644469
Finishing iteration 500000
Risk at this iteration is 496274.132167


In [6]:
def predictNeuralNetwork(V, W, X):
    """Run the forward pass of the trained network on every row of ``X``.

    V : hidden-layer weights, last column is the intercept.
    W : output-layer weights, last column is the intercept.
    X : 2-D array, one sample per row.

    Returns the sigmoid outputs with one row per output unit and one column
    per sample (transpose it to get one row per sample).
    """
    # Layer sizes are taken from the weights instead of being hard-coded to
    # 15 / 50, so networks trained with other x_size / h_size also work.
    x_size = V.shape[1] - 1
    h = tanh((V.dot(np.insert(X, x_size, 1, axis=1).T)))
    # h is (hidden units x samples); add the constant-1 row for the intercept
    z = s(W.dot(np.insert(h, h.shape[0], 1, axis=0)))

    return(z)

In [8]:
# Network outputs for the standardized training features
# (one row per output unit, one column per training sample)
prediction = predictNeuralNetwork(V, W, X_norm)

In [9]:
def top_5(x):
    """Return the indices of the five largest entries of ``x``, largest first.

    If ``x`` has fewer than five entries, all indices are returned.
    """
    ascending = np.argsort(x)
    # Walk backwards from the end and stop after five positions
    return ascending[:-6:-1]

def change_format(ls):
    """Join the items of ``ls`` into a single space-separated string."""
    return ' '.join(map(str, ls))

def pred(probs, user_id):
    """Pair every id with its five top-scoring column indices.

    probs   : 2-D array, one row of scores per id.
    user_id : sequence of ids, in the same order as the rows of ``probs``.

    Returns a list of ``[id, "i1 i2 i3 i4 i5"]`` entries, where the string
    holds the indices of the five largest scores, largest first.
    """
    # Rank each row independently
    ranked = np.apply_along_axis(top_5, 1, probs)

    rows = []
    for position, uid in enumerate(user_id):
        rows.append([uid, change_format(ranked[position])])
    return rows

### Evaluation

In [10]:
# Load the user_hotel which can be used for evaluation
# NOTE: unpickling can execute arbitrary code -- only load a file you trust.
# The context manager closes the file handle once loading is done.
with open("user_hotel.p", "rb") as user_hotel_file:
    user_hotel = pickle.load(user_hotel_file)

In [11]:
def evaluation(true_dict, prediction, k=5):
    """Score ``prediction`` against ``true_dict`` with ``mapk``.

    true_dict  : mapping from an id to its list of true values.
    prediction : iterable of entries whose first item is the id and whose
                 second item is a space-separated string of predicted values.
    k          : cut-off forwarded to ``mapk``.

    Ids missing from ``true_dict`` are scored against an empty list of
    true values. Returns whatever ``mapk(actuals, preds, k)`` returns
    (presumably the mean average precision at k -- see average_precision).
    """
    actuals = []
    preds = []
    for entry in prediction:
        # Only a missing id is expected here; any other error should surface
        # instead of being silently swallowed by a bare except.
        try:
            true_value = true_dict[entry[0]]
        except KeyError:
            true_value = []
        # Parse the space-separated string back into numbers, skipping the
        # empty tokens produced by repeated spaces.
        predicted = [float(h) for h in entry[1].split(' ') if len(h) != 0]
        actuals.append(true_value)
        preds.append(predicted)

    return(mapk(actuals, preds, k))

# prediction is transposed so that each row handed to pred() holds the
# scores of one sample, lined up with the entries of user_id.
predictions = pred(prediction.T, user_id)
# Kept as the cell's last expression so the score is displayed
evaluation(user_hotel, predictions, k=5)

0.08119796833333337

### Test

In [12]:
expedia_test = pd.read_csv('Data/test.csv')
# .values rather than Series.ravel(), which newer pandas deprecates
test_id = expedia_test['id'].values
# Same feature columns as training (everything except the target)
expedia_test = expedia_test[train_cols[:-1]]
# Normalize with the TRAINING means / stds.
# A feature that is constant in the training data has std 0; dividing by it
# would give inf/nan, so such features are only centred -- the same way
# preprocessing.scale treats zero-variance features.
safe_stds = stds.copy()
safe_stds[safe_stds == 0] = 1
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0
expedia_test_norm = (expedia_test.values - means)/safe_stds

In [13]:
# Score the normalized test rows and turn the (transposed) outputs into
# [id, "top-5 indices"] entries in a single step.
predictions = pred(predictNeuralNetwork(V, W, expedia_test_norm).T, test_id)

In [14]:
# Write the submission file: one header row, then one row per test id.
# NOTE(review): on Python 3 the csv docs recommend opening with newline='';
# left unchanged because the notebook's __future__ imports suggest it may
# also need to run on Python 2.
with open('prediction_NN.csv', 'w') as outfile:
    csv_out = csv.writer(outfile)
    csv_out.writerow(['id', 'hotel_cluster'])
    # Every entry of predictions is already an [id, "c1 c2 c3 c4 c5"] pair.
    # The original wrote the whole pair (a Python list) into the
    # hotel_cluster column; write the id and the cluster string instead.
    for entry in predictions:
        csv_out.writerow([entry[0], entry[1]])