# Algorithm Selection for a Single-Criterion Recommender System (I)

## 1. Selection of the algorithm

Let's use the structure on https://github.com/NicolasHug/Surprise/blob/master/examples/benchmark.py to evaluate different algorithms to use on the recommender system with the data we have.

In any case, I want to compare item-based and user-based approaches whenever possible. I believe SVD does not offer that choice.

Once I have done that, I will implement that specific one on the recommender.


In [None]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import time
import datetime
import random
import os

import numpy as np
import six
from tabulate import tabulate

from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise import NormalPredictor
from surprise import BaselineOnly
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNBaseline
from surprise import SVD
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering


The default is **user-based**; if we want **item-based**, we need to set the `user_based` entry of `sim_options` to `False`.

The default similarity is **MSD**.

Anything else?

In [None]:
# Candidate algorithms, each benchmarked with its default settings
# (for the KNN family that means user-based with MSD similarity).
classes = {
    'SVD': SVD,
    'NMF': NMF,
    'SlopeOne': SlopeOne,
    'KNNBasic': KNNBasic,
    'KNNWithMeans': KNNWithMeans,
    'KNNBaseline': KNNBaseline,
    'CoClustering': CoClustering,
    'BaselineOnly': BaselineOnly,
    'NormalPredictor': NormalPredictor,
}

# Seed both random number generators before any data is touched.
random.seed(0)
np.random.seed(0)

# Ratings file: one "item,rating,user" record per line.
file_path = os.path.expanduser('/home/jonas/Desktop/SpringBoard_Capstone_1/FIRST ATTEMPT/generated_ratings_1_reduced.csv')
reader = Reader(sep=',', line_format='item rating user')
data = Dataset.load_from_file(file_path, reader=reader)

# A seeded splitter, so the folds will be the same for all algorithms.
kf = KFold(random_state=0)

header = ['Name', 'RMSE', 'MAE', 'FCP', 'Time']
measures = ['rmse', 'mae', 'fcp']
table = []

for algo_name, algo_class in classes.items():
    started = time.time()
    scores = cross_validate(algo_class(), data, measures, kf, verbose=False)
    # Wall-clock duration of the whole cross-validation, truncated to seconds.
    elapsed = datetime.timedelta(seconds=int(time.time() - started))

    # One table row: name, mean of each measure over the folds, duration.
    row = [algo_name]
    for measure in measures:
        row.append('{:.3f}'.format(np.mean(scores['test_' + measure])))
    row.append(str(elapsed))
    table.append(row)

print('\n\n User-based recommenders \n')
print(tabulate(table, header, tablefmt="pipe"))


In [None]:
# Same KNN family as before, but with similarities computed between items
# instead of between users ('user_based': False).
sim_options = {'name': 'msd', 'user_based': False}
classes = {
    'KNNBasic': KNNBasic(sim_options=sim_options),
    'KNNWithMeans': KNNWithMeans(sim_options=sim_options),
    'KNNBaseline': KNNBaseline(sim_options=sim_options),
}

# Seed both random number generators before any data is touched.
random.seed(0)
np.random.seed(0)

# Ratings file: one "item,rating,user" record per line.
file_path = os.path.expanduser('/home/jonas/Desktop/SpringBoard_Capstone_1/FIRST ATTEMPT/generated_ratings_1_reduced.csv')
reader = Reader(sep=',', line_format='item rating user')
data = Dataset.load_from_file(file_path, reader=reader)

# A seeded splitter, so the folds will be the same for all algorithms.
kf = KFold(random_state=0)


def _mean_of(results, key):
    """Format the mean of results[key] with three decimals."""
    return '{:.3f}'.format(np.mean(results[key]))


header = ['Name', 'RMSE', 'MAE', 'FCP', 'Time']
table = []

# The dict values are already-constructed algorithm instances here.
for algo_name, model in classes.items():
    started = time.time()
    results = cross_validate(model, data, ['rmse', 'mae', 'fcp'], kf, verbose=False)
    # Wall-clock duration of the whole cross-validation, truncated to seconds.
    duration = str(datetime.timedelta(seconds=int(time.time() - started)))
    table.append([
        algo_name,
        _mean_of(results, 'test_rmse'),
        _mean_of(results, 'test_mae'),
        _mean_of(results, 'test_fcp'),
        duration,
    ])

print('\n\n Item-based recommenders \n')
print(tabulate(table, header, tablefmt="pipe"))

Indeed, it seems that KNN performs a bit better. Even though we cannot see a delay in run time, it is more expensive to go item-based, so we won't do so by default.

How about another similarity?

In [None]:
# Try a similarity measure other than the default. MSD is what the KNN
# algorithms use when nothing is specified, so re-running with 'msd' would only
# repeat the earlier user-based figures; use pearson_baseline instead.
# 'user_based' is spelled out so the table title below cannot drift from the
# configuration that was actually benchmarked.
sim_options = {'name': 'pearson_baseline', 'user_based': True}
classes = {'KNNBasic': KNNBasic(sim_options=sim_options),
           'KNNWithMeans': KNNWithMeans(sim_options=sim_options),
           'KNNBaseline': KNNBaseline(sim_options=sim_options)}

# set RNG
np.random.seed(0)
random.seed(0)

# Ratings file: one "item,rating,user" record per line.
file_path = os.path.expanduser('/home/jonas/Desktop/SpringBoard_Capstone_1/FIRST ATTEMPT/generated_ratings_1_reduced.csv')
reader = Reader(line_format='item rating user', sep=',')
data = Dataset.load_from_file(file_path, reader=reader)

kf = KFold(random_state=0)  # folds will be the same for all algorithms.

table = []

for name, klass in classes.items():
    start = time.time()
    out = cross_validate(klass, data, ['rmse', 'mae', 'fcp'], kf, verbose=False)
    # Wall-clock duration of the whole cross-validation, truncated to seconds.
    cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse']))
    mean_mae = '{:.3f}'.format(np.mean(out['test_mae']))
    mean_fcp = '{:.3f}'.format(np.mean(out['test_fcp']))

    new_line = [name, mean_rmse, mean_mae, mean_fcp, cv_time]
    table.append(new_line)

# Build the title from sim_options rather than hard-coding it: the previous
# title said "Item-based" although this cell never disabled user_based.
approach = 'User-based' if sim_options['user_based'] else 'Item-based'
print('\n\n {} recommenders ({} similarity) \n'.format(approach, sim_options['name']))
header = ['Name',
          'RMSE',
          'MAE',
          'FCP',
          'Time'
          ]
print(tabulate(table, header, tablefmt="pipe"))

In [None]:
from surprise.model_selection import GridSearchCV

# Hyper-parameter candidates for SVD; every combination is tried.
param_grid = {
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}

# 3-fold cross-validated search, scored on RMSE and MAE, over the ratings in
# `data` (loaded by an earlier cell).
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Report the best RMSE found, followed by the parameter combination that
# produced it.
best_rmse = gs.best_score['rmse']
best_rmse_params = gs.best_params['rmse']
print(best_rmse)
print(best_rmse_params)

***

## 2. Example of implementation

Okay, now we know we want to use a kNN Baseline algorithm (for now). So... 

* For an existing user with some ratings, what would the best hotels be? How do we find them?
* what are similar hotels to a specific one?
* What if a user doesn't have any ratings? Should we just add the most popular hotels for the area?

Let's start again by training the algorithm. Then I will choose both a user with ratings and a user without ratings, and look at the output for each. Finally, let's try to look at hotels similar to a specific one.

In [33]:
# import specific libraries
import numpy as np
import pandas as pd
import os

from surprise import Dataset
from surprise import Reader
from surprise import KNNBaseline

# read data
file_path = os.path.expanduser('/home/jonas/Desktop/SpringBoard_Capstone_1/FIRST ATTEMPT/generated_ratings_1_reduced.csv')
reader = Reader(line_format='item rating user', sep=',')
data = Dataset.load_from_file(file_path, reader=reader)

# train kNN-Baseline on the whole collection (both, user and item-wise)
trainset = data.build_full_trainset()

# Build the algorithms, and train them.
# `algo` uses the default similarity options; `algo_items` computes
# pearson_baseline similarities between items instead of between users.
algo = KNNBaseline()
algo.fit(trainset)
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo_items = KNNBaseline(sim_options=sim_options)
algo_items.fit(trainset)

######################################################################
# Best hotels for user XX
# list of hotels... (every distinct item in the ratings file; the file path is
# reused from above instead of being spelled out a second time)
hoteldf = pd.read_csv(file_path, header=None, names=['item', 'rating', 'user'])
hotels = hoteldf['item'].unique().tolist()


def _top_n_hotels(model, user, candidates, n=10):
    """Return the n candidates with the highest estimated rating for user.

    Parameters:
        model: fitted algorithm exposing predict(user, item), whose result
            carries the estimate in its .est attribute.
        user: raw user id, as a string.
        candidates: iterable of raw hotel ids to score.
        n: number of hotels to keep.

    Returns:
        List of (hotel, estimated_rating) pairs, best first.

    No filtering is applied: hotels the user has already rated are scored and
    ranked like any other.
    """
    estimates = {}
    for hot in candidates:
        estimates[hot] = model.predict(user, hot).est
    best = sorted(estimates, key=estimates.get, reverse=True)[:n]
    return [(hot, estimates[hot]) for hot in best]


def _print_top_hotels(model, user, suffix, candidates, n=10):
    """Print a 'Top n hotels for user ...' title followed by one line per hotel.

    suffix is printed as a separate print() argument after the user id, so it
    ends up preceded by a space, exactly as in the original hand-written cases.
    """
    print('\n\nTop {} hotels for user'.format(n), user, suffix)
    for hot, estimate in _top_n_hotels(model, user, candidates, n):
        print(hot, ':', estimate)


# case 1
user1 = '3'
_print_top_hotels(algo, user1, ':', hotels)

# case 1.5 (same user, item-based model)
_print_top_hotels(algo_items, user1, ' (item-based):', hotels)

# case 2
# NOTE(review): what predict() returns for a user id that is absent from the
# training data is decided by the library - confirm against the Surprise docs.
user2 = '2000'  # there's 1789 users with ratings...
_print_top_hotels(algo, user2, ' (who has no reviews whatsoever):', hotels)

# case 3
user2 = '2100'  # there's 1789 users with ratings...
_print_top_hotels(algo, user2, ' (who has no reviews whatsoever either):', hotels)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


Top 10 hotels for user 3 :
blue moon hotel : 5
homewood suites las vegas airport : 5
holiday inn express hotel and suites las vegas 215 beltway : 4.996907332713799
comfort inn midtown manhattan : 4.996907332713798
allerton hotel : 4.9849054020480885
serrano hotel a kimpton hotel : 4.9574337074161665
four seasons hotel san francisco : 4.948217391789254
the bowery hotel : 4.937430263528778
hotel vitale : 4.937430263528778
the peninsula chicago : 4.933398035411475


Top 10 hotels for user 3  (item-based):
kitano new york : 5
best western hospitality house : 4.920003316334549
omni san francisco hotel : 4.842450737561286
columbus motor inn : 4.82953612873261
plaza athenee hotel : 4.692137807975122
hilton club new york : 4.669006284896156
w san francisco : 4.58492345350793

In [32]:
########################################################################
# Hotels similar to XX
hotel = 'courtyard by marriott new york manhattan upper east side'

# Convert the raw hotel name to an inner item id. Use the trainset of
# algo_items for this, because algo_items is the model that is queried for
# neighbours below; going through another model's trainset only works while
# both happen to be fitted on the same trainset.
h_inner_id = algo_items.trainset.to_inner_iid(hotel)

# neighbouring inner ids (the 10 nearest items according to algo_items)
hotel_neighbors = algo_items.get_neighbors(h_inner_id, k=10)

# take them back to names; a list (rather than a one-shot generator) keeps
# hotel_neighbors usable after it has been printed
hotel_neighbors = [algo_items.trainset.to_raw_iid(inner_id) for inner_id in hotel_neighbors]

# boom
for hotl in hotel_neighbors:
    print(hotl)


# TODO: most popular hotels? From the original dataset, we want the highest average given a minimum number of ratings

the kimberly hotel
sofitel chicago water tower
the peninsula chicago
lowell hotel
the plaza
the gem hotel chelsea
greenwich hotel
sofitel new york
the carlyle a rosewood hotel
the ritz carlton
