# Init

In [1]:
import random

import numpy as np
import pandas as pd

from surprise import Dataset, Reader, KNNWithZScore, SVD
from surprise.model_selection import KFold, cross_validate

from hyperopt import hp, tpe, fmin, Trials, STATUS_OK

from ContentALS import ContentALS
from Metrics import mae, rmse, ndcg, full_ndcg

In [2]:
final_score = {}

In [3]:
ITEMS_NUM = 1993 # used for monkey patching of missing functionality in surprise library

def predict_for_user(self, user):
    global ITEMS_NUM
    return np.array([self.predict(user, item).est for item in range(ITEMS_NUM)])
SVD.predict_for_user = predict_for_user
KNNWithZScore.predict_for_user = predict_for_user

In [4]:
DIVIDER = 10

items = pd.read_csv('dataset/YELP/business_attributes.csv', low_memory=False)
ratings = pd.read_csv('dataset/YELP/ratings.csv', low_memory=False)
ratings.dropna(inplace=True)

for col in items.columns:
    if (col != 'iid') and (col != 'name'):
        items[col] /= DIVIDER

K_CORE = 20

end = False
while not end:
    for left, right in zip(('iid', 'uid'), ('uid', 'iid')):                   
        kept = ratings.groupby(left).count()
        
        end = kept[right].min() >= K_CORE
        
        kept = kept[kept[right] >= K_CORE]
        kept = kept.reset_index()[[left]]
        ratings = kept.merge(ratings,on=left, how='left')
ratings = ratings[['uid', 'iid', 'rating']]

In [5]:
uid = ratings.uid.unique()
uid = pd.DataFrame({'uid': uid, 'uid_new': np.arange(uid.shape[0], dtype=np.int)})
ratings = ratings.merge(uid, on='uid', how='left')
ratings.drop('uid', axis=1, inplace=True)
ratings.rename({'uid_new': 'uid'}, axis=1, inplace=True)

iid = ratings.iid.unique()
iid = pd.DataFrame({'iid': iid, 'iid_new': np.arange(iid.shape[0], dtype=np.int)})
ratings = ratings.merge(iid, on='iid', how='left')
ratings.drop('iid', axis=1, inplace=True)
ratings.rename({'iid_new': 'iid'}, axis=1, inplace=True)
items = iid.merge(items, on='iid', how='left')
items.drop('iid', axis=1, inplace=True)
items.rename({'iid_new': 'iid'}, axis=1, inplace=True)

ratings = ratings[['uid', 'iid', 'rating']]

In [6]:
SPLITS = 4 # number of k-fold splits
MAX_EVALS = 100 # number of trials in hyperparam search

np.random.seed = 0
random.seed = 0
data = Dataset.load_from_df(ratings[['uid', 'iid', 'rating']], Reader(rating_scale=[1, 5]))
kf = KFold(n_splits=SPLITS)

surprise_sets = [(trainset, testset) for trainset, testset in kf.split(data)]
my_sets = [(pd.DataFrame(trainset.build_testset(), columns=['uid', 'iid', 'rating']),
            pd.DataFrame(testset, columns=['uid', 'iid', 'rating'])) for (trainset, testset) in surprise_sets]

In [7]:
def init_status():
    empty_status = {
        'status': STATUS_OK,
        'loss': 0,
        'metrics': {
            'rmse': [],
            'rmse_stddev': 0,
            'mae': [],
            'mae_stddev': 0,
            'ndcg@5': [],
            'ndcg@5_stddev': 0,
            'ndcg@20': [],
            'ndcg@20_stddev': 0,
            'ndcg@50': [],
            'ndcg@50_stddev': 0,
        }
    }
    return empty_status

def update_status(status, predictions, algo=None):
    val = rmse(predictions)
    status['metrics']['rmse'].append(val)
        
    val = mae(predictions)
    status['metrics']['mae'].append(val)
    
    if algo is not None:
        val = full_ndcg(algo, predictions, 5)
        status['metrics']['ndcg@5'].append(val)

        val = full_ndcg(algo, predictions, 20)
        status['metrics']['ndcg@20'].append(val)

        val = full_ndcg(algo, predictions, 50)
        status['metrics']['ndcg@50'].append(val)
    
    return status

def merge_splits_status(status):
    keys = ['rmse', 'mae']
    if len(status['metrics']['ndcg@5']) > 0:
        keys.extend(['ndcg@5', 'ndcg@20', 'ndcg@50'])
    for key in keys:
        status['metrics'][f'{key}_stddev'] = np.std(status['metrics'][key])
        status['metrics'][key] = np.mean(status['metrics'][key])
    status['loss'] = status['metrics']['rmse']
    
    return status

# Verify - warm start

In [8]:
# SVD
space = {
    'n_factors': hp.loguniform('n_factors', 2.305, 5.2985),
    'biased': hp.choice('biased', [False, True]),
    'lr_all': hp.loguniform('lr_all', -8, 0),
    'reg_all': hp.loguniform('reg_all', -8, 0)
}

def test(calculate_ndcg, kwargs):
    kwargs['n_factors'] = int(kwargs['n_factors'])
    result = init_status()
    
    for (trainset, testset) in surprise_sets:
        np.random.seed = 0
        random.seed = 0
        algo = SVD(**kwargs)
        algo.fit(trainset)
        predictions = algo.test(testset, verbose=False)
        predictions = pd.DataFrame(predictions, columns=['uid', 'iid', 'rating', 'prediction', 'meta'])
        
        passed_algo = algo if calculate_ndcg else None
        result = update_status(result, predictions, passed_algo)
        
    return merge_splits_status(result)

svd_trials = Trials()
best = fmin((lambda kwargs: test(False, kwargs)),
    space=space,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=svd_trials)

print(best)
final_score['SVD'] = (best, test(True, best))
print(final_score['SVD'])

100%|██████████| 100/100 [22:51<00:00,  9.51s/it, best loss: 0.9189566365020895]
{'biased': 1, 'lr_all': 0.004569368257987339, 'n_factors': 16.713194690258256, 'reg_all': 0.15637897307701845}
({'biased': 1, 'lr_all': 0.004569368257987339, 'n_factors': 16, 'reg_all': 0.15637897307701845}, {'status': 'ok', 'loss': 0.9190103130206801, 'metrics': {'rmse': 0.9190103130206801, 'rmse_stddev': 0.0021517852869819645, 'mae': 0.7216826137614722, 'mae_stddev': 0.0016951517077698275, 'ndcg@5': 0.2208674160927429, 'ndcg@5_stddev': 0.03219581929377904, 'ndcg@20': 0.09247275866053808, 'ndcg@20_stddev': 0.006298602174826437, 'ndcg@50': 0.07396060069331893, 'ndcg@50_stddev': 0.00360669159304824}})


In [9]:
# Hybrid ALS
space = {
    'base_rank': hp.loguniform('base_rank', 2.305, 5.2985),
    'user_reg_loss': hp.loguniform('user_reg_loss', -8, 0),
    'damping_factor': hp.quniform('damping_factor', 0, 50, 1)
}

def test(calculate_ndcg, kwargs):
    kwargs['base_rank'] = int(kwargs['base_rank'])
    kwargs['item_reg_loss'] = kwargs['user_reg_loss']
    result = init_status()
    
    for (train, test) in my_sets:
        np.random.seed = 0
        random.seed = 0
        algo = ContentALS(np.max(ratings['uid'])+1, np.max(items['iid'])+1,
                          np.zeros(shape=(np.max(ratings['uid'])+1, 1)),
                          items.drop(['iid', 'name'], axis=1),
                          **kwargs)
        algo.init_ratings(train)
        algo.train(iterations=20)
        predictions = algo.predict(test)
        
        passed_algo = algo if calculate_ndcg else None
        result = update_status(result, predictions, passed_algo)
        
    return merge_splits_status(result)

content_als_trials = Trials()
best = fmin(lambda kwargs: test(False, kwargs),
    space=space,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=content_als_trials)

print(best)
final_score['CALS'] = (best, test(True, best))
print(final_score['CALS'])

100%|██████████| 100/100 [1:25:48<00:00, 84.27s/it, best loss: 0.9191915215706021]
{'base_rank': 19.49820968878596, 'damping_factor': 6.0, 'user_reg_loss': 0.36849146706104613}
({'base_rank': 19, 'damping_factor': 6.0, 'user_reg_loss': 0.36849146706104613, 'item_reg_loss': 0.36849146706104613}, {'status': 'ok', 'loss': 0.9191913025427765, 'metrics': {'rmse': 0.9191913025427765, 'rmse_stddev': 0.002267470421852822, 'mae': 0.7216539375822536, 'mae_stddev': 0.0017359181481073833, 'ndcg@5': 0.21237048889267923, 'ndcg@5_stddev': 0.016642462223561968, 'ndcg@20': 0.09534493196479071, 'ndcg@20_stddev': 0.00461349187225295, 'ndcg@50': 0.07657091494715271, 'ndcg@50_stddev': 0.0038649262232457503}})


In [10]:
# ALS
space = {
    'base_rank': hp.loguniform('base_rank', 2.305, 5.2985),
    'user_reg_loss': hp.loguniform('user_reg_loss', -8, 0),
    'damping_factor': hp.quniform('damping_factor', 0, 50, 1)
}

def test(calculate_ndcg, kwargs):
    kwargs['base_rank'] = int(kwargs['base_rank'])
    kwargs['item_reg_loss'] = kwargs['user_reg_loss']
    result = init_status()
    for (train, test) in my_sets:
        np.random.seed = 0
        random.seed = 0
        algo = ContentALS(np.max(ratings['uid'])+1, np.max(ratings['iid'])+1,
                          np.zeros(shape=(np.max(ratings['uid'])+1, 1)),
                          np.zeros(shape=(np.max(ratings['iid'])+1, 1)),
                          **kwargs)
        algo.init_ratings(train)
        algo.train(iterations=20)
        predictions = algo.predict(test)
        
        passed_algo = algo if calculate_ndcg else None
        result = update_status(result, predictions, passed_algo)
        
    return merge_splits_status(result)

als_trials = Trials()
best = fmin(lambda kwargs: test(False, kwargs),
    space=space,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=als_trials)

print(best)
final_score['ALS'] = (best, test(True, best))
print(final_score['ALS'])

100%|██████████| 100/100 [2:32:45<00:00, 112.57s/it, best loss: 0.9193028231021314] 
{'base_rank': 89.61819870901566, 'damping_factor': 6.0, 'user_reg_loss': 0.7640693899705812}
({'base_rank': 89, 'damping_factor': 6.0, 'user_reg_loss': 0.7640693899705812, 'item_reg_loss': 0.7640693899705812}, {'status': 'ok', 'loss': 0.9193028231021314, 'metrics': {'rmse': 0.9193028231021314, 'rmse_stddev': 0.0022817667949657705, 'mae': 0.7218528124268692, 'mae_stddev': 0.0017377559536737842, 'ndcg@5': 0.21430857363242373, 'ndcg@5_stddev': 0.01717847739069103, 'ndcg@20': 0.09634636901937876, 'ndcg@20_stddev': 0.005469531451883807, 'ndcg@50': 0.07653054923693942, 'ndcg@50_stddev': 0.003994800506673354}})


In [12]:
# KNN
space = {
    'k': hp.uniform('k', 3, 100),
    'sim_options': {
        'name': hp.choice('name', ['cosine', 'msd', 'pearson']),
        'user_based': hp.choice('user_based', [False, True]),
    }
}

def test(calculate_ndcg, kwargs):
    kwargs['k'] = int(kwargs['k'])
    result = init_status()
    
    for (trainset, testset) in surprise_sets:
        np.random.seed = 0
        random.seed = 0
        algo = KNNWithZScore(verbose=False, **kwargs)
        algo.fit(trainset)
        predictions = algo.test(testset, verbose=False)
        predictions = pd.DataFrame(predictions, columns=['uid', 'iid', 'rating', 'prediction', 'meta'])
        
        passed_algo = algo if calculate_ndcg else None
        result = update_status(result, predictions, passed_algo)
        
    return merge_splits_status(result)

knn_trials = Trials()
best = fmin(lambda kwargs: test(False, kwargs),
    space=space,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=knn_trials)

print(best)
final_score['KNN'] = (best, test(True, best))
print(final_score['KNN'])

100%|██████████| 100/100 [41:26<00:00, 20.53s/it, best loss: 0.929406375585494]
{'k': 99.02815802135144, 'name': 0, 'user_based': 0}
({'k': 99, 'name': 0, 'user_based': 0}, {'status': 'ok', 'loss': 0.9399767223640665, 'metrics': {'rmse': 0.9399767223640665, 'rmse_stddev': 0.0017070408561016626, 'mae': 0.7317590866827336, 'mae_stddev': 0.0014446018019666602, 'ndcg@5': 0.1974637325946109, 'ndcg@5_stddev': 0.007967492674335204, 'ndcg@20': 0.07932912841304458, 'ndcg@20_stddev': 0.0017906196419746293, 'ndcg@50': 0.06300582666859211, 'ndcg@50_stddev': 0.0013688701175645459}})


In [13]:
print('RESULTS')
for file_name, obj in zip(['YELP_SVD.log', 'YELP_CALS.log', 'YELP_ALS.log', 'YELP_KNN.log'],
                          [svd_trials, content_als_trials, als_trials, knn_trials]):
    with open(file_name, 'w') as file:
       file.write(str(obj.trials))
print(final_score)

RESULTS
{'SVD': ({'biased': 1, 'lr_all': 0.004569368257987339, 'n_factors': 16, 'reg_all': 0.15637897307701845}, {'status': 'ok', 'loss': 0.9190103130206801, 'metrics': {'rmse': 0.9190103130206801, 'rmse_stddev': 0.0021517852869819645, 'mae': 0.7216826137614722, 'mae_stddev': 0.0016951517077698275, 'ndcg@5': 0.2208674160927429, 'ndcg@5_stddev': 0.03219581929377904, 'ndcg@20': 0.09247275866053808, 'ndcg@20_stddev': 0.006298602174826437, 'ndcg@50': 0.07396060069331893, 'ndcg@50_stddev': 0.00360669159304824}}), 'CALS': ({'base_rank': 19, 'damping_factor': 6.0, 'user_reg_loss': 0.36849146706104613, 'item_reg_loss': 0.36849146706104613}, {'status': 'ok', 'loss': 0.9191913025427765, 'metrics': {'rmse': 0.9191913025427765, 'rmse_stddev': 0.002267470421852822, 'mae': 0.7216539375822536, 'mae_stddev': 0.0017359181481073833, 'ndcg@5': 0.21237048889267923, 'ndcg@5_stddev': 0.016642462223561968, 'ndcg@20': 0.09534493196479071, 'ndcg@20_stddev': 0.00461349187225295, 'ndcg@50': 0.07657091494715271, 

# Content influence

In [14]:
import copy

kwargs = copy.deepcopy(final_score['CALS'][0])
kwargs['base_rank'] = int(kwargs['base_rank'])
kwargs['damping_factor'] = int(kwargs['damping_factor'])
kwargs['item_reg_loss'] = kwargs['user_reg_loss']

result = init_status()

for (train, test) in my_sets:
    np.random.seed = 0
    random.seed = 0
    als = ContentALS(np.max(ratings['uid'])+1, np.max(items['iid'])+1,
                     np.zeros(shape=(np.max(ratings['uid'])+1, 1)), items.drop(['iid', 'name'], axis=1),
                     **kwargs)
    als.init_ratings(train)
    als.train(iterations=20)
    predictions = als.predict(test)
    
    result = update_status(result, predictions)
    
merge_splits_status(result)
result

{'status': 'ok',
 'loss': 0.9191915978699495,
 'metrics': {'rmse': 0.9191915978699495,
  'rmse_stddev': 0.0022674546119526707,
  'mae': 0.721654049730885,
  'mae_stddev': 0.0017358592076142472,
  'ndcg@5': [],
  'ndcg@5_stddev': 0,
  'ndcg@20': [],
  'ndcg@20_stddev': 0,
  'ndcg@50': [],
  'ndcg@50_stddev': 0}}

In [15]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
output_notebook()

pd.options.mode.chained_assignment = None

user_contrib = np.abs(als._user_factors[:, :als._user_profile_rank].dot(als._item_factors[:, :als._user_profile_rank].T))
item_contrib = np.abs(als._user_factors[:, als._item_rank_offset:].dot(als._item_factors[:, als._item_rank_offset:].T))
latent_contrib = np.abs(als._user_factors[:, als._user_profile_rank:als._item_rank_offset].dot(als._item_factors[:, als._user_profile_rank:als._item_rank_offset].T))
user_contrib = user_contrib[test['uid'], test['iid']]
item_contrib = item_contrib[test['uid'], test['iid']]
latent_contrib = latent_contrib[test['uid'], test['iid']]
sum_contrib = user_contrib + item_contrib + latent_contrib
sum_contrib[sum_contrib == 0] = 1
user_contrib /= sum_contrib
item_contrib /= sum_contrib
latent_contrib /= sum_contrib

hist_profile, edges_profile = np.histogram(user_contrib + item_contrib,
                                           bins=np.arange(101, dtype=np.float) / 100, density=True)
mean_profile = 1+np.argmax(np.cumsum(hist_profile)>50)
hist_latent, edges_latent = np.histogram(latent_contrib,
                                         bins=np.arange(101, dtype=np.float) / 100, density=True)
mean_latent = 1+np.argmax(np.cumsum(hist_latent)>50)
p = figure(tools='save', background_fill_color="#fafafa") #title='Wkład cech do oceny końcowej', 
p.quad(top=hist_profile, bottom=0, left=(100*edges_profile[:-1]), right=(100*edges_profile[1:]),
       fill_color="navy", line_color="white", alpha=0.3, legend='cechy jawne')
# p.quad(top=hist_latent, bottom=0, left=(100*edges_latent[:-1]), right=(100*edges_latent[1:]),
#        fill_color="sienna", line_color="white", alpha=0.3, legend='cechy ukryte')
p.line([mean_profile, mean_profile], [0, np.max(hist_profile)+0.35], line_color="navy", line_dash='dashed')
# p.line([mean_latent, mean_latent], [0, np.max(hist_latent)+0.35], line_color="sienna", line_dash='dashed')
p.xaxis.axis_label = 'Wkład [%]'
p.yaxis.axis_label = 'Liczba ocen [%]'
show(p)

# Cold start

In [26]:
kwargs = copy.deepcopy(final_score['CALS'][0])
kwargs['base_rank'] = int(kwargs['base_rank'])
kwargs['damping_factor'] = int(kwargs['damping_factor'])
kwargs['item_reg_loss'] = kwargs['user_reg_loss']

result_users = init_status()

for (train, test) in my_sets:
    np.random.seed = 0
    random.seed = 0
    als = ContentALS(np.max(ratings['uid'])+1, np.max(items['iid'])+1,
                            np.zeros(shape=(np.max(ratings['uid'])+1, 1)),
                            items.drop(['iid', 'name'], axis=1),
                            **kwargs)
    als.init_ratings(train)
    als.train(iterations=20)
    als._user_factors[:, als._user_profile_rank:als._item_rank_offset] = 0
    als._item_factors[:, als._user_profile_rank:als._item_rank_offset] = 0
    
    als_users = copy.deepcopy(als)
    als._user_factors[:, :als._user_profile_rank] = 0
    als._biases['user_bias']['user_bias'] = 0
    
    predictions = als_users.predict(test)
    result_users = update_status(result_users, predictions, als_users)
    
merge_splits_status(result_users)
print(result_users)

{'status': 'ok', 'loss': 0.9191906374322383, 'metrics': {'rmse': 0.9191906374322383, 'rmse_stddev': 0.0022670872018951955, 'mae': 0.7216533289425453, 'mae_stddev': 0.001735459166925107, 'ndcg@5': 0.21237048889267923, 'ndcg@5_stddev': 0.016642462223561968, 'ndcg@20': 0.09534493196479071, 'ndcg@20_stddev': 0.00461349187225295, 'ndcg@50': 0.07657107004129486, 'ndcg@50_stddev': 0.0038649450653699805}}


# Content effect

In [20]:
space = {
    'user_reg_loss': hp.loguniform('user_reg_loss', -8, 0),
    'damping_factor': hp.quniform('damping_factor', 0, 50, 1)
}

def test(calculate_ndcg, kwargs):
    kwargs['base_rank'] = 0
    kwargs['item_reg_loss'] = kwargs['user_reg_loss']
    result = init_status()
    
    for (train, test) in my_sets:
        np.random.seed = 0
        random.seed = 0
        algo = ContentALS(np.max(ratings['uid'])+1, np.max(items['iid'])+1,
                          np.zeros(shape=(np.max(ratings['uid'])+1, 1)),
                          items.drop(['iid', 'name'], axis=1),
                          **kwargs)
        algo.init_ratings(train)
        algo.train(iterations=20)
        predictions = algo.predict(test)
        
        passed_algo = algo if calculate_ndcg else None
        result = update_status(result, predictions, passed_algo)
        
    return merge_splits_status(result)

content_als_trials = Trials()
best = fmin(lambda kwargs: test(False, kwargs),
    space=space,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=content_als_trials)

print(best, test(True, best))

100%|██████████| 100/100 [23:01<00:00, 12.50s/it, best loss: 0.9191911150378733]
{'damping_factor': 6.0, 'user_reg_loss': 0.3592481549304871, 'base_rank': 0, 'item_reg_loss': 0.3592481549304871} {'status': 'ok', 'loss': 0.9191911150378733, 'metrics': {'rmse': 0.9191911150378733, 'rmse_stddev': 0.0022666638562051588, 'mae': 0.7216509204800783, 'mae_stddev': 0.0017353607716984944, 'ndcg@5': 0.21230354210844612, 'ndcg@5_stddev': 0.0166147780249999, 'ndcg@20': 0.09524647937991276, 'ndcg@20_stddev': 0.004692620400732514, 'ndcg@50': 0.07659643599143057, 'ndcg@50_stddev': 0.0039007814265995563}}


In [25]:
space = {
    'damping_factor': hp.quniform('damping_factor', 0, 50, 1)
}

def test(calculate_ndcg, kwargs):
    kwargs['base_rank'] = 0
    kwargs['item_reg_loss'] = 0
    kwargs['user_reg_loss'] = 0
    result = init_status()
    
    for (train, test) in my_sets:
        np.random.seed = 0
        random.seed = 0
        algo = ContentALS(np.max(ratings['uid'])+1, np.max(ratings['iid'])+1,
                          np.zeros(shape=(np.max(ratings['uid'])+1, 1)),
                          np.zeros(shape=(np.max(ratings['iid'])+1, 1)),
                          **kwargs)
        algo.init_ratings(train)
        predictions = algo.predict(test)
        
        passed_algo = algo if calculate_ndcg else None
        result = update_status(result, predictions, passed_algo)
        
    return merge_splits_status(result)

content_als_trials = Trials()
best = fmin(lambda kwargs: test(False, kwargs),
    space=space,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=content_als_trials)

print(best, test(True, best))

100%|██████████| 100/100 [09:05<00:00,  5.39s/it, best loss: 0.9218624840545615]
{'damping_factor': 13.0, 'base_rank': 0, 'item_reg_loss': 0, 'user_reg_loss': 0} {'status': 'ok', 'loss': 0.9218624840545615, 'metrics': {'rmse': 0.9218624840545615, 'rmse_stddev': 0.0022722825343410706, 'mae': 0.7266301509231712, 'mae_stddev': 0.0017443899890558113, 'ndcg@5': 0.22127677116177497, 'ndcg@5_stddev': 0.01151271664240359, 'ndcg@20': 0.10453150180507916, 'ndcg@20_stddev': 0.002859544140134871, 'ndcg@50': 0.0832267622964862, 'ndcg@50_stddev': 0.001413162947839539}}
