# Init

In [1]:
import random

import numpy as np
import pandas as pd

from surprise import Dataset, Reader, KNNWithZScore, SVD
from surprise.model_selection import KFold, cross_validate

from hyperopt import hp, tpe, fmin, Trials, STATUS_OK

from ContentALS import ContentALS
from Metrics import mae, rmse, ndcg, full_ndcg

In [2]:
final_score = {}

In [3]:
ITEMS_NUM = 1682 # used for monkey patching of missing functionality in surprise library

def predict_for_user(self, user):
    global ITEMS_NUM
    return np.array([self.predict(user, item).est for item in range(ITEMS_NUM)])
SVD.predict_for_user = predict_for_user
KNNWithZScore.predict_for_user = predict_for_user

In [4]:
DIVIDER = 2

users = pd.read_csv('dataset/MovieLens/users.csv')
items = pd.read_csv('dataset/MovieLens/items.csv')
ratings = pd.read_csv('dataset/MovieLens/ratings.csv')

users = pd.DataFrame(np.arange(np.max(ratings['uid'])+1), columns=['uid']).merge(users, on='uid', how='left').fillna(0)
items = pd.DataFrame(np.arange(np.max(ratings['iid'])+1), columns=['iid']).merge(items, on='iid', how='left').fillna(0)

for col in users.columns:
    if col != 'uid':
        users[col] /= DIVIDER
for col in items.columns:
    if (col != 'iid') and (col != 'title'):
        items[col] /= DIVIDER

In [5]:
SPLITS = 4 # number of k-fold splits
MAX_EVALS = 100 # number of trials in hyperparam search

np.random.seed = 0
random.seed = 0
data = Dataset.load_from_df(ratings[['uid', 'iid', 'rating']], Reader(rating_scale=[1, 5]))
kf = KFold(n_splits=SPLITS)

surprise_sets = [(trainset, testset) for trainset, testset in kf.split(data)]
my_sets = [(pd.DataFrame(trainset.build_testset(), columns=['uid', 'iid', 'rating']),
            pd.DataFrame(testset, columns=['uid', 'iid', 'rating'])) for (trainset, testset) in surprise_sets]

In [6]:
def init_status():
    empty_status = {
        'status': STATUS_OK,
        'loss': 0,
        'metrics': {
            'rmse': [],
            'rmse_stddev': 0,
            'mae': [],
            'mae_stddev': 0,
            'ndcg@5': [],
            'ndcg@5_stddev': 0,
            'ndcg@20': [],
            'ndcg@20_stddev': 0,
            'ndcg@50': [],
            'ndcg@50_stddev': 0,
        }
    }
    return empty_status

def update_status(status, predictions, algo=None):
    val = rmse(predictions)
    status['metrics']['rmse'].append(val)
        
    val = mae(predictions)
    status['metrics']['mae'].append(val)
    
    if algo is not None:
        val = full_ndcg(algo, predictions, 5)
        status['metrics']['ndcg@5'].append(val)

        val = full_ndcg(algo, predictions, 20)
        status['metrics']['ndcg@20'].append(val)

        val = full_ndcg(algo, predictions, 50)
        status['metrics']['ndcg@50'].append(val)
    
    return status

def merge_splits_status(status):
    keys = ['rmse', 'mae']
    if len(status['metrics']['ndcg@5']) > 0:
        keys.extend(['ndcg@5', 'ndcg@20', 'ndcg@50'])
    for key in keys:
        status['metrics'][f'{key}_stddev'] = np.std(status['metrics'][key])
        status['metrics'][key] = np.mean(status['metrics'][key])
    status['loss'] = status['metrics']['rmse']
    
    return status

# Verify - warm start

In [7]:
# SVD
space = {
    'n_factors': hp.loguniform('n_factors', 2.305, 5.2985),
    'biased': hp.choice('biased', [False, True]),
    'lr_all': hp.loguniform('lr_all', -8, 0),
    'reg_all': hp.loguniform('reg_all', -8, 0)
}

def test(calculate_ndcg, kwargs):
    kwargs['n_factors'] = int(kwargs['n_factors'])
    result = init_status()
    
    for (trainset, testset) in surprise_sets:
        np.random.seed = 0
        random.seed = 0
        algo = SVD(**kwargs)
        algo.fit(trainset)
        predictions = algo.test(testset, verbose=False)
        predictions = pd.DataFrame(predictions, columns=['uid', 'iid', 'rating', 'prediction', 'meta'])
        
        passed_algo = algo if calculate_ndcg else None
        result = update_status(result, predictions, passed_algo)
        
    return merge_splits_status(result)

svd_trials = Trials()
best = fmin((lambda kwargs: test(False, kwargs)),
    space=space,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=svd_trials)

print(best)
final_score['SVD'] = (best, test(True, best))

100%|██████████| 100/100 [29:36<00:00, 15.71s/it, best loss: 0.9168524818675413]
{'biased': 1, 'lr_all': 0.02515576563691828, 'n_factors': 135.46837354077684, 'reg_all': 0.10560643303267994}


In [8]:
print(final_score['SVD'])

({'biased': 1, 'lr_all': 0.02515576563691828, 'n_factors': 135, 'reg_all': 0.10560643303267994}, {'status': 'ok', 'loss': 0.9168691935663278, 'metrics': {'rmse': 0.9168691935663278, 'rmse_stddev': 0.0023247399887409725, 'mae': 0.7251159270062572, 'mae_stddev': 0.0018694480112810775, 'ndcg@5': 0.207683709874842, 'ndcg@5_stddev': 0.006426165974539699, 'ndcg@20': 0.10073017459281026, 'ndcg@20_stddev': 0.0030286958052683248, 'ndcg@50': 0.10040038343764954, 'ndcg@50_stddev': 0.002935312220903598}})


In [9]:
# Hybrid ALS
space = {
    'base_rank': hp.loguniform('base_rank', 2.305, 5.2985),
    'user_reg_loss': hp.loguniform('user_reg_loss', -8, 0),
    'damping_factor': hp.quniform('damping_factor', 0, 50, 1)
}

def test(calculate_ndcg, kwargs):
    kwargs['base_rank'] = int(kwargs['base_rank'])
    kwargs['item_reg_loss'] = kwargs['user_reg_loss']
    result = init_status()
    
    for (train, test) in my_sets:
        np.random.seed = 0
        random.seed = 0
        algo = ContentALS(np.max(ratings['uid'])+1, np.max(ratings['iid'])+1,
                          users.drop(['uid'], axis=1), items.drop(['iid', 'title'], axis=1),
                          **kwargs)
        algo.init_ratings(train)
        algo.train(iterations=20)
        predictions = algo.predict(test)
        
        passed_algo = algo if calculate_ndcg else None
        result = update_status(result, predictions, passed_algo)
        
    return merge_splits_status(result)

content_als_trials = Trials()
best = fmin(lambda kwargs: test(False, kwargs),
    space=space,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=content_als_trials)

print(best)
final_score['CALS'] = (best, test(True, best))
print(final_score['CALS'])

100%|██████████| 100/100 [1:55:47<00:00, 127.74s/it, best loss: 0.912754031836396] 
{'base_rank': 72.36815360690302, 'damping_factor': 15.0, 'user_reg_loss': 0.114049353390756}
({'base_rank': 72, 'damping_factor': 15.0, 'user_reg_loss': 0.114049353390756, 'item_reg_loss': 0.114049353390756}, {'status': 'ok', 'loss': 0.9127349107137441, 'metrics': {'rmse': 0.9127349107137441, 'rmse_stddev': 0.0019127406352909759, 'mae': 0.7186634798348154, 'mae_stddev': 0.0012489786695408455, 'ndcg@5': 0.23881282031659515, 'ndcg@5_stddev': 0.0015637175636543878, 'ndcg@20': 0.11748690878497557, 'ndcg@20_stddev': 0.0024356997083716555, 'ndcg@50': 0.11949456870506285, 'ndcg@50_stddev': 0.0025809745656272666}})


In [10]:
# ALS
space = {
    'base_rank': hp.loguniform('base_rank', 2.305, 5.2985),
    'user_reg_loss': hp.loguniform('user_reg_loss', -8, 0),
    'damping_factor': hp.quniform('damping_factor', 0, 50, 1)
}

def test(calculate_ndcg, kwargs):
    kwargs['base_rank'] = int(kwargs['base_rank'])
    kwargs['item_reg_loss'] = kwargs['user_reg_loss']
    result = init_status()
    for (train, test) in my_sets:
        np.random.seed = 0
        random.seed = 0
        algo = ContentALS(np.max(ratings['uid'])+1, np.max(ratings['iid'])+1,
                          np.zeros(shape=(np.max(ratings['uid'])+1, 1)),
                          np.zeros(shape=(np.max(ratings['iid'])+1, 1)),
                          **kwargs)
        algo.init_ratings(train)
        algo.train(iterations=20)
        predictions = algo.predict(test)
        
        passed_algo = algo if calculate_ndcg else None
        result = update_status(result, predictions, passed_algo)
        
    return merge_splits_status(result)

als_trials = Trials()
best = fmin(lambda kwargs: test(False, kwargs),
    space=space,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=als_trials)

print(best)
final_score['ALS'] = (best, test(True, best))
print(final_score['CALS'])

100%|██████████| 100/100 [1:45:52<00:00, 60.40s/it, best loss: 0.9185805816766568]
{'base_rank': 147.6201707455593, 'damping_factor': 5.0, 'user_reg_loss': 0.13934153529184926}
({'base_rank': 72, 'damping_factor': 15.0, 'user_reg_loss': 0.114049353390756, 'item_reg_loss': 0.114049353390756}, {'status': 'ok', 'loss': 0.9127349107137441, 'metrics': {'rmse': 0.9127349107137441, 'rmse_stddev': 0.0019127406352909759, 'mae': 0.7186634798348154, 'mae_stddev': 0.0012489786695408455, 'ndcg@5': 0.23881282031659515, 'ndcg@5_stddev': 0.0015637175636543878, 'ndcg@20': 0.11748690878497557, 'ndcg@20_stddev': 0.0024356997083716555, 'ndcg@50': 0.11949456870506285, 'ndcg@50_stddev': 0.0025809745656272666}})


In [17]:
print(final_score['ALS'])

({'base_rank': 147, 'damping_factor': 5.0, 'user_reg_loss': 0.13934153529184926, 'item_reg_loss': 0.13934153529184926}, {'status': 'ok', 'loss': 0.918594114940745, 'metrics': {'rmse': 0.918594114940745, 'rmse_stddev': 0.0021818165567046607, 'mae': 0.723432749949058, 'mae_stddev': 0.0016439519024585186, 'ndcg@5': 0.23138278026296008, 'ndcg@5_stddev': 0.010159673059372806, 'ndcg@20': 0.11894260457469985, 'ndcg@20_stddev': 0.004169598267347314, 'ndcg@50': 0.12265773492168836, 'ndcg@50_stddev': 0.002991235201291466}})


In [11]:
#KNN
space = {
    'k': hp.uniform('k', 3, 100),
    'sim_options': {
        'name': hp.choice('name', ['cosine', 'msd', 'pearson']),
        'user_based': hp.choice('user_based', [False, True]),
    }
}

def test(calculate_ndcg, kwargs):
    kwargs['k'] = int(kwargs['k'])
    result = init_status()
    
    for (trainset, testset) in surprise_sets:
        np.random.seed = 0
        random.seed = 0
        algo = KNNWithZScore(verbose=False, **kwargs)
        algo.fit(trainset)
        predictions = algo.test(testset, verbose=False)
        predictions = pd.DataFrame(predictions, columns=['uid', 'iid', 'rating', 'prediction', 'meta'])
        
        passed_algo = algo if calculate_ndcg else None
        result = update_status(result, predictions, passed_algo)
        
    return merge_splits_status(result)

knn_trials = Trials()
best = fmin(lambda kwargs: test(False, kwargs),
    space=space,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=knn_trials)

print(best)
final_score['KNN'] = (best, test(True, best))
print(final_score['KNN'])

100%|██████████| 100/100 [1:16:14<00:00, 49.33s/it, best loss: 0.9379937464928043]
{'k': 61.58978822457972, 'name': 1, 'user_based': 0}
({'k': 61, 'name': 1, 'user_based': 0}, {'status': 'ok', 'loss': 0.9519585101123711, 'metrics': {'rmse': 0.9519585101123711, 'rmse_stddev': 0.003038343432208273, 'mae': 0.7464922061646473, 'mae_stddev': 0.0028835842128253015, 'ndcg@5': 0.18233004488377924, 'ndcg@5_stddev': 0.05486633199301101, 'ndcg@20': 0.05136111506194807, 'ndcg@20_stddev': 0.0055638751615108084, 'ndcg@50': 0.044446893118780174, 'ndcg@50_stddev': 0.0028825600245897223}})


In [12]:
print('RESULTS')
for file_name, obj in zip(['ML_SVD.log', 'ML_CALS.log', 'ML_ALS.log', 'ML_KNN.log'],
                          [svd_trials, content_als_trials, als_trials, knn_trials]):
    with open(file_name, 'w') as file:
       file.write(str(obj.trials))
print(final_score)

RESULTS
{'SVD': ({'biased': 1, 'lr_all': 0.02515576563691828, 'n_factors': 135, 'reg_all': 0.10560643303267994}, {'status': 'ok', 'loss': 0.9168691935663278, 'metrics': {'rmse': 0.9168691935663278, 'rmse_stddev': 0.0023247399887409725, 'mae': 0.7251159270062572, 'mae_stddev': 0.0018694480112810775, 'ndcg@5': 0.207683709874842, 'ndcg@5_stddev': 0.006426165974539699, 'ndcg@20': 0.10073017459281026, 'ndcg@20_stddev': 0.0030286958052683248, 'ndcg@50': 0.10040038343764954, 'ndcg@50_stddev': 0.002935312220903598}}), 'CALS': ({'base_rank': 72, 'damping_factor': 15.0, 'user_reg_loss': 0.114049353390756, 'item_reg_loss': 0.114049353390756}, {'status': 'ok', 'loss': 0.9127349107137441, 'metrics': {'rmse': 0.9127349107137441, 'rmse_stddev': 0.0019127406352909759, 'mae': 0.7186634798348154, 'mae_stddev': 0.0012489786695408455, 'ndcg@5': 0.23881282031659515, 'ndcg@5_stddev': 0.0015637175636543878, 'ndcg@20': 0.11748690878497557, 'ndcg@20_stddev': 0.0024356997083716555, 'ndcg@50': 0.1194945687050628

# Content influence

In [13]:
import copy

kwargs = copy.deepcopy(final_score['CALS'][0])
kwargs['base_rank'] = int(kwargs['base_rank'])
kwargs['damping_factor'] = int(kwargs['damping_factor'])
kwargs['item_reg_loss'] = kwargs['user_reg_loss']

result = init_status()

for (train, test) in my_sets:
    np.random.seed = 0
    random.seed = 0
    als = ContentALS(np.max(ratings['uid'])+1, np.max(ratings['iid'])+1,
                     users.drop(['uid'], axis=1), items.drop(['iid', 'title'], axis=1),
                     **kwargs)
    als.init_ratings(train)
    als.train(iterations=20)
    predictions = als.predict(test)
    
    result = update_status(result, predictions, als)
    
merge_splits_status(result)
result

{'status': 'ok',
 'loss': 0.9126918501241377,
 'metrics': {'rmse': 0.9126918501241377,
  'rmse_stddev': 0.001980819537125757,
  'mae': 0.7186222086493137,
  'mae_stddev': 0.0013214173495108062,
  'ndcg@5': 0.2410253820263351,
  'ndcg@5_stddev': 0.0005473857176418391,
  'ndcg@20': 0.11768651215343327,
  'ndcg@20_stddev': 0.00207937407252669,
  'ndcg@50': 0.12006877050134886,
  'ndcg@50_stddev': 0.0022340226545179797}}

In [14]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
output_notebook()

pd.options.mode.chained_assignment = None

user_contrib = np.abs(als._user_factors[:, :als._user_profile_rank].dot(als._item_factors[:, :als._user_profile_rank].T))
item_contrib = np.abs(als._user_factors[:, als._item_rank_offset:].dot(als._item_factors[:, als._item_rank_offset:].T))
latent_contrib = np.abs(als._user_factors[:, als._user_profile_rank:als._item_rank_offset].dot(als._item_factors[:, als._user_profile_rank:als._item_rank_offset].T))
user_contrib = user_contrib[test['uid'], test['iid']]
item_contrib = item_contrib[test['uid'], test['iid']]
latent_contrib = latent_contrib[test['uid'], test['iid']]
sum_contrib = user_contrib + item_contrib + latent_contrib
sum_contrib[sum_contrib == 0] = 1
user_contrib /= sum_contrib
item_contrib /= sum_contrib
latent_contrib /= sum_contrib

hist_profile, edges_profile = np.histogram(user_contrib + item_contrib,
                                           bins=np.arange(101, dtype=np.float) / 100, density=True)
mean_profile = 1+np.argmax(np.cumsum(hist_profile)>50)
hist_latent, edges_latent = np.histogram(latent_contrib,
                                         bins=np.arange(101, dtype=np.float) / 100, density=True)
mean_latent = 1+np.argmax(np.cumsum(hist_latent)>50)
p = figure(tools='save', background_fill_color="#fafafa") #title='Wkład cech do oceny końcowej', 
p.quad(top=hist_profile, bottom=0, left=(100*edges_profile[:-1]), right=(100*edges_profile[1:]),
       fill_color="navy", line_color="white", alpha=0.3, legend='cechy jawne')
# p.quad(top=hist_latent, bottom=0, left=(100*edges_latent[:-1]), right=(100*edges_latent[1:]),
#        fill_color="sienna", line_color="white", alpha=0.3, legend='cechy ukryte')
p.line([mean_profile, mean_profile], [0, np.max(hist_profile)+0.35], line_color="navy", line_dash='dashed')
# p.line([mean_latent, mean_latent], [0, np.max(hist_latent)+0.35], line_color="sienna", line_dash='dashed')
p.xaxis.axis_label = 'Wkład [%]'
p.yaxis.axis_label = 'Liczba ocen [%]'
show(p)

# Learning curves

In [15]:
result_train = init_status()
result_test = init_status()
np.random.seed = 0
random.seed = 0

train, test = my_sets[3]
als = ContentALS(np.max(ratings['uid'])+1, np.max(ratings['iid'])+1,
                 users.drop(['uid'], axis=1), items.drop(['iid', 'title'], axis=1),
                 **kwargs)
np.random.seed = 0
random.seed = 0
als.init_ratings(train)
predictions = als.predict(train)
result_train = update_status(result_train, predictions)
predictions = als.predict(test)
result_test = update_status(result_test, predictions)

for i in range(20):
    als.train(iterations=1)
    predictions = als.predict(train)
    result_train = update_status(result_train, predictions)
    predictions = als.predict(test)
    result_test = update_status(result_test, predictions)
    
result_test

{'status': 'ok',
 'loss': 0,
 'metrics': {'rmse': [0.9518960368048786,
   0.9373414496714081,
   0.9233962707329795,
   0.9175351036949183,
   0.9159164072459238,
   0.9153767117546763,
   0.9151920112651314,
   0.9151542145151,
   0.9151835536820802,
   0.9152419257725172,
   0.915309285792389,
   0.9153753103457517,
   0.9154352948366125,
   0.9154876767190367,
   0.915532494078055,
   0.9155704805304786,
   0.915602581262636,
   0.9156297239060704,
   0.9156527287956758,
   0.9156722876142961,
   0.915688971242776],
  'rmse_stddev': 0,
  'mae': [0.7568198089813387,
   0.7411604440792462,
   0.7286922667991684,
   0.7229894437343617,
   0.7212497465027936,
   0.7205756169743854,
   0.7202781615439138,
   0.7201554065625956,
   0.7201260766244951,
   0.7201469333656266,
   0.7201816688798899,
   0.7202172814467331,
   0.7202554872370335,
   0.7202918184585035,
   0.7203265341707913,
   0.7203582083971746,
   0.7203851596942119,
   0.720407570739081,
   0.7204265439643931,
   0.7204427

In [16]:
p = figure(tools='save', background_fill_color="#fafafa")
p.line(np.arange(0, 21), result_train['metrics']['rmse'], line_color="navy", legend='zbiór treningowy')
p.line(np.arange(0, 21), result_test['metrics']['rmse'], line_color="sienna", legend='zbiór testowy')
show(p)

# Cold start

In [17]:
kwargs = copy.deepcopy(final_score['CALS'][0])
kwargs['base_rank'] = int(kwargs['base_rank'])
kwargs['damping_factor'] = int(kwargs['damping_factor'])
kwargs['item_reg_loss'] = kwargs['user_reg_loss']

result_users = init_status()
result_items = init_status()

for (train, test) in my_sets:
    np.random.seed = 0
    random.seed = 0
    als = ContentALS(np.max(ratings['uid'])+1, np.max(ratings['iid'])+1,
                     users.drop(['uid'], axis=1), items.drop(['iid', 'title'], axis=1),
                     **kwargs)
    als.init_ratings(train)
    als.train(iterations=20)
    als._user_factors[:, als._user_profile_rank:als._item_rank_offset] = \
        np.mean(als._user_factors[:, als._user_profile_rank:als._item_rank_offset], axis=0)
    als._item_factors[:, als._user_profile_rank:als._item_rank_offset] = \
        np.mean(als._item_factors[:, als._user_profile_rank:als._item_rank_offset], axis=0)
    
    als_users = copy.deepcopy(als)
    als_users._user_factors[:, :als._user_profile_rank] = \
        np.mean(als_users._user_factors[:, :als._user_profile_rank], axis=0)
    als_users._biases['user_bias']['user_bias'] = np.mean(als_users._biases['user_bias']['user_bias'])
    als_items = copy.deepcopy(als)
    als_items._item_factors[:, als._item_rank_offset:] = \
        np.mean(als_items._item_factors[:, als._item_rank_offset:], axis=0)
    als_items._biases['item_bias']['item_bias'] =  np.mean(als_users._biases['item_bias']['item_bias'])
    
    predictions = als_users.predict(test)
    result_users = update_status(result_users, predictions, als_users)
    predictions = als_items.predict(test)
    result_items = update_status(result_items, predictions, als_items)
    
merge_splits_status(result_users)
print(result_users)
merge_splits_status(result_items)
print(result_items)

{'status': 'ok', 'loss': 1.1151431197352966, 'metrics': {'rmse': 1.1151431197352966, 'rmse_stddev': 0.005747000635524216, 'mae': 0.9323760691898236, 'mae_stddev': 0.004803430087969068, 'ndcg@5': 0.19231686368859935, 'ndcg@5_stddev': 0.012040248601762291, 'ndcg@20': 0.08595028031454414, 'ndcg@20_stddev': 0.0026984025425964907, 'ndcg@50': 0.077027738046539, 'ndcg@50_stddev': 0.0018138455046181042}}
{'status': 'ok', 'loss': 0.942115386973154, 'metrics': {'rmse': 0.942115386973154, 'rmse_stddev': 0.004232761860249462, 'mae': 0.7474318599428555, 'mae_stddev': 0.003695834845456425, 'ndcg@5': 0.22148630219990736, 'ndcg@5_stddev': 0.001376134807535666, 'ndcg@20': 0.12045544695781708, 'ndcg@20_stddev': 0.004762398657495114, 'ndcg@50': 0.12211680070906064, 'ndcg@50_stddev': 0.002555681523906038}}


In [21]:
kwargs = copy.deepcopy(final_score['CALS'][0])
kwargs['base_rank'] = 0
kwargs['damping_factor'] = int(kwargs['damping_factor'])
kwargs['item_reg_loss'] = kwargs['user_reg_loss']

result = init_status()

for (train, test) in my_sets:
    np.random.seed = 0
    random.seed = 0
    als = ContentALS(np.max(ratings['uid'])+1, np.max(ratings['iid'])+1,
                     users.drop(['uid'], axis=1), items.drop(['iid', 'title'], axis=1),
                     **kwargs)
    als.init_ratings(train)
    als.train(iterations=20)
    
    predictions = als.predict(test)
    result = update_status(result, predictions, als)
    
merge_splits_status(result)
print(result)

{'status': 'ok', 'loss': 0.9334829025649243, 'metrics': {'rmse': 0.9334829025649243, 'rmse_stddev': 0.0024074068241489526, 'mae': 0.7377831813275038, 'mae_stddev': 0.0016645007941600543, 'ndcg@5': 0.19596390680617023, 'ndcg@5_stddev': 0.01942532793609843, 'ndcg@20': 0.09525991248922566, 'ndcg@20_stddev': 0.006936727279684461, 'ndcg@50': 0.09767286049305175, 'ndcg@50_stddev': 0.004952961048811953}}


# Content effect

In [33]:
space = {
    'user_reg_loss': hp.loguniform('user_reg_loss', -8, 0),
    'damping_factor': hp.quniform('damping_factor', 0, 50, 1)
}

def test(calculate_ndcg, kwargs):
    kwargs['base_rank'] = 0
    kwargs['item_reg_loss'] = kwargs['user_reg_loss']
    result = init_status()
    
    for (train, test) in my_sets:
        np.random.seed = 0
        random.seed = 0
        algo = ContentALS(np.max(ratings['uid'])+1, np.max(ratings['iid'])+1,
                          users.drop(['uid'], axis=1), items.drop(['iid', 'title'], axis=1),
                          **kwargs)
        algo.init_ratings(train)
        algo.train(iterations=20)
        predictions = algo.predict(test)
        
        passed_algo = algo if calculate_ndcg else None
        result = update_status(result, predictions, passed_algo)
        
    return merge_splits_status(result)

content_als_trials = Trials()
best = fmin(lambda kwargs: test(False, kwargs),
    space=space,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=content_als_trials)

print(best, test(True, best))

100%|██████████| 100/100 [19:54<00:00, 11.02s/it, best loss: 0.9320658288535302]
{'damping_factor': 6.0, 'user_reg_loss': 0.13728519807608933, 'base_rank': 0, 'item_reg_loss': 0.13728519807608933} {'status': 'ok', 'loss': 0.9320658288535302, 'metrics': {'rmse': 0.9320658288535302, 'rmse_stddev': 0.0024125411788851053, 'mae': 0.7357284248094693, 'mae_stddev': 0.001795540438877661, 'ndcg@5': 0.1933729896184595, 'ndcg@5_stddev': 0.017185434905805465, 'ndcg@20': 0.09456264001354496, 'ndcg@20_stddev': 0.006500608705854403, 'ndcg@50': 0.09725701313074984, 'ndcg@50_stddev': 0.005036261030361663}}


In [34]:
space = {
    'damping_factor': hp.quniform('damping_factor', 0, 50, 1)
}

def test(calculate_ndcg, kwargs):
    kwargs['base_rank'] = 0
    kwargs['item_reg_loss'] = 0
    kwargs['user_reg_loss'] = 0
    result = init_status()
    
    for (train, test) in my_sets:
        np.random.seed = 0
        random.seed = 0
        algo = ContentALS(np.max(ratings['uid'])+1, np.max(ratings['iid'])+1,
                          np.zeros(shape=(np.max(ratings['uid'])+1, 1)),
                          np.zeros(shape=(np.max(ratings['iid'])+1, 1)),
                          **kwargs)
        algo.init_ratings(train)
        predictions = algo.predict(test)
        
        passed_algo = algo if calculate_ndcg else None
        result = update_status(result, predictions, passed_algo)
        
    return merge_splits_status(result)

content_als_trials = Trials()
best = fmin(lambda kwargs: test(False, kwargs),
    space=space,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=content_als_trials)

print(best, test(True, best))

100%|██████████| 100/100 [06:01<00:00,  3.47s/it, best loss: 0.9429696368103112]
{'damping_factor': 3.0, 'base_rank': 0, 'item_reg_loss': 0, 'user_reg_loss': 0} {'status': 'ok', 'loss': 0.9429696368103112, 'metrics': {'rmse': 0.9429696368103112, 'rmse_stddev': 0.0020307115282937483, 'mae': 0.7463042716855562, 'mae_stddev': 0.0015791069873123287, 'ndcg@5': 0.22389316109497248, 'ndcg@5_stddev': 0.027283675267365526, 'ndcg@20': 0.11139353680896474, 'ndcg@20_stddev': 0.004791772717003524, 'ndcg@50': 0.11542576692056394, 'ndcg@50_stddev': 0.0021872441753655815}}
