## Ejemplo de FunkSVD usando Suprise

Ejemplo de FunkSVD usando Surprise

by [Denis Parra](https://dparra.sitios.ing.puc.cl), PUC Chile 


In [2]:
# In case you need to install surprise lib
#!pip install numpy
#!pip install scikit-surprise

In [3]:
#from surprise import Dataset

import os
from surprise import BaselineOnly, Dataset, Reader
from surprise import SVD
from surprise.model_selection import cross_validate

# Not working dataset download from grouplens
# data = Dataset.load_builtin('ml-100k')

# path to dataset file
file_path = os.path.expanduser("./ml-100k/u.data")

# As we're loading a custom dataset, we need to define a reader. In the
# movielens-100k dataset, each line has the following format:
# 'user item rating timestamp', separated by '\t' characters.
reader = Reader(line_format="user item rating timestamp", sep="\t")

data = Dataset.load_from_file(file_path, reader=reader)

# Use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9314  0.9429  0.9361  0.9349  0.9378  0.9366  0.0038  
MAE (testset)     0.7337  0.7436  0.7370  0.7355  0.7391  0.7378  0.0034  
Fit time          0.88    1.26    1.00    0.87    0.82    0.97    0.16    
Test time         0.19    0.13    0.17    0.11    0.16    0.15    0.03    


{'test_rmse': array([0.93141509, 0.94290951, 0.9360651 , 0.93486451, 0.93777829]),
 'test_mae': array([0.73370185, 0.74360683, 0.73704959, 0.73549499, 0.73907764]),
 'fit_time': (0.8759799003601074,
  1.264150857925415,
  1.00242018699646,
  0.873894214630127,
  0.8215718269348145),
 'test_time': (0.1878058910369873,
  0.13430309295654297,
  0.17173504829406738,
  0.11315417289733887,
  0.1587841510772705)}

In [7]:
uid = str(196)  # raw user id (as in the ratings file). They are **strings**!
iid = str(302)  # raw item id (as in the ratings file). They are **strings**!

algo.predict(uid,iid)

Prediction(uid='196', iid='302', r_ui=None, est=4.0508332214303735, details={'was_impossible': False})

In [4]:
from surprise.model_selection import train_test_split

# split in train/test explicitly

trainset, testset = train_test_split(data, test_size=0.25)

# Create a new instance of FunkSVD
# Parametros por defecto:
# n_factors = 100
# n_epochs = 20
# lr_all = 0.005 (Learning rate para todos los parametros)
# reg_all = 0.02 (Término de regularización para todos los parametros)

n_epochs – The number of iteration of the SGD procedure. Default is 20.
algo2 = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo2.fit(trainset)

predictions = algo2.test(testset)

In [6]:
train_tuples = [x for x in trainset.all_ratings()]
train_tuples[:5]

[(0, 0, 3.0), (0, 170, 3.0), (0, 59, 3.0), (0, 520, 3.0), (0, 211, 4.0)]

In [10]:
test_users = [x[0] for x in testset]
test_users[:5]

['102', '334', '405', '932', '673']

In [220]:
from pprint import pprint
import inspect

data.raw_ratings[:5]

[('196', '242', 3.0, '881250949'),
 ('186', '302', 3.0, '891717742'),
 ('22', '377', 1.0, '878887116'),
 ('244', '51', 2.0, '880606923'),
 ('166', '346', 1.0, '886397596')]

In [89]:
# Lista de predicciones para todo el test set
predictions[:5]

[Prediction(uid='790', iid='274', r_ui=3.0, est=2.713160316395964, details={'was_impossible': False}),
 Prediction(uid='938', iid='9', r_ui=3.0, est=3.737252322644245, details={'was_impossible': False}),
 Prediction(uid='116', iid='888', r_ui=2.0, est=2.9243884123784136, details={'was_impossible': False}),
 Prediction(uid='498', iid='1426', r_ui=3.0, est=2.865124085503024, details={'was_impossible': False}),
 Prediction(uid='352', iid='82', r_ui=3.0, est=3.4066871920059305, details={'was_impossible': False})]

In [205]:
# Predicción de rating para un usuario e item en particular
algo2.predict('501','27')

Prediction(uid='501', iid='27', r_ui=None, est=3.19176014964953, details={'was_impossible': False})

In [106]:
# Select predictions only for 2 users: 128 and 321
user_items = [item for item in predictions if (item.uid == str(128) or item.uid == str(321) )]

# sort predictions
user_items.sort(key = lambda x: (x.uid, x.est), reverse=True)

print(len(user_items))

# primeros 5 elementos
user_items[:5]

# últimos 5 elementos
#user_items[-5:]


77


[Prediction(uid='321', iid='173', r_ui=4.0, est=4.655715173255574, details={'was_impossible': False}),
 Prediction(uid='321', iid='174', r_ui=3.0, est=4.444313412512506, details={'was_impossible': False}),
 Prediction(uid='321', iid='474', r_ui=4.0, est=4.219669620318854, details={'was_impossible': False}),
 Prediction(uid='321', iid='483', r_ui=5.0, est=4.212309734581333, details={'was_impossible': False}),
 Prediction(uid='321', iid='50', r_ui=4.0, est=4.181992007314434, details={'was_impossible': False})]

In [108]:
# Binary Relevance:
# To replace all value in r_ui > 3 for 1 and 
from collections import namedtuple

Prediction = namedtuple("Prediction", ["uid", "iid", "r_ui", "est", "details"])
#list_uid_128 = list(map(lambda x: {'uid': x.uid, 'iid':x.iid, 'r_ui':1 if x.r_ui > 3 else 0, 'est': x.est,'details':x.details}, user_items[:5]))
user_items_bin = list(map(lambda x: Prediction( x.uid, x.iid, 1 if x.r_ui > 3 else 0, x.est, x.details), user_items))

user_items_bin[:4]

[Prediction(uid='321', iid='173', r_ui=1, est=4.655715173255574, details={'was_impossible': False}),
 Prediction(uid='321', iid='174', r_ui=0, est=4.444313412512506, details={'was_impossible': False}),
 Prediction(uid='321', iid='474', r_ui=1, est=4.219669620318854, details={'was_impossible': False}),
 Prediction(uid='321', iid='483', r_ui=1, est=4.212309734581333, details={'was_impossible': False})]

In [20]:
from surprise import accuracy

# Then compute RMSE
accuracy.rmse(predictions)


RMSE: 0.9418


0.9417653053530778

In [129]:
# Define function to obtain nDCG scores
def get_ndcg(surprise_predictions, k_highest_scores=None):
    """ 
    Calculates the ndcg (normalized discounted cumulative gain) from surprise predictions, using sklearn.metrics.ndcg_score and scipy.sparse
  
    Parameters: 
    surprise_predictions (List of surprise.prediction_algorithms.predictions.Prediction): list of predictions
    k_highest_scores (positive integer): Only consider the highest k scores in the ranking. If None, use all. 
  
    Returns: 
    float in [0., 1.]: The averaged NDCG scores over all recommendations
  
    """
    from sklearn.metrics import ndcg_score
    from scipy import sparse
    
    uids = [int(p.uid) for p in surprise_predictions ]
    iids = [int(p.iid) for p in surprise_predictions ]
    r_uis = [p.r_ui for p in surprise_predictions ]
    ests = [p.est for p in surprise_predictions ]
    
    assert(len(uids) == len(iids) == len(r_uis) == len(ests) )    
    
    sparse_preds = sparse.coo_matrix( (ests, (uids , iids )) )
    sparse_vals = sparse.coo_matrix( (r_uis, (uids , iids )) )
    
    dense_preds = sparse_preds.toarray()
    print(dense_preds)
    dense_vals = sparse_vals.toarray()
    
    return ndcg_score(y_true= dense_vals , y_score= dense_preds, k=k_highest_scores,ignore_ties=True)

In [193]:
get_ndcg(predictions,10)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


0.9039619765663592

In [196]:
# Calculate directly nDCG score for a particular user
user_items_bin = [item for item in predictions if (item.uid == str(555))]
user_items_bin.sort(key = lambda x: (x.uid))

r_uis = [p.r_ui for p in user_items_bin ]
ests = [p.est for p in user_items_bin ]

ndcg_score(y_true= [r_uis] , y_score= [ests], k=10,ignore_ties=False)

# get_ndcg(user_items_bin,10) # not working for a particular user or a few users

[[0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 ...
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        4.1120978]]


0.0015732062952428106

In [198]:
# Calculate directly nDCG score 
# For 2 users

from sklearn.metrics import ndcg_score
from scipy import sparse
import numpy as np


uids = [int(p.uid) for p in user_items_bin ]
u_uids, u_indices = np.unique(uids, return_inverse=True)
rows = len(u_uids)
u_indices = [int(p) for p in u_indices]
u_indices = np.array(u_indices)
#print(u_indices,len(u_indices))

iids = [int(p.iid) for p in user_items_bin ]
u_iids, i_indices = np.unique(iids, return_inverse=True)
cols = len(u_iids)
i_indices = [int(p) for p in i_indices]
i_indices = np.array(i_indices)
#print(i_indices,len(i_indices))

r_uis = [p.r_ui for p in user_items_bin ]
r_uis = np.array(r_uis)
ests = [p.est for p in user_items_bin ]
ests = np.array(ests)
#print(ests)
#print(len(ests))

assert(len(uids) == len(iids) == len(r_uis) == len(ests) )    

sparse_preds = sparse.coo_matrix( (ests, (u_indices , i_indices )), shape=(rows,cols) )#.todense()
sparse_vals = sparse.coo_matrix( (r_uis, (u_indices , i_indices )), shape=(rows,cols) )#.todense()
    
dense_preds = sparse_preds.toarray()
dense_vals = sparse_vals.toarray()
print(dense_preds)


#ndcg_score(y_true= [r_uis] , y_score= [ests], k=None,ignore_ties=False)
ndcg_score(y_true= dense_vals , y_score= dense_preds, k=10, ignore_ties=True)

# get_ndcg(user_items_bin,10) # not working for a particular user or a few users

[[3.92922073 4.2884411  4.78378443 3.8266667  3.56094899 4.35374405
  3.22788836 3.65096477 4.16105958 4.11249613 3.38134509 4.62204807
  3.37287055 4.1120978 ]]
[[0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 ...
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        4.1120978]]


0.0015732062952428106