In [32]:
import os
import random
from collections import defaultdict

import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

from surprise import Dataset, Reader
from surprise import BaselineOnly
from surprise import SVD, KNNBasic, SVDpp, NMF, KNNBaseline, KNNWithMeans, KNNWithZScore
from surprise import dump
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split

# Funções Auxiliares

In [28]:
def build_svg_algo(prediction_algorithm, dataset, verbose=False, param_grid=None, cv=10):
    """Grid-search a matrix-factorization algorithm and return the best one.

    Args:
        prediction_algorithm: Surprise algorithm *class* (not an instance) to
            tune, e.g. ``SVD``, ``SVDpp`` or ``NMF``.
        dataset: Surprise ``Dataset`` the cross-validated search is run on.
        verbose (bool): When true, print the best RMSE/MAE scores and the
            parameter combinations that produced them.
        param_grid (dict): Optional grid that replaces the built-in one. Use
            it to shrink the search for slow algorithms. Default is ``None``
            (built-in grid).
        cv: Cross-validation setting forwarded to ``GridSearchCV``. Default
            is 10.

    Returns:
        The algorithm instance built with the parameters that gave the best
        RMSE. The caller is expected to fit it on a training set.
    """
    if param_grid is None:
        # The callers pass the class itself, so the check has to be a class
        # comparison; ``type(SomeClass) == NMF`` is never true.
        is_nmf = (
            isinstance(prediction_algorithm, type)
            and issubclass(prediction_algorithm, NMF)
        )
        if is_nmf:
            # NMF does not accept ``lr_all`` / ``reg_all``; it exposes separate
            # user/item regularization terms and has no learning rate for the
            # (default, unbiased) factor updates.
            param_grid = {
                'n_factors': [20, 50, 100],
                'n_epochs': [5, 10, 20, 30, 40, 50],
                "reg_pu": [0.4, 0.6],
                "reg_qi": [0.4, 0.6],
            }
        else:
            param_grid = {
                'n_factors': [20, 50, 100],
                'n_epochs': [5, 10, 20, 30, 40, 50],
                "lr_all": [0.01, 0.2, 0.05, 0.001, 0.002, 0.005],
                "reg_all": [0.4, 0.6],
            }

    gs = GridSearchCV(
        prediction_algorithm,
        param_grid,
        measures=['rmse', 'mae'],
        cv=cv,
        n_jobs=-1
    )

    gs.fit(dataset)

    if verbose:
        # Label the report with the algorithm actually tuned (prints "SVD"
        # for SVD, exactly as before).
        label = getattr(prediction_algorithm, "__name__", "SVD")
        print(f"{label} RMSE SCORE: {gs.best_score['rmse']}")
        print(f"{label} MAE SCORE: {gs.best_score['mae']}")
        print(f"{label} BEST PARAMS RMSE: {gs.best_params['rmse']} \n")
        print(f"{label} BEST PARAMS MAE: {gs.best_params['mae']} \n")

    algo = gs.best_estimator["rmse"]
    return algo


def build_knn_algo(prediction_algorithm, dataset, verbose=False, param_grid=None, cv=10):
    """Grid-search an item-based KNN algorithm and return the best one.

    Args:
        prediction_algorithm: Surprise KNN algorithm *class* to tune, e.g.
            ``KNNBasic``, ``KNNBaseline``, ``KNNWithMeans``, ``KNNWithZScore``.
        dataset: Surprise ``Dataset`` the cross-validated search is run on.
        verbose (bool): Forwarded to the algorithm (controls its trace
            messages) and, when true, also prints the best scores/parameters.
        param_grid (dict): Optional grid that replaces the built-in one.
            Default is ``None`` (built-in grid).
        cv: Cross-validation setting forwarded to ``GridSearchCV``. Default
            is 10.

    Returns:
        The algorithm instance built with the parameters that gave the best
        RMSE. The caller is expected to fit it on a training set.
    """
    if param_grid is None:
        param_grid = {
            'k': [10, 20, 30, 40, 50, 100, 150, 200],
            'sim_options': {
                "name": ["msd", "cosine", "pearson"],
                "user_based": [False],
            },
            # ``verbose`` is a constructor argument of the KNN algorithms, not
            # a similarity option; inside ``sim_options`` it had no effect.
            'verbose': [verbose],
        }

    gs = GridSearchCV(
        prediction_algorithm,
        param_grid,
        measures=['rmse', 'mae'],
        cv=cv,
        n_jobs=1
    )

    gs.fit(dataset)

    if verbose:
        print(f"KNN RMSE SCORE: {gs.best_score['rmse']}")
        print(f"KNN MAE SCORE: {gs.best_score['mae']}")
        # Distinguish the two parameter reports, which were labelled alike.
        print(f"KNN BEST PARAMS RMSE: {gs.best_params['rmse']} \n")
        print(f"KNN BEST PARAMS MAE: {gs.best_params['mae']} \n")

    algo = gs.best_estimator["rmse"]
    return algo
    

In [3]:
def get_top_n(predictions, n=10):
    """Group predictions per user and keep each user's n best estimates.

    Args:
        predictions: Iterable of 5-field prediction records in the order
            (user id, item id, true rating, estimated rating, details), as
            produced by the ``test`` method of an algorithm.
        n(int): How many recommendations to keep per user. Default is 10.

    Returns:
        A ``defaultdict(list)`` mapping each raw user id to a list of
        ``(raw item id, estimated rating)`` tuples, ordered from the highest
        to the lowest estimate and truncated to ``n`` entries.
    """
    estimates_by_user = defaultdict(list)
    for prediction in predictions:
        user_id, item_id, _true_rating, estimate, _details = prediction
        estimates_by_user[user_id].append((item_id, estimate))

    # Only existing keys are reassigned, so iterating the dict here is safe.
    for user_id in estimates_by_user:
        ranked = sorted(
            estimates_by_user[user_id],
            key=lambda pair: pair[1],
            reverse=True,
        )
        estimates_by_user[user_id] = ranked[:n]

    return estimates_by_user

In [33]:
def fit_predict_save(algo, train_set, test_set, n_predictions, dump_path):
    """Fit an algorithm, predict the test set and persist the results.

    Args:
        algo: Algorithm instance exposing ``fit`` and ``test``.
        train_set: Training set passed to ``algo.fit``.
        test_set: Test set passed to ``algo.test``.
        n_predictions (int): Number of top recommendations kept per user.
        dump_path (str): File the predictions and the fitted algorithm are
            dumped to. Missing parent directories are created.

    Returns:
        A tuple ``(predictions, top_n)`` with the raw test predictions and the
        per-user top-``n_predictions`` mapping built by ``get_top_n``.
    """
    algo.fit(train_set)
    predictions_algo = algo.test(test_set)
    top_n_predictions = get_top_n(predictions_algo, n=n_predictions)

    # Make sure the target directory exists so that a long fit is not lost to
    # a FileNotFoundError at dump time. A bare file name has no directory part.
    dump_dir = os.path.dirname(dump_path)
    if dump_dir:
        os.makedirs(dump_dir, exist_ok=True)

    dump.dump(dump_path, predictions_algo, algo)

    return predictions_algo, top_n_predictions

# Configurações

In [5]:
# Parquet file holding the refined ratings table.
DATASET_PATH = "./datasets/Refined/dataset.parquet"
# Share of the ratings held out when splitting into train and test sets.
TEST_SIZE = 0.25

In [6]:
# Fixed seed shared by both global random number generators used below.
seed = 608
np.random.seed(seed)  # NumPy global RNG
random.seed(seed)     # Python stdlib RNG

In [7]:
# Load the refined ratings table and preview its first five rows.
data = pd.read_parquet(path=DATASET_PATH)
data.head(5)

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1,4.0
1,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",5,4.0
2,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",7,4.5
3,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",15,2.5
4,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",17,4.5


In [8]:
# Observed rating bounds; they define the rating scale given to the Reader.
ratings = data["rating"]
min_rating, max_rating = ratings.min(), ratings.max()

In [9]:
# Wrap the (user, item, rating) columns as a Surprise dataset, using the
# rating bounds observed in the data as the rating scale.
reader = Reader(rating_scale=(min_rating, max_rating))
rating_columns = ['userId', 'movieId', 'rating']
dataset = Dataset.load_from_df(data[rating_columns], reader)

In [10]:
# Reference point: cross-validate the BaselineOnly predictor on the full
# dataset. verbose=True prints the per-fold RMSE/MAE and timing table.
cross_validate(algo=BaselineOnly(), data=dataset, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8736  0.8724  0.8714  0.8796  0.8646  0.8723  0.0048  
MAE (testset)     0.6741  0.6729  0.6700  0.6776  0.6681  0.6725  0.0033  
Fit time          0.38    0.46    0.41    0.36    0.38    0.40    0.03    
Test time         0.33    0.16    0.19    0.16    0.09    0.19    0.08    


{'test_rmse': array([0.87361888, 0.87242742, 0.87135281, 0.87955244, 0.86462855]),
 'test_mae': array([0.67409083, 0.67289043, 0.66995537, 0.67764192, 0.66810451]),
 'fit_time': (0.38080358505249023,
  0.4603300094604492,
  0.4133784770965576,
  0.3617537021636963,
  0.38141942024230957),
 'test_time': (0.32924795150756836,
  0.16267824172973633,
  0.1921064853668213,
  0.16463232040405273,
  0.0883629322052002)}

In [11]:
# Hold out TEST_SIZE of the ratings for evaluation. The split is seeded
# explicitly so it does not depend on how much of the global RNG stream the
# previously executed cells have consumed (cells may be run out of order).
train_set, test_set = train_test_split(
    dataset,
    test_size=TEST_SIZE,
    random_state=seed,
)

## Matrix Factorization

In [31]:
# Tune SVD with the shared grid search, refit the best configuration on the
# training split, predict the test split and dump predictions + model.
# NOTE(review): the search runs on the full `dataset`, which also contains the
# ratings held out in `test_set`, so the tuned parameters have seen the test
# data - consider tuning on training ratings only.
svd_algo = build_svg_algo(SVD, dataset, verbose=True)
predictions_svd, top_n_svd = fit_predict_save(
    algo=svd_algo,
    train_set=train_set,
    test_set=test_set,
    n_predictions=10,
    dump_path="./algo/svd_algo",
)

# Print the recommended titles for user 1, if that user has predictions.
if 1 in top_n_svd:
    print([
        data.loc[data["movieId"] == iid, "title"].values[0]
        for iid, _ in top_n_svd[1]
    ])

SVD RMSE SCORE: 0.8757767254296172
SVD MAE SCORE: 0.6757611050144046
SVD BEST PARAMS RMSE: {'n_factors': 50, 'n_epochs': 50, 'lr_all': 0.005, 'reg_all': 0.4} 

SVD BEST PARAMS MAE: {'n_factors': 50, 'n_epochs': 50, 'lr_all': 0.005, 'reg_all': 0.4} 

['Goodfellas (1990)', 'Apocalypse Now (1979)', 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)', 'Monty Python and the Holy Grail (1975)', "Schindler's List (1993)", 'Fargo (1996)', 'Silence of the Lambs, The (1991)', 'Shining, The (1980)', 'Office Space (1999)', 'L.A. Confidential (1997)']


In [30]:
# Tune SVD++ with the shared grid search, refit the best configuration on the
# training split, predict the test split and dump predictions + model.
svd_pp_algo = build_svg_algo(SVDpp, dataset, verbose=True)
predictions_svd_pp, top_n_svd_pp = fit_predict_save(
    algo=svd_pp_algo,
    train_set=train_set,
    test_set=test_set,
    n_predictions=10,
    dump_path="./algo/svd_plus_plus_algo",
)

# Print user 1's id followed by the recommended titles, if that user has
# predictions.
if 1 in top_n_svd_pp:
    print(1, [
        data.loc[data["movieId"] == iid, "title"].values[0]
        for iid, _ in top_n_svd_pp[1]
    ])

KeyboardInterrupt: 

In [None]:
# Tune NMF with the shared grid search, refit the best configuration on the
# training split, predict the test split and dump predictions + model.
svd_nmf_algo = build_svg_algo(NMF, dataset, verbose=True)
predictions_svd_nmf, top_n_svd_nmf = fit_predict_save(
    algo=svd_nmf_algo,
    train_set=train_set,
    test_set=test_set,
    n_predictions=10,
    dump_path="./algo/nmf_algo",
)

# Print user 1's id followed by the recommended titles, if that user has
# predictions.
if 1 in top_n_svd_nmf:
    print(1, [
        data.loc[data["movieId"] == iid, "title"].values[0]
        for iid, _ in top_n_svd_nmf[1]
    ])

# KNN

In [None]:
# Tune KNNBasic with the shared grid search, refit the best configuration on
# the training split, predict the test split and dump predictions + model.
knn_basic_algo = build_knn_algo(KNNBasic, dataset, verbose=False)
predictions_knn_basic, top_n_knn_basic = fit_predict_save(
    algo=knn_basic_algo,
    train_set=train_set,
    test_set=test_set,
    n_predictions=10,
    dump_path="knn_basic_algo",
)

# Print the recommended titles for user 1, if that user has predictions.
if 1 in top_n_knn_basic:
    print([
        data.loc[data["movieId"] == iid, "title"].values[0]
        for iid, _ in top_n_knn_basic[1]
    ])

In [None]:
# Tune KNNBaseline with the shared grid search, refit the best configuration
# on the training split, predict the test split and dump predictions + model.
knn_baseline_algo = build_knn_algo(KNNBaseline, dataset, verbose=False)
predictions_knn_baseline, top_n_knn_baseline = fit_predict_save(
    algo=knn_baseline_algo,
    train_set=train_set,
    test_set=test_set,
    n_predictions=10,
    dump_path="knn_baseline_algo",
)

# Print the recommended titles for user 1, if that user has predictions.
if 1 in top_n_knn_baseline:
    print([
        data.loc[data["movieId"] == iid, "title"].values[0]
        for iid, _ in top_n_knn_baseline[1]
    ])

In [None]:
# Tune KNNWithZScore with the shared grid search, refit the best configuration
# on the training split, predict the test split and dump predictions + model.
knn_with_z_score_algo = build_knn_algo(KNNWithZScore, dataset, verbose=False)
# The predictions get their own name; they used to be stored under
# `predictions_knn_with_means`, which the KNNWithMeans cell overwrites.
predictions_knn_with_z_score, top_n_knn_with_z_score = fit_predict_save(
    knn_with_z_score_algo,
    train_set,
    test_set,
    10,
    "knn_with_z_score"
)

# Print the recommended titles for user 1, if that user has predictions.
if 1 in top_n_knn_with_z_score:
    print([
        data.loc[data["movieId"] == iid, "title"].values[0]
        for iid, _ in top_n_knn_with_z_score[1]
    ])

In [None]:
# Tune KNNWithMeans with the shared grid search, refit the best configuration
# on the training split, predict the test split and dump predictions + model.
knn_with_means_algo = build_knn_algo(KNNWithMeans, dataset, verbose=False)
predictions_knn_with_means, top_n_knn_with_means = fit_predict_save(
    algo=knn_with_means_algo,
    train_set=train_set,
    test_set=test_set,
    n_predictions=10,
    dump_path="knn_with_means_algo",
)

# Print the recommended titles for user 1, if that user has predictions.
if 1 in top_n_knn_with_means:
    print([
        data.loc[data["movieId"] == iid, "title"].values[0]
        for iid, _ in top_n_knn_with_means[1]
    ])