In [193]:
import pandas as pd

from collections import defaultdict

import random
import numpy as np

from surprise import Dataset, Reader
from surprise import BaselineOnly


from surprise.model_selection import cross_validate
from surprise import SVD, KNNBasic
from surprise.model_selection import GridSearchCV
from surprise import accuracy
from surprise.model_selection import train_test_split


from sklearn.metrics.pairwise import cosine_similarity

In [194]:
DATASET_PATH = "./datasets/Refined/dataset.parquet"
TEST_SIZE = .25

In [195]:
seed = 608
random.seed(seed)
np.random.seed(seed)

In [196]:
data = pd.read_parquet(DATASET_PATH)
data.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1,4.0
1,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",5,4.0
2,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",7,4.5
3,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",15,2.5
4,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",17,4.5


In [197]:
min_rating = data["rating"].min()
max_rating = data["rating"].max()

In [198]:
reader = Reader(rating_scale=(min_rating, max_rating))
dataset = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)

In [199]:
cross_validate(BaselineOnly(), dataset, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8736  0.8724  0.8714  0.8796  0.8646  0.8723  0.0048  
MAE (testset)     0.6741  0.6729  0.6700  0.6776  0.6681  0.6725  0.0033  
Fit time          0.32    0.43    0.32    0.35    0.36    0.36    0.04    
Test time         0.08    0.14    0.08    0.08    0.09    0.10    0.02    


{'test_rmse': array([0.87361888, 0.87242742, 0.87135281, 0.87955244, 0.86462855]),
 'test_mae': array([0.67409083, 0.67289043, 0.66995537, 0.67764192, 0.66810451]),
 'fit_time': (0.3194124698638916,
  0.42745018005371094,
  0.3191184997558594,
  0.34737348556518555,
  0.3645040988922119),
 'test_time': (0.08372378349304199,
  0.1390514373779297,
  0.08420252799987793,
  0.08168888092041016,
  0.08755373954772949)}

In [245]:
def build_algo(prediction_algorithm, dataset, algo_type):
    
    param_grid = {
        'n_factors': [20, 50, 100],
        'n_epochs': [5, 10, 20, 30, 40, 50],
        # "lr_all": [0.001,0.002, 0.005],
        # "reg_all": [0.4, 0.6]
    }
    
    if (algo_type == "knn"):
        print("knn")
        
        sim_options = {
            "name": ["msd", "cosine"],
            "user_based": [False],  # compute  similarities between items
            'min_support': [1, 5],
        }
        
        bsl_options =  {
            'method': ['als', 'sgd'],
            'reg': [1, 2],
        },
        
        param_grid = {
            # 'bsl_options': bsl_options,
            'sim_options': sim_options,
            'k': [10, 20],
            'n_epochs': [5, 10, 20, 30, 40, 50],
            # "lr_all": [0.001,0.002, 0.005],
            # "reg_all": [0.4, 0.6]
        }
   
    gs = GridSearchCV(prediction_algorithm, param_grid, measures=['rmse', 'mae'], cv=10, n_jobs=-1)
    gs.fit(dataset)
    
    print(gs.best_score['rmse'])
    print(gs.best_params['rmse'])

    # O algo possui os melhorar parametros
    algo = gs.best_estimator["rmse"]
    return algo

In [231]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [232]:
train_set, test_set = train_test_split(dataset, test_size=TEST_SIZE)

In [246]:
# svd_algo = build_algo(SVD, dataset,"svd")
knn_basic_algo = build_algo(KNNBasic, dataset, "knn")


knn
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

In [None]:
svd_algo.fit(train_set)
knn_basic_algo.fit(train_set)

print(svd_algo)

Computing the msd similarity matrix...
Done computing similarity matrix.
<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7f8bf7b36ec0>


In [None]:
predictions_svd = svd_algo.test(test_set)
predictions_knn_basic = knn_basic_algo.test(test_set)

In [None]:
top_n_svd = get_top_n(predictions_svd, n=10)
top_n_knn_basic = get_top_n(predictions_knn_basic, n=10)

In [None]:
# uid = user, iid = movie
print("SVD")
for uid, user_ratings in top_n_svd.items():
    if(uid == 1):
        print(uid, [data[data["movieId"]==iid]["title"].values[0] for (iid, _) in user_ratings])
        
print("KNNBasic")
for uid, user_ratings in top_n_knn_basic.items():
    if(uid == 1):
        print(uid, [data[data["movieId"]==iid]["title"].values[0] for (iid, _) in user_ratings])
        

SVD
1 ['Pulp Fiction (1994)', 'Star Wars: Episode VI - Return of the Jedi (1983)', 'Princess Bride, The (1987)', 'Star Wars: Episode IV - A New Hope (1977)', 'American History X (1998)', 'Goodfellas (1990)', 'Gladiator (2000)', 'Platoon (1986)', 'Shining, The (1980)', 'South Park: Bigger, Longer and Uncut (1999)']
KNNBasic
1 ['Star Wars: Episode VI - Return of the Jedi (1983)', 'Star Wars: Episode IV - A New Hope (1977)', 'Princess Bride, The (1987)', 'Gladiator (2000)', 'Full Metal Jacket (1987)', 'American History X (1998)', 'Seven (a.k.a. Se7en) (1995)', 'Pulp Fiction (1994)', 'Goodfellas (1990)', 'American Beauty (1999)']


In [None]:
movies = pd.read_csv(f'./datasets/raw/movies.csv', encoding='utf-8')

In [None]:
def filter_predictions_for_user(predictions, user_id, movies_df, top_k=10):
    top_preds = sorted([pred for pred in predictions if pred.uid == user_id], key=lambda pred: pred.est, reverse=True)[:top_k]
    movie_ids = [pred.iid for pred in top_preds]
    relevant_movies = movies_df[movies_df["movieId"].isin(movie_ids)]
    relevant_movies['rating'] = [pred.est for pred in top_preds]
    return relevant_movies

svd_redictions_for_user = filter_predictions_for_user(predictions_svd, 1, movies)
knn_basic_redictions_for_user = filter_predictions_for_user(predictions_knn_basic, 1, movies)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_movies['rating'] = [pred.est for pred in top_preds]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_movies['rating'] = [pred.est for pred in top_preds]


In [None]:
svd_redictions_for_user.head()

Unnamed: 0,movieId,title,genres,rating
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,5.0
257,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,5.0
829,1090,Platoon (1986),Drama|War,5.0
899,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,5.0
911,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,5.0


In [None]:
knn_basic_redictions_for_user.head()

Unnamed: 0,movieId,title,genres,rating
43,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,4.500944
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,4.432064
257,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,4.423593
899,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,4.396202
911,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,4.352958


In [None]:
def get_most_similar_movies_svd(movies_df, movie_embeddings, trainset, target_movie_id, top_k=10):
    inner_movie_id = trainset.to_inner_iid(target_movie_id)
    sims = cosine_similarity(movie_embeddings, movie_embeddings)
    target_movie_sims_sorted = [trainset.to_raw_iid(x) for x in np.argsort(sims[inner_movie_id])[::-1]][:top_k]
    most_similar_movies = movies_df[movies_df["movieId"].isin(target_movie_sims_sorted)]
    return most_similar_movies

def get_most_similar_movies_knn(movies_df, movie_embeddings, trainset, target_movie_id, top_k=10):
    inner_movie_id = trainset.to_inner_iid(target_movie_id)
    sims = cosine_similarity(movie_embeddings, movie_embeddings)
    sort = np.argsort(sims[inner_movie_id])[::-1]
    print(sort)
    target_movie_sims_sorted = [trainset.to_raw_iid(x) for x in sort]
    most_similar_movies = movies_df[movies_df["movieId"].isin(target_movie_sims_sorted)]
    return most_similar_movies

In [None]:
most_similar_movies_svd = get_most_similar_movies_svd(movies, svd_algo.qi, train_set, 1)
most_similar_movies_svd.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
374,430,Calendar Girl (1993),Comedy|Drama
449,514,"Ref, The (1994)",Comedy
788,1031,Bedknobs and Broomsticks (1971),Adventure|Children|Musical
1266,1681,Mortal Kombat: Annihilation (1997),Action|Adventure|Fantasy


In [None]:
print(len(movies))
print(train_set.n_ratings)
print(len(knn_basic_algo.sim))

most_similar_movies_knn = get_most_similar_movies_knn(movies, knn_basic_algo, train_set, 1)
most_similar_movies_knn.head()

9742
75627
610


TypeError: float() argument must be a string or a real number, not 'KNNBasic'

In [None]:
def GetTopN(predictions, n=10, minimumRating=4.0):
  topN = defaultdict(list)

  for userID, movieID, actualRating, estimatedRating, _ in predictions:
    if (estimatedRating >= minimumRating):
        topN[userID].append((movieID, estimatedRating))

  for userID, ratings in topN.items():
    ratings.sort(key=lambda x: x[1], reverse=True)
    topN[userID] = ratings[:n]

  return topN

GetTopN(predictions_svd)

defaultdict(list,
            {415: [(750, 4.572652677149075),
              (4226, 4.48627858070472),
              (1213, 4.438766357411412),
              (858, 4.431985869119864),
              (1201, 4.431642809178345),
              (3147, 4.4196567383071645),
              (2329, 4.412817222250138),
              (903, 4.390111010162316),
              (356, 4.388431649525072),
              (1203, 4.357252365133466)],
             246: [(1196, 4.745505229040945),
              (62336, 4.666462441258366),
              (31658, 4.603541579908343),
              (838, 4.574426205177313),
              (69481, 4.55827486031837),
              (28, 4.555416204525808),
              (1210, 4.551168840046505),
              (4963, 4.525134987025225),
              (6377, 4.52291793332523),
              (4993, 4.520389131913264)],
             610: [(741, 4.852845943862314),
              (4226, 4.739564845718506),
              (1198, 4.689770678259568),
              (608, 4.6771407

In [None]:
def generate_recommendation(model, user_id, data, n_items):
   # Get a list of all movie IDs from dataset
   movie_ids = data["movieId"].unique()
 
   # Get a list of all movie IDs that have been watched by user
   movie_ids_user = data.loc[data["userId"] == user_id, "movieId"]
    # Get a list off all movie IDS that that have not been watched by user
   movie_ids_to_pred = np.setdiff1d(movie_ids, movie_ids_user)
 
   # Apply a rating of 4 to all interactions (only to match the Surprise dataset format)
   test_set = [[user_id, movie_id, 4] for movie_id in movie_ids_to_pred]
 
   # Predict the ratings and generate recommendations
   predictions = model.test(test_set)
   pred_ratings = np.array([pred.est for pred in predictions])
   print("Top {0} item recommendations for user {1}:".format(n_items, user_id))
   # Rank top-n movies based on the predicted ratings
   index_max = (-pred_ratings).argsort()[:n_items]
   for i in index_max:
       movie_id = movie_ids_to_pred[i]
       print(data[data["movieId"]==movie_id]["title"].values[0], pred_ratings[i])
 
 
# define which user ID that we want to give recommendation
userID = 1
# define how many top-n movies that we want to recommend
n_items = 10
# generate recommendation using the model that we have trained

Top 10 item recommendations for user 1:
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) 5.0
Man Bites Dog (C'est arrivé près de chez vous) (1992) 5.0
Requiem for a Dream (2000) 5.0
Pianist, The (2002) 5.0
Seven Samurai (Shichinin no samurai) (1954) 5.0
Hustler, The (1961) 5.0
Blade Runner (1982) 5.0
Love and Death (1975) 5.0
Shawshank Redemption, The (1994) 5.0
Man Who Would Be King, The (1975) 5.0


In [None]:
print("SVD")
recommendation_svd = generate_recommendation(svd_algo, userID, data, n_items)

SVD
Top 10 item recommendations for user 1:
Wallace & Gromit: The Best of Aardman Animation (1996) 5.0
Harold and Maude (1971) 5.0
African Queen, The (1951) 5.0
Sting, The (1973) 5.0
Creature Comforts (1989) 5.0
Guess Who's Coming to Dinner (1967) 5.0
Life Is Beautiful (La Vita è bella) (1997) 5.0
Lord of the Rings: The Return of the King, The (2003) 5.0
To Catch a Thief (1955) 5.0
Solaris (Solyaris) (1972) 5.0


In [None]:
print("KNN")
recommendation_knn_basic = generate_recommendation(knn_basic_algo, userID, data, n_items)

KNN
Top 10 item recommendations for user 1:
Polish Wedding (1998) 5.0
Happy Feet Two (2011) 5.0
Bloodsucking Bastards (2015) 5.0
Asterix and the Vikings (Astérix et les Vikings) (2006) 5.0
Hard Core Logo (1996) 5.0
Into the Abyss (2011) 5.0
61* (2001) 5.0
The Editor (2015) 5.0
Vampire in Venice (Nosferatu a Venezia) (Nosferatu in Venice) (1986) 5.0
American Friend, The (Amerikanische Freund, Der) (1977) 5.0
