In [1]:
import os
os.chdir('../movies')
from movieLens import MovieLens

In [2]:
ml = MovieLens()

# Algorithm

In [17]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, KNNWithMeans, accuracy
from surprise.model_selection import train_test_split, GridSearchCV
import heapq
from collections import defaultdict
from operator import itemgetter

In [4]:
# Load the ratings dataset
ratings = ml.ratings.copy()

# Method from the Surprise library to load the DataFrame 
# Define the Reader object to parse the dataframe
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))

# Load the dataframe as a ratings dataset
ratingsDataset = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Build the full trainset
trainSet, testSet = train_test_split(ratingsDataset, test_size=0.2, random_state=42)
antitest = trainSet.build_anti_testset()

## Item user rating matrix

Matriz en la que encontramos los ratings por película para cada uno de los usuarios existentes. Una columna para las películas y una fila en la que se encuentran todos los usuarios disponibles. El valor de la celda corresponde con el rating otorgado a cada una de las películas por el usuario correspondiente

In [5]:
# Cosine similarity function
sim_options = {'name': 'cosine',   # alternative: pearson
               'user_based': False, # compute  similarities between films
               'min_support':5      # minimum number of common items between users
               }

model = KNNWithMeans(sim_options=sim_options)
model.fit(trainSet)
simsMatrix = model.compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


## Look up similar items

Buscamos las k películas que el usuario de referencia haya valorado mejor

In [6]:
def getNeighbors(referenceUser,k,trainSet):
     
    referenceUserInnerID = trainSet.to_inner_uid(referenceUser) 

    # Get top N items rated
    # Sort the elements in decreasing order by score and select top N
    referenceUserRatings = trainSet.ur[referenceUserInnerID]
    #print(referenceUserRatings)
    kNeighbors = heapq.nlargest(k, referenceUserRatings, key=lambda t: t[1])
    
    return kNeighbors

In [7]:
# Reference user = user to recommend to
referenceUser = 1 

# Set the number of desired similar users
k = 10

In [8]:
# Get neighbours
kNeighbors = getNeighbors(referenceUser,10,trainSet)
kNeighbors

[(587, 5.0),
 (18, 5.0),
 (126, 5.0),
 (85, 5.0),
 (1075, 5.0),
 (1198, 5.0),
 (589, 5.0),
 (991, 5.0),
 (1579, 5.0),
 (95, 5.0)]

De esta forma se seleccionan las k primeras películas con mayor puntuación, en este caso se seleccionan las 10 primeras, pero, no deberíamos seleccionar un número variable de k de tal forma que se seleccionen todas aquellas que tengan un rating máximo (5)?

## Candidate generation and scoring

Selecionamos las películas que podríamos recomendar en primera instancia. Para ello normalizamos los ratings y multiplicamos por el coeficiente de semejanza entre las películas

In [9]:
# Get similar items to stuff we liked (weighted by rating)
candidates = defaultdict(float)
for itemID, rating in kNeighbors:
    similarityRow = simsMatrix[itemID]
    for innerID, score in enumerate(similarityRow):
#         print(score)
#         print(rating)
        candidates[innerID] += score*(rating/5.0)

# Sort the candidates by score
candidates = sorted(candidates.items(), key=itemgetter(1), reverse=True)
#candidates

El score corresponde con la medida cosine similarity entre las películas calculada en la matriz de similaridad, mientras que rating corresponde con la valoración del usuario

## Candidate filtering and recommendations

Filtramos aquellas recomendaciones con un score pequeño y que ya haya visto el usuario. Para ello utilizamos un set porque únicamente nos interesa saber los items que el reference user ya ha visto, plus es un objeto eficiente para datasets largos

In [10]:
def filterRec(referenceUser,trainSet,k,candidates):
    
    # Get top N similar users to our reference user
    referenceUserInnerID = trainSet.to_inner_uid(referenceUser)
    
    # Build a set of movies the user has already seen
    watched = set(trainSet.ur[referenceUserInnerID])
    
    # Initialize a list to store the recommendations
    recommendations = []

    # Get top-rated items from similar users:
    pos = 0
    for itemID, ratingSum in candidates:
        if not itemID in watched:
            movieID = trainSet.to_raw_iid(itemID)
            recommendation = ml.getMovieName(int(movieID)), ratingSum
            recommendations.append(recommendation)
            pos += 1
            if (pos >= k):
                break            

    rec_movies = [rec[0] for rec in recommendations]
    return rec_movies

ratingSum represents the total similarity of the reference user to all other users who rated that item

In [11]:
# Results for the first approach
rec_movies = filterRec(referenceUser,trainSet,k,candidates)
rec_movies

['Star Wars: Episode V - The Empire Strikes Back',
 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb',
 'Apocalypse Now',
 'Chinatown',
 'Toy Story',
 'Fargo',
 'Blade Runner',
 'Citizen Kane',
 'L.A. Confidential',
 'Psycho']

# Metrics

In [12]:
from metrics import evaluationMetrics
em = evaluationMetrics()

In [13]:
# Get test and antitest predictions
predtest = model.test(testSet)
predantitest = model.test(antitest)

# Get top N recommended movies for each user based on estimated ratings
top_10_SVD = em.getTopN(predantitest,minimumRating = 3.5)

## Métricas de precisión: RMSE y MAE

In [15]:
# RMSE
rmse = accuracy.rmse(predtest)

# MAE
mae = accuracy.mae(predtest)

RMSE: 0.9172
MAE:  0.6978


## Métricas de relevancia: Precision, Recall y NDCG

In [22]:
# Precision
precisions = em.getPrecision(predtest, k=10, threshold=3.5)

# Mean Average Precision
mapModel = np.mean(list(precisions.values()))

# Recall
recalls = em.getRecall(predtest, k=10, threshold=3.5)

# Mean Average Recall
marModel = np.mean(list(recalls.values()))

# Normalized discounted cummulative gain (NDCG)
ndcgs, mean_ndcg = em.getNDCG(predtest,10)

## Otras métricas de interés: Coverage, User Coverage y Novelty

In [19]:
# Coverage
coverage = em.getCoverage(top_10_SVD,trainSet.n_items,trainSet.all_users())

# User coverage
user_coverage = em.getUserCoverage(top_10_SVD, trainSet.n_users,4)

# Novelty
novelty = em.getNovelty(top_10_SVD,trainSet)

Por último creamos un dataframe con todas las métricas de evaluación asociadas al modelo

In [23]:
cols = ["Model","RMSE","MAE","MAP","MAR","Mean_NDCG","Coverage","User_Coverage","Novelty"]
metrics_data = []

# Append the results to the list of dictionaries
metrics_data.append({"Model": "item-based", "RMSE": rmse, "MAE": mae, "MAP": mapModel, "MAR": marModel,
                     "Mean_NDCG": mean_ndcg, "Coverage": coverage, "User_Coverage": user_coverage,
                     "Novelty": novelty})

# Convert the list of dictionaries into a DataFrame
metrics_df = pd.DataFrame(metrics_data, columns=cols)
metrics_df

Unnamed: 0,Model,RMSE,MAE,MAP,MAR,Mean_NDCG,Coverage,User_Coverage,Novelty
0,item-based,0.917171,0.697839,0.743048,0.447952,0.951819,0.020721,1.0,5853.927869


* **Coverage**: las recomendaciones son poco variadas, en torno a un 2%.
* **User Coverage**: aquí vemos como el modelo encuentra para todos sus usuarios al menos una recomendación por encima de 4 en su predicción
* **Novelty**: recomendaciones que han sido pocas veces puntuadas; ranking promedio en torno a la película 5853 más popular (de 8000 posibles) 