In [1]:
import os
os.chdir('../movies')
from movieLens import MovieLens

# Load the movie Lens class
ml = MovieLens()

# Algorithm

In [2]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
# from RBM import RBM

from surprise import accuracy
# import heapq
# from collections import defaultdict
# from operator import itemgetter
import numpy as np
import pandas as pd

In [3]:
# Load the ratings dataset
ratings = ml.ratings.copy()
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
print(ratings['rating'].nunique())
list(ratings['rating'].unique())

10


[4.0, 5.0, 3.0, 2.0, 1.0, 4.5, 3.5, 2.5, 0.5, 1.5]

In [5]:
# Method from the Surprise library to load the DataFrame 
# Define the Reader object to parse the dataframe
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))

# Load the dataframe as a ratings dataset
ratingsDataset = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Build the full trainset
trainSet, testSet = train_test_split(ratingsDataset, test_size=0.2, random_state=42)
antitest = trainSet.build_anti_testset()

# PRUEBA 1

Para preparar los datos de train y test, necesitamos crear conjuntos en formato de matriz con cada fila representando a un usuario y cada celda de la fila representando la valoración de cada película, dado que esta es la entrada esperada por el algoritmo RBM.

Para ello, necesitamos el número total de usuarios como números de fila y el número total de películas como número de columna
* num_users = trainSet.n_users
* num_movies = trainSet.n_items

In [12]:
def fitData(trainset):
    #AlgoBase.fit(self, trainset)

    num_users = trainset.n_users
    num_movies = trainset.n_items
    
    # 3D matrix: users, movies and ratings
    # Ratings has size 10 given the possible rating values
    trainingMatrix = np.zeros([num_users, num_movies, 10], dtype=np.float32)

    for (uid, iid, rating) in trainset.all_ratings():
        adjustedRating = int(float(rating)*2.0) - 1
        trainingMatrix[int(uid), int(iid), adjustedRating] = 1

    # Flatten to a 2D array, with nodes for each possible rating type on each possible item, for every user.
    trainingMatrix = np.reshape(trainingMatrix, [trainingMatrix.shape[0], -1])
    
    return trainingMatrix

In [13]:
trainingMatrix = fitData(trainSet)
trainingMatrix.shape

(610, 89280)

In [14]:
from RBM import RBM

# Create an RBM with (num items * rating values) visible nodes
model = RBM(trainingMatrix.shape[1])
model.train(trainingMatrix)

Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Trained epoch  10
Trained epoch  11
Trained epoch  12
Trained epoch  13
Trained epoch  14
Trained epoch  15
Trained epoch  16
Trained epoch  17
Trained epoch  18
Trained epoch  19


In [15]:
def getPredictions(rbm,num_users,num_movies,trainingMatrix):
    
    predictedRatings = np.zeros([num_users, num_movies], dtype=np.float32)
    
    for uiid in range(num_users):
        if (uiid % 50 == 0):
            print("Processing user ", uiid)
        recs = rbm.getRecommendations([trainingMatrix[uiid]])
        recs = np.reshape(recs, [num_movies, 10])

        for itemID, rec in enumerate(recs):
            # The obvious thing would be to just take the rating with the highest score:                
            #rating = rec.argmax()
            # ... but this just leads to a huge multi-way tie for 5-star predictions.
            # The paper suggests performing normalization over K values to get probabilities
            # and take the expectation as your prediction, so we'll do that instead:

            normalized = np.exp(rec)/np.sum(np.exp(rec), axis=0)
            rating = np.average(np.arange(10), weights=normalized)
            
            predictedRatings[uiid, itemID] = (rating + 1) * 0.5

    return predictedRatings

In [16]:
predictions = getPredictions(model,trainSet.n_users,trainSet.n_items,trainingMatrix)

Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Processing user  300
Processing user  350
Processing user  400
Processing user  450
Processing user  500
Processing user  550
Processing user  600


In [17]:
predictions

array([[2.773814 , 3.064343 , 2.8469872, ..., 2.7710757, 2.767178 ,
        2.7762666],
       [2.7737737, 3.064343 , 2.8469286, ..., 2.771031 , 2.7671473,
        2.7762048],
       [2.773733 , 3.0637488, 2.8468156, ..., 2.770986 , 2.7671165,
        2.776143 ],
       ...,
       [2.7716875, 3.0711865, 2.8488548, ..., 2.7715204, 2.7691114,
        2.775508 ],
       [2.7686617, 3.058133 , 2.8350325, ..., 2.7684906, 2.7659638,
        2.772562 ],
       [2.771769 , 3.071522 , 2.849195 , ..., 2.7715907, 2.769186 ,
        2.7755623]], dtype=float32)

In [18]:
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

# Prueba 2

In [19]:
from RBM_v2 import RBM_v2

# Create an RBM with (num items * rating values) visible nodes
model2 = RBM_v2(trainingMatrix.shape[1])
model2.train(trainingMatrix)

Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Trained epoch  10
Trained epoch  11
Trained epoch  12
Trained epoch  13
Trained epoch  14
Trained epoch  15
Trained epoch  16
Trained epoch  17
Trained epoch  18
Trained epoch  19


In [20]:
predictions = getPredictions(model2,trainSet.n_users,trainSet.n_items,trainingMatrix)

Processing user  0
Processing user  50
Processing user  100
Processing user  150
Processing user  200
Processing user  250
Processing user  300
Processing user  350
Processing user  400
Processing user  450
Processing user  500
Processing user  550
Processing user  600


In [21]:
predictions

array([[2.7759185, 3.0417857, 2.8223495, ..., 2.7892137, 2.7807577,
        2.754152 ],
       [2.7758682, 3.0417857, 2.8223426, ..., 2.78913  , 2.7806926,
        2.75415  ],
       [2.7758176, 3.041201 , 2.822266 , ..., 2.789046 , 2.7806275,
        2.7541478],
       ...,
       [2.77362  , 3.0498745, 2.8254151, ..., 2.7888005, 2.7819004,
        2.7549882],
       [2.7701852, 3.0389378, 2.8138025, ..., 2.7836561, 2.7772217,
        2.7534988],
       [2.7736878, 3.050157 , 2.825701 , ..., 2.7889109, 2.7819993,
        2.755025 ]], dtype=float32)

In [22]:
model2.test(testSet)

AttributeError: 'RBM_v2' object has no attribute 'trainset'

In [23]:
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

# Probar si con RBM AlgotBase se pueden sacar las métricas

In [24]:
from RBMAlgorithm import RBMAlgorithm
RBM = RBMAlgorithm(epochs=20)

In [25]:
RBM.test(testSet)

AttributeError: 'RBMAlgorithm' object has no attribute 'trainset'

# Metrics

In [6]:
import os
os.chdir('../metrics')
from metrics import evaluationMetrics
em = evaluationMetrics()

In [None]:
# Get test and antitest predictions
predtest = model.test(testSet)
predantitest = model.test(antitest)

# Get top N recommended movies for each user based on estimated ratings
top_10_RBM = em.getTopN(predantitest,minimumRating = 3.5)

## Métricas de precisión: RMSE y MAE

In [None]:
# # RMSE
rmse = accuracy.rmse(predtest)

# MAE
mae = accuracy.mae(predtest)

# rmse = accuracy.rmse(predictions)
# mae = accuracy.mae(predictions)

## Métricas de relevancia: Precision, Recall y NDCG

In [None]:
# Precision
precisions = em.getPrecision(predtest, k=10, threshold=3.5)

# Mean Average Precision
mapModel = np.mean(list(precisions.values()))

# Recall
recalls = em.getRecall(predtest, k=10, threshold=3.5)

# Mean Average Recall
marModel = np.mean(list(recalls.values()))

# Normalized discounted cummulative gain (NDCG)
ndcgs, mean_ndcg = em.getNDCG(predtest,10)

## Otras métricas de interés: Coverage, User Coverage y Novelty

In [None]:
# Coverage
coverage = em.getCoverage(top_10_RBM,trainSet.n_items,trainSet.all_users())

# User coverage
user_coverage = em.getUserCoverage(top_10_RBM, trainSet.n_users,4)

# Novelty
novelty = em.getNovelty(top_10_RBM,trainSet)

Por último creamos un dataframe con todas las métricas de evaluación asociadas al modelo

In [None]:
cols = ["Model","RMSE","MAE","MAP","MAR","Mean_NDCG","Coverage","User_Coverage","Novelty"]
metrics_data = []

# Append the results to the list of dictionaries
metrics_data.append({"Model": "RBM", "RMSE": rmse, "MAE": mae, "MAP": mapModel, "MAR": marModel,
                     "Mean_NDCG": mean_ndcg, "Coverage": coverage, "User_Coverage": user_coverage,
                     "Novelty": novelty})

# Convert the list of dictionaries into a DataFrame
metrics_df = pd.DataFrame(metrics_data, columns=cols)
metrics_df

# Random Algorithm

Vamos a evaluar también un modelo Random, en concreto "NormalPredictor" para poder comparar sus resultados con el resto de modelos. NormalPredictor es un algoritmo simple en Surprise que predice calificaciones aleatoriamente basado en la distribución del conjunto de entrenamiento. Supone una distribución normal de las calificaciones y genera predicciones aleatorias en función de esa distribución.

In [7]:
from surprise import NormalPredictor

# Create the model
Random = NormalPredictor()
Random.fit(trainSet)

<surprise.prediction_algorithms.random_pred.NormalPredictor at 0x207dc9e5790>

In [8]:
# Get test and antitest predictions
predtest_random = Random.test(testSet)
predantitest_random = Random.test(antitest)

# Get top N recommended movies for each user based on estimated ratings
top_10_random = em.getTopN(predantitest_random,minimumRating = 3.5)

## Compute metrics

In [9]:
# Accuracy Metrics
rmse_random = accuracy.rmse(predtest_random)
mae_random = accuracy.mae(predtest_random)

# Relevance metrics
precisions_random = em.getPrecision(predtest_random, k=10, threshold=3.5)
mapModel_random = np.mean(list(precisions_random.values()))

recalls_random = em.getRecall(predtest_random, k=10, threshold=3.5)
marModel_random = np.mean(list(recalls_random.values()))

ndcgs_random, mean_ndcg_random = em.getNDCG(predtest_random,10)

# Other metrics
coverage_random = em.getCoverage(top_10_random,trainSet.n_items,trainSet.all_users())
user_coverage_random = em.getUserCoverage(top_10_random, trainSet.n_users,4)
novelty_random = em.getNovelty(top_10_random,trainSet)

RMSE: 1.4320
MAE:  1.1429


In [10]:
cols = ["Model","RMSE","MAE","MAP","MAR","Mean_NDCG","Coverage","User_Coverage","Novelty"]
metrics_data = []

# Append the results to the list of dictionaries
metrics_data.append({"Model": "random", "RMSE": rmse_random, "MAE": mae_random, "MAP": mapModel_random, "MAR": marModel_random,
                     "Mean_NDCG": mean_ndcg_random, "Coverage": coverage_random, "User_Coverage": user_coverage_random,
                     "Novelty": novelty_random})

# Convert the list of dictionaries into a DataFrame
metrics_df = pd.DataFrame(metrics_data, columns=cols)
metrics_df

Unnamed: 0,Model,RMSE,MAE,MAP,MAR,Mean_NDCG,Coverage,User_Coverage,Novelty
0,random,1.432009,1.142851,0.629926,0.293679,0.931162,0.029682,1.0,1846.098361


In [11]:
# Add the results to the dataframe with the metrics of all models.
em.addToMetricsDataframe(metrics_df)