In [1]:
import os
os.chdir('../movies')
from movieLens import MovieLens

# Load the movie Lens class
ml = MovieLens()

# Algorithm

In [2]:
from surprise import Dataset, Reader, accuracy, AlgoBase, PredictionImpossible, NormalPredictor
from surprise.model_selection import train_test_split
import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
# Load the ratings dataset
ratings = ml.ratings.copy()
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
print(ratings['rating'].nunique())
list(ratings['rating'].unique())

10


[4.0, 5.0, 3.0, 2.0, 1.0, 4.5, 3.5, 2.5, 0.5, 1.5]

In [5]:
# Method from the Surprise library to load the DataFrame 
# Define the Reader object to parse the dataframe
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))

# Load the dataframe as a ratings dataset
ratingsDataset = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Build the full trainset
trainSet, testSet = train_test_split(ratingsDataset, test_size=0.2, random_state=42)
antitest = trainSet.build_anti_testset()

Para preparar los datos necesitamos crear un conjunto en formato de matriz con cada fila representando a un usuario y cada celda de la fila representando la valoración de cada película, dado que esta es la entrada esperada por el algoritmo

In [6]:
def fitData(trainset):

    num_users = trainset.n_users
    num_movies = trainset.n_items

    # 2D matrix: users and movies
    trainingMatrix = np.zeros([num_users, num_movies], dtype=np.float32)

    for (uid, iid, rating) in trainset.all_ratings():
        # Normalize the input ratings (0,1)
        trainingMatrix[int(uid), int(iid)] = rating/5.0
        
    return trainingMatrix

In [7]:
trainingMatrix = fitData(trainSet)
trainingMatrix.shape

(610, 8928)

In [22]:
"""
Updated on Sun Dec 1 08:32:13 2019

@author: Frank
@modified: Saurabh

@modified: Inés Gómez Fortis
"""

class AutoRec(AlgoBase):

    def __init__(self, visibleDimensions, epochs=200, hiddenDimensions=50, learningRate=0.1, batchSize=100):
        AlgoBase.__init__(self)
        self.visibleDimensions = visibleDimensions
        self.epochs = epochs
        self.hiddenDimensions = hiddenDimensions
        self.learningRate = learningRate
        self.batchSize = batchSize
        self.optimizer = tf.keras.optimizers.RMSprop(self.learningRate)
           
    
    def fit(self, trainset, trainingMatrix):
        
        AlgoBase.fit(self, trainset)
        self.Train(trainingMatrix)

        num_users = trainset.n_users
        num_movies = trainset.n_items
        
        self.predictedRatings = np.zeros([num_users, num_movies], dtype=np.float32)
        for uiid in range(num_users):
            if (uiid % 50 == 0):
                print("Processing user ", uiid)
            recs = self.GetRecommendations([trainingMatrix[uiid]])
            
            for itemID, rec in enumerate(recs):
                # Restore ratings to original ranges
                self.predictedRatings[uiid, itemID] = rec * 5.0
        
        return self   

    
    def Train(self, X):
        
        self.initializeWeightsBiases()
        for epoch in range(self.epochs):
            for i in range(0, X.shape[0], self.batchSize):
                epochX = X[i:i+self.batchSize]
                self.runOptimization(epochX)


            print("Trained epoch ", epoch)

    def GetRecommendations(self, inputUser):
                
        # Feed through a single user and return predictions from the output layer.
        rec = self.neuralNet(inputUser)
        
        # It is being used as the return type is Eager Tensor.
        return rec[0]
    
    def initializeWeightsBiases(self):
        # Create varaibles for weights for the encoding (visible->hidden) and decoding (hidden->output) stages, randomly initialized
        self.weights = {
            'h1': tf.Variable(tf.random.normal([self.visibleDimensions, self.hiddenDimensions])),
            'out': tf.Variable(tf.random.normal([self.hiddenDimensions, self.visibleDimensions]))
            }
        
        # Create biases
        self.biases = {
            'b1': tf.Variable(tf.random.normal([self.hiddenDimensions])),
            'out': tf.Variable(tf.random.normal([self.visibleDimensions]))
            }
    
    def neuralNet(self, inputUser):

        #tf.set_random_seed(0)
        
        # Initialization of weights and biases was moved out to the initialize_weights_biases function above
        # This lets us avoid resetting them on every batch of training, which was a bug in earlier versions of
        # this script.
        
        # Create the input layer
        self.inputLayer = inputUser
        
        # hidden layer
        hidden = tf.nn.sigmoid(tf.add(tf.matmul(self.inputLayer, self.weights['h1']), self.biases['b1']))
        
        # output layer for our predictions.
        self.outputLayer = tf.nn.sigmoid(tf.add(tf.matmul(hidden, self.weights['out']), self.biases['out']))
        
        return self.outputLayer
    
    def runOptimization(self, inputUser):
        with tf.GradientTape() as g:
            pred = self.neuralNet(inputUser)
            loss = tf.keras.losses.MSE(inputUser, pred)
            
        trainable_variables = list(self.weights.values()) + list(self.biases.values())
        
        gradients = g.gradient(loss, trainable_variables)
        
        self.optimizer.apply_gradients(zip(gradients, trainable_variables))
        
        
    def estimate(self, u, i):

        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')
        
        rating = self.predictedRatings[u, i]
        
        if (rating < 0.001):
            raise PredictionImpossible('No valid prediction exists.')
            
        return rating
    

In [23]:
# Create an RBM with (num items * rating values) visible nodes
model = AutoRec(trainingMatrix.shape[1])

# Fit the RBM model on the training set
model.fit(trainSet, trainingMatrix)

Trained epoch  0
Trained epoch  1
Trained epoch  2
Trained epoch  3
Trained epoch  4
Trained epoch  5
Trained epoch  6
Trained epoch  7
Trained epoch  8
Trained epoch  9
Trained epoch  10
Trained epoch  11
Trained epoch  12
Trained epoch  13
Trained epoch  14
Trained epoch  15
Trained epoch  16
Trained epoch  17
Trained epoch  18
Trained epoch  19
Trained epoch  20
Trained epoch  21
Trained epoch  22
Trained epoch  23
Trained epoch  24
Trained epoch  25
Trained epoch  26
Trained epoch  27
Trained epoch  28
Trained epoch  29
Trained epoch  30
Trained epoch  31
Trained epoch  32
Trained epoch  33
Trained epoch  34
Trained epoch  35
Trained epoch  36
Trained epoch  37
Trained epoch  38
Trained epoch  39
Trained epoch  40
Trained epoch  41
Trained epoch  42
Trained epoch  43
Trained epoch  44
Trained epoch  45
Trained epoch  46
Trained epoch  47
Trained epoch  48
Trained epoch  49
Trained epoch  50
Trained epoch  51
Trained epoch  52
Trained epoch  53
Trained epoch  54
Trained epoch  55
Tr

<__main__.AutoRec at 0x267316a14f0>

# Metrics

In [12]:
import os
os.chdir('../metrics')
from metrics import evaluationMetrics
em = evaluationMetrics()

In [13]:
# Get test and antitest predictions
predtest = model.test(testSet)
predantitest = model.test(antitest)

# Get top N recommended movies for each user based on estimated ratings
top_10_AutoRec = em.getTopN(predantitest, minimumRating = 3.5)

In [14]:
top_10_AutoRec

defaultdict(list,
            {432: [(8873, 4.9999986),
              (6935, 4.9999933),
              (52375, 4.9999685),
              (47, 4.280805),
              (7153, 3.963354),
              (3147, 3.627515),
              (4306, 3.5696414),
              (4351, 3.503229285466356),
              (111362, 3.503229285466356),
              (3868, 3.503229285466356)],
             288: [(8873, 4.9999847),
              (6935, 4.99998),
              (52375, 4.999696),
              (2571, 4.838406),
              (2959, 4.4526258),
              (2115, 4.305665),
              (480, 4.208844),
              (1265, 3.9356227),
              (7153, 3.7091186),
              (4226, 3.607554)],
             599: [(6935, 4.999991),
              (8873, 4.999953),
              (52375, 4.9998274),
              (1291, 3.7701962),
              (55276, 3.503229285466356),
              (3868, 3.503229285466356),
              (26736, 3.503229285466356),
              (204, 3.503229285466

## Métricas de precisión: RMSE y MAE

In [15]:
# # RMSE
rmse = accuracy.rmse(predtest)

# MAE
mae = accuracy.mae(predtest)

RMSE: 2.6319
MAE:  2.3002


## Métricas de relevancia: Precision, Recall y NDCG

In [16]:
# Precision
precisions = em.getPrecision(predtest, k=10, threshold=3.5)

# Mean Average Precision
mapModel = np.mean(list(precisions.values()))

# Recall
recalls = em.getRecall(predtest, k=10, threshold=3.5)

# Mean Average Recall
marModel = np.mean(list(recalls.values()))

# Normalized discounted cummulative gain (NDCG)
ndcgs, mean_ndcg = em.getNDCG(predtest,10)

## Otras métricas de interés: Coverage, User Coverage y Novelty

In [17]:
# Coverage
coverage = em.getCoverage(top_10_AutoRec,trainSet.n_items,trainSet.all_users())

# User coverage
user_coverage = em.getUserCoverage(top_10_AutoRec, trainSet.n_users,4)

# Novelty
novelty = em.getNovelty(top_10_AutoRec,trainSet)

Por último creamos un dataframe con todas las métricas de evaluación asociadas al modelo

In [18]:
cols = ["Model","RMSE","MAE","MAP","MAR","Mean_NDCG","Coverage","User_Coverage","Novelty"]
metrics_data = []

# Append the results to the list of dictionaries
metrics_data.append({"Model": "AutoRec", "RMSE": rmse, "MAE": mae, "MAP": mapModel, "MAR": marModel,
                     "Mean_NDCG": mean_ndcg, "Coverage": coverage, "User_Coverage": user_coverage,
                     "Novelty": novelty})

# Convert the list of dictionaries into a DataFrame
metrics_df = pd.DataFrame(metrics_data, columns=cols)
metrics_df

Unnamed: 0,Model,RMSE,MAE,MAP,MAR,Mean_NDCG,Coverage,User_Coverage,Novelty
0,AutoRec,2.631931,2.300162,0.534837,0.127198,0.933373,0.018817,1.0,2701.111148


In [19]:
# Add the results to the dataframe with the metrics of all models.
em.addToMetricsDataframe(metrics_df)