# AutoEncoders for recommender systems
Recommendations based on Deep Learning.

In [1]:
import os
import csv
import sys
import re

from surprise import Dataset
from surprise import Reader

from collections import defaultdict
import numpy as np

In [2]:
# define some paths to download data
ratingsPath = "../data/ml-latest-small/ratings.csv"
moviesPath = "../data/ml-latest-small/movies.csv"

# define reader instance to download data
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
# download dataset to path
ratingsDataset = Dataset.load_from_file(ratingsPath, reader=reader)

# now parse movies dataset
movieID_to_name = {}
name_to_movieID = {}
with open(moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
    movieReader = csv.reader(csvfile)
    next(movieReader)
    for row in movieReader:
        movieID = int(row[0])
        movieName = row[1]
        movieID_to_name[movieID] = movieName
        name_to_movieID[movieName] = movieID

In [3]:
# get user ratings from ratingsDataset based on user
def getUserRatings(user):
    userRatings = []
    hitUser = False
    with open(ratingsPath, newline='') as csvfile:
        ratingReader = csv.reader(csvfile)
        next(ratingReader)
        for row in ratingReader:
            userID = int(row[0])
            if (user == userID):
                movieID = int(row[1])
                rating = float(row[2])
                userRatings.append((movieID, rating))
                hitUser = True
            if (hitUser and (user != userID)):
                break

    return userRatings

In [4]:
# now it is important to get popularity ranks to get some metrics
ratings = defaultdict(int)
rankings = defaultdict(int)
with open(ratingsPath, newline='') as csvfile:
    ratingReader = csv.reader(csvfile)
    next(ratingReader)
    for row in ratingReader:
        movieID = int(row[1])
        ratings[movieID] += 1
    rank = 1
    for movieID, ratingCount in sorted(ratings.items(), key=lambda x: x[1], reverse=True):
        rankings[movieID] = rank
        rank +=1

## Define the AutoEncoder arquitecture

In [5]:
import numpy as np
import tensorflow as tf

In [6]:
print("Tensorflow version: ", tf.__version__)

Tensorflow version:  2.17.0


In [7]:
class AutoRec(object):
    def __init__(
            self,
            visibleDimensions,
            epochs=200,
            hiddenDimensions=50,
            learningRate=0.1,
            batchSize=100
            ):
        
       self.visibleDimensions = visibleDimensions
       self.epochs = epochs
       self.hiddenDimensions = hiddenDimensions
       self.learningRate = learningRate
       self.batchSize = batchSize
       self.optimizer = tf.keras.optimizers.RMSprop(self.learningRate)

    def Train(self, X):
        self.initialize_weights_biases()
        for epoch in range(self.epochs):
            for i in range(0, X.shape[0], self.batchSize):
                epochX = X[i:i+self.batchSize]
                self.run_optimization(epochX)
            print("Trained epoch", epoch)
    
    def GetRecommendations(self, inputUser):
        # feed a single user and get output from the output layer
        rec = self.neural_net(inputUser)
        # return type is a eager tensor
        return rec[0]
    
    def initialize_weights_biases(self,):
        # random initialize weights for hidden and output
        self.weights = {
            'h1': tf.Variable(tf.random.normal([self.visibleDimensions, self.hiddenDimensions])),
            'out': tf.Variable(tf.random.normal([self.hiddenDimensions, self.visibleDimensions]))
        }
        # same for biases
        self.biases = {
            'b1': tf.Variable(tf.random.normal([self.hiddenDimensions])),
            'out': tf.Variable(tf.random.normal([self.visibleDimensions]))
        }
    
    def neural_net(self, inputUser):
        # define arquitecture
        # input layer
        self.inputLayer = inputUser

        # hidden layer
        hidden = tf.nn.sigmoid(tf.add(tf.matmul(self.inputLayer, self.weights['h1']), self.biases['b1']))
        # output layer for predictions
        self.outputLayer = tf.nn.sigmoid(tf.add(tf.matmul(hidden, self.weights['out']), self.biases['out']))

        return self.outputLayer

    def run_optimization(self, inputUser):
        with tf.GradientTape() as g:
            pred = self.neural_net(inputUser)
            loss = tf.keras.losses.MSE(inputUser, pred)

        trainable_variables = list(self.weights.values()) + list(self.biases.values())
        gradients = g.gradient(loss, trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, trainable_variables))


### Define AutoRec wrapper with surprise

In [8]:
from surprise import AlgoBase
from surprise import PredictionImpossible

In [9]:
class AutoRecAlgorithm(AlgoBase):
    def __init__(self, epochs=100, hiddenDim=100, learningRate=0.01, batchSize=100, sim_options={}):
        AlgoBase.__init__(self)
        self.epochs = epochs
        self.hiddenDim = hiddenDim
        self.learningRate = learningRate
        self.batchSize = batchSize

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)

        numUsers = trainset.n_users
        numItems = trainset.n_items

        trainingMatrix = np.zeros([numUsers, numItems,], dtype=np.float32)
        for (uid, iid, rating) in trainset.all_ratings():
            # normalized ratings for not using softmax
            trainingMatrix[int(uid), int(iid)] = rating / 5.0

        # create RBM with (n_items * rating_values) visible nodes
        autoRec = AutoRec(
            trainingMatrix.shape[1],
            hiddenDimensions=self.hiddenDim,
            learningRate=self.learningRate,
            batchSize=self.batchSize,
            epochs=self.epochs
        )
        autoRec.Train(trainingMatrix)
        
        self.predictedRatings = np.zeros([numUsers, numItems], dtype=np.float32)
        for uiid in range(trainset.n_users):
            if (uiid % 50 == 0):
                print("Processing user ", uiid)
            recs = autoRec.GetRecommendations([trainingMatrix[uiid]])
            # take a normalized rating and re-escale it
            for itemID, rec in enumerate(recs):
                self.predictedRatings[uiid, itemID] = rec * 0.5
        
        return self
    
    def estimate(self, u, i):
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown')
        
        rating = self.predictedRatings[u, i]
        if (rating < 0.001):
            raise PredictionImpossible('No valid prediction exists')
        
        return rating


## Get some recomendations based on AutoRec

In [10]:
import sys
sys.path.append('..')
from Framework.EvaluationData import EvaluationData
from Framework.RecommenderMetrics import RecommenderMetrics
import random

In [11]:
np.random.seed(0)
random.seed(0)

In [12]:
# define evaluation data
# get evaluation data
evaluationData = EvaluationData(ratingsDataset, rankings)
# define AutoRec instance
autorec = AutoRecAlgorithm()
autorec.fit(ratingsDataset.build_full_trainset())
predictions = autorec.test(evaluationData.GetTestSet())
print("RMSE SVD", RecommenderMetrics.RMSE(predictions))
print("MAE SVD", RecommenderMetrics.MAE(predictions))
print()

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Trained epoch 0
Trained epoch 1
Trained epoch 2
Trained epoch 3
Trained epoch 4
Trained epoch 5
Trained epoch 6
Trained epoch 7
Trained epoch 8
Trained epoch 9
Trained epoch 10
Trained epoch 11
Trained epoch 12
Trained epoch 13
Trained epoch 14
Trained epoch 15
Trained epoch 16
Trained epoch 17
Trained epoch 18
Trained epoch 19
Trained epoch 20
Trained epoch 21
Trained epoch 22
Trained epoch 23
Trained epoch 24
Trained epoch 25
Trained epoch 26
Trained epoch 27
Trained epoch 28
Trained epoch 29
Trained epoch 30
Trained epoch 31
Trained epoch 32
Trained epoch 33
Trained epoch 34
Trained epoch 35
Trained epoch 36
Trained epoch 37
Trained epoch 38
Trained epoch 39
Trained epoch 40
Trained epoch 41
Trained epoch 42
Trained epoch 43
Trained epoch 44
Trained epoch 45
Trained epoch 46
Trained epoch 47
Trained epoch 48
Trained epoch 49
Trained epoch 50
Trained epoch 51
Trained epoch 52
Tr

In [13]:
# funtion to ger movie name based on movie ID
def getMovieName(movieID):
    if movieID in movieID_to_name:
        return movieID_to_name[movieID]
    else:
        return ""

In [15]:
# see recomendations
# let's see some recommendations
testSubject = 85
k = 10

trainSet = evaluationData.GetFullTrainSet()
testSet = evaluationData.GetAntiTestSetForUser(testSubject)

predictions = autorec.test(testSet)
recommendations = []
for userID, movieID, actualRating, EstimatedRating, _ in predictions:
    intMovieID = int(movieID)
    recommendations.append((intMovieID, EstimatedRating))

recommendations.sort(key=lambda x: x[1], reverse=True)

print("#"*10, "AutoRec recommendations", "#"*10)
for ratings in recommendations[:k]:
    print(getMovieName(ratings[0]), ratings[1])
print()

########## AutoRec recommendations ##########
Grumpier Old Men (1995) 3.501556983616962
Heat (1995) 3.501556983616962
Seven (a.k.a. Se7en) (1995) 3.501556983616962
Usual Suspects, The (1995) 3.501556983616962
From Dusk Till Dawn (1996) 3.501556983616962
Bottle Rocket (1996) 3.501556983616962
Rob Roy (1995) 3.501556983616962
Desperado (1995) 3.501556983616962
Clerks (1994) 3.501556983616962
Dumb & Dumber (Dumb and Dumber) (1994) 3.501556983616962



## Tunning RBM

In [17]:
from surprise.model_selection import GridSearchCV

In [None]:
# define a param grid
param_grid = {'hiddenDim': [20, 10], 'learningRate': [0.1, 0.01]}
gs = GridSearchCV(AutoRecAlgorithm, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(ratingsDataset)

In [19]:
# now explore best metrics and params
print("Best RMSE score attained: ", gs.best_score['rmse'])
print(gs.best_params['rmse'])

Best RMSE score attained:  1.1342662605735316
{'hiddenDim': 10, 'learningRate': 0.1}
