In [1]:
# import useful packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# load data
data_dir = './data/'
ratings = pd.read_csv(data_dir+"ratings.dat", 
                      sep = "::", 
                      engine='python', 
                      header=None, 
                      names=['UserID','MovieID','Rating','Timestamp'])
users = pd.read_csv(data_dir+"users.dat", 
                      sep = "::", 
                      engine='python', 
                      header=None, 
                      names=['UserID','Gender','Age','Occupation','Zip-code'])
movies = pd.read_csv(data_dir+"movies.dat", 
                      sep = "::", 
                      engine='python', 
                      header=None, 
                      names=['MovieID','Title','Genres'])

#- Some MovieIDs do not correspond to a movie due to accidental duplicateentries and/or test entries
#- Movies are mostly entered by hand, so errors and inconsistencies may exist

In [3]:
# transform userID and movie Id
le = preprocessing.LabelEncoder()
le.fit(users.UserID)
le.classes_
users.UserID = le.transform(users.UserID) 
ratings.UserID = le.transform(ratings.UserID)

le = preprocessing.LabelEncoder()
le.fit(movies.MovieID)
le.classes_
movies.MovieID = le.transform(movies.MovieID) 
ratings.MovieID = le.transform(ratings.MovieID)

# split train and test set
R_train, R_test = train_test_split(ratings, test_size=0.3, random_state=42) 

In [4]:
train_size = R_train.shape[0]
test_size = R_test.shape[0]
num_users = len(set(users.UserID.values))
num_movies = len(set(movies.MovieID.values))


print(f"Size of Training set: {train_size}.")
print(f"Size of Test set: {test_size}.")
print(f"Number of Users: {num_users}.")
print(f"Number of Movies: {num_movies}")

Size of Training set: 700146.
Size of Test set: 300063.
Number of Users: 6040.
Number of Movies: 3883


In [47]:
class RecommenderSystem():
    
    def __init__(self, F, num_users, num_movies):
        np.random.seed(0)
        # initialize bias
        self.bu = np.random.normal(loc=0.0, scale=1e-4, size=num_users)
        self.bi = np.random.normal(loc=0.0, scale=1e-4, size=num_movies)  
        
        # initialize latent feature matrix
        self.pu = np.random.normal(loc=0.0, scale=1./max(1,np.sqrt(F)), size=[num_users,F])
        self.qi = np.random.normal(loc=0.0, scale=1./max(1,np.sqrt(F)), size=[num_movies,F])  
        
    def loss(self,R_new=None):
        if R_new is None:
            users = self.R['UserID'].values
            movies = self.R['MovieID'].values
            ratings = self.R['Rating'].values
        else:
            users = R_new['UserID'].values
            movies = R_new['MovieID'].values
            ratings = R_new['Rating'].values
    
        r_pred = self.miu + self.bu[users] + self.bi[movies] +np.sum(
            np.multiply(self.pu[users], self.qi[movies]),axis=1) 

        return np.mean(np.square(r_pred -ratings ))
    
    def train(self, R, penalty=1e-3, learning_rate=1e-3, tol=1e-5, epoch=10, batch_size= 2000):
        self.R = R
        self.miu = np.mean(self.R.Rating)
        
        loss0 = self.loss()
        loss_lst = []
        
        for e in range(epoch):
            for i in range(0, self.R.shape[0], batch_size):
                users = self.R['UserID'].values[i:i+batch_size]
                movies = self.R['MovieID'].values[i:i+batch_size]
                ratings = self.R['Rating'].values[i:i+batch_size]

                r_pred = miu + bu[self.R['UserID']] + bi[self.R['MovieID']]
                pred_error = ratings-r_pred[i:i+batch_size]
                
                self.bu[users] += learning_rate * pred_error
                self.bi[movies] += learning_rate * pred_error
                self.pu[users] += learning_rate*(np.multiply(
                    self.qi[movies], pred_error.reshape(-1,1))-penalty*self.pu[users])
                self.qi[movies] += learning_rate*(np.multiply(
                    self.pu[users], pred_error.reshape(-1,1))-penalty*self.qi[movies])
            
            loss = self.loss()
            loss_lst.append(loss)

            print(f"Epoch {e}: loss {loss}.")
            d = np.abs(loss-loss0)
            if d<tol*loss0:
                break
            loss0=loss
    
    def predict(self,R_new):
        return self.loss(R_new)

In [48]:
recommender = RecommenderSystem(F=0,num_users= num_users,num_movies=num_movies)
recommender.train(R_train, epoch=5, batch_size=2000, learning_rate =0.01)

Epoch 0: loss 1.0679758950031966.
Epoch 1: loss 0.9457522756203176.
Epoch 2: loss 0.8798357774916524.
Epoch 3: loss 0.8702264006172009.
Epoch 4: loss 0.9169241449969636.


With learning rate =0.01, the optimization process converge very fast.

In [49]:
recommender.predict(R_test)

0.9493841322331164

In [55]:
Fs = [1,3,5,7,9]
mse = []
for f in Fs:
    recommender = RecommenderSystem(F=f,num_users= num_users,num_movies=num_movies)
    recommender.train(R_train, epoch=10, batch_size=2000, learning_rate =0.001)
    mse.append(recommender.predict(R_test))

Epoch 0: loss 2.209493437232628.
Epoch 1: loss 2.190361438371178.
Epoch 2: loss 2.1729209286144426.
Epoch 3: loss 2.1571796562271905.
Epoch 4: loss 2.143148721800058.
Epoch 5: loss 2.1308426352953815.
Epoch 6: loss 2.1202793892689478.
Epoch 7: loss 2.111480548767233.
Epoch 8: loss 2.1044713585122006.
Epoch 9: loss 2.0992808681039317.
Epoch 0: loss 1.564895666958643.
Epoch 1: loss 1.5434580936881657.
Epoch 2: loss 1.5229915909565355.
Epoch 3: loss 1.5034967835699136.
Epoch 4: loss 1.484975563434647.
Epoch 5: loss 1.467431108743002.
Epoch 6: loss 1.4508679096365897.
Epoch 7: loss 1.435291800558143.
Epoch 8: loss 1.4207099995502943.
Epoch 9: loss 1.4071311548106544.
Epoch 0: loss 1.4292303462999405.
Epoch 1: loss 1.407118767378108.
Epoch 2: loss 1.385821067583781.
Epoch 3: loss 1.3653347299777476.
Epoch 4: loss 1.3456580083264864.
Epoch 5: loss 1.3267899237492653.
Epoch 6: loss 1.3087302652247417.
Epoch 7: loss 1.291479593951426.
Epoch 8: loss 1.275039251583362.
Epoch 9: loss 1.2594113723