In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [2]:
class MF:
    def __init__(self, Y_data, K, lam=0.1, learning_rate=0.5, max_iter=1000,
                 print_every=100, user_based=1):
        self.Y_raw_data = Y_data
        self.K = K
        self.lam = lam # Regularization parameter
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.print_every = print_every
        self.user_based = user_based
        self.n_users = np.max(Y_data[:, 0]) + 1
        self.n_items = np.max(Y_data[:, 1]) + 1
        self.n_ratings =Y_data.shape[0]
        self.X = np.random.randn(self.n_items, K)
        self.W = np.random.randn(K, self.n_users)
        self.b = np.random.randn(self.n_items)
        self.d = np.random.randn(self.n_users)
        
        self.Y_data_normalized = self.Y_raw_data.copy()
        
    
    def normalize_Y(self): # similar to NBCF
        if self.user_based:
            user_col = 0
            item_col = 1
            n_objects = self.n_users
        else:
            user_col = 1
            item_col = 0
            n_objects = self.n_items
        
        users = self.Y_raw_data[:, user_col] 
        self.mu = np.zeros((n_objects,))

        for n in range(n_objects):
            user_ids = np.where(users == n)[0]
            #item_ids = self.Y_data_normalized[user_ids, item_col]
            ratings = self.Y_data_normalized[user_ids, 2]
            
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0
            self.mu[n] = m
            self.Y_data_normalized[user_ids, 2] = ratings - self.mu[n]
    
    
    def loss(self):
        L = 0
        for i in range(self.n_ratings):
            n = self.Y_data_normalized[i, 0] # user index
            m = self.Y_data_normalized[i, 1] # item index
            rate = self.Y_data_normalized[i, 2]
            L += 0.5*(self.X[m, :].dot(self.W[:, n]) + self.b[m] + self.d[n] - rate)**2
        L /= self.n_ratings
        L += 0.5*self.lam*(np.linalg.norm(self.X, 'fro') + np.linalg.norm(self.W, 'fro'))
        return L
    
    
    def get_items_rated_by_user(self, user_id):
        user_ids = np.where(self.Y_data_normalized[:, 0] == user_id)[0]
        item_ids = self.Y_data_normalized[user_ids, 1]
        ratings = self.Y_data_normalized[user_ids, 2]
        return (item_ids, ratings)
            

    def get_users_who_rate_item(self, item_id):
        ids = np.where(self.Y_data_normalized[:,1] == item_id)[0] 
        user_ids = self.Y_data_normalized[ids, 0]
        ratings = self.Y_data_normalized[ids, 2]
        return (user_ids, ratings)
    
    
    def updateXb(self):
        for m in range(self.n_items):
            user_ids, ratings = self.get_users_who_rate_item(m)
            Wm = self.W[:,user_ids]
            dm = self.d[user_ids]
            grad_xm = (self.X[m, :].dot(Wm) + self.b[m] + dm - ratings).dot(Wm.T)/self.n_ratings + self.lam*self.X[m, :]
            grad_bm = np.sum(self.X[m, :].dot(Wm) + self.b[m] + dm - ratings)/self.n_ratings
            self.X[m, :] -= self.learning_rate*grad_xm
            self.b[m] -= self.learning_rate*grad_bm
            
    
    def updateWd(self):
        for n in range(self.n_users):
            item_ids, ratings = self.get_items_rated_by_user(n)
            Xn = self.X[item_ids, :]
            bn = self.b[item_ids]
            grad_wn = Xn.T.dot(Xn.dot(self.W[:, n]) + bn + self.d[n] - ratings)/self.n_ratings + self.lam*self.W[:, n]
            grad_dn = np.sum(Xn.dot(self.W[:, n]) + bn + self.d[n] - ratings)/self.n_ratings
            self.W[:, n] -= self.learning_rate*grad_wn
            self.d[n] -= self.learning_rate*grad_dn
            
            
    def fit(self):
        self.normalize_Y()
        for it in range(self.max_iter):
            self.updateXb()
            self.updateWd()
            if (it + 1) % self.print_every == 0:
                rmse_train = self.evaluate_RMSE(self.Y_raw_data)
                print('Iter', it + 1, ': Loss =', self.loss(), 'RMSE train =', rmse_train)
    
    
    def pred(self, u, i):
        """
        Predict the rating of user u for item i
        """
        if self.user_based:
            bias = self.mu[u]
        else: 
            bias = self.mu[i]
        pred = self.X[i, :].dot(self.W[:, u]) + self.b[i] + self.d[u] + bias
        if pred < 0:
            return 0
        if pred > 5:
            return 5
        return pred
    
    
    def pred_for_user(self, user_id):
        """
        Predict ratings one user give all unrated items
        """
        ids = np.where(self.Y_data_normalized[:, 0] == user_id)[0]
        items_rated_by_u = self.Y_data_normalized[ids, 1].tolist()
        y_pred = self.X.dot(self.W[:, user_id]) + self.mu[user_id]
        predicted_ratings = []
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                predicted_ratings.append((i, y_pred[i]))
        return predicted_ratings
    
    
    def evaluate_RMSE(self, rate_test):
        n_test = rate_test.shape[0]
        SE = 0
        for n in range(n_test):
            pred = self.pred(rate_test[n, 0], rate_test[n, 1])
            SE += (pred - rate_test[n, 2])**2
        RMSE = np.sqrt(SE/n_test)
        return RMSE

In [4]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ml-100k/ub.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ub.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.values
rate_test = ratings_test.values

rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

In [5]:
# user-based
rs = MF(rate_train, K = 50, lam = 0.01, print_every = 10, learning_rate = 50, max_iter = 100, user_based = 1)
rs.fit()
print('User-based MF: RMSE=', rs.evaluate_RMSE(rate_test))

Iter 10 : Loss = 0.4841081749193071 RMSE train = 1.2077939285898247
Iter 20 : Loss = 0.3436362382232493 RMSE train = 1.104496045736166
Iter 30 : Loss = 0.2905156405447963 RMSE train = 1.0581485944876774
Iter 40 : Loss = 0.2628084955829769 RMSE train = 1.0316478112645313
Iter 50 : Loss = 0.24590514104826142 RMSE train = 1.0144637202107618
Iter 60 : Loss = 0.23462139353878203 RMSE train = 1.0024693409568268
Iter 70 : Loss = 0.22663871598735946 RMSE train = 0.993700636113005
Iter 80 : Loss = 0.2207534910828691 RMSE train = 0.9870572434104572
Iter 90 : Loss = 0.21627631583929002 RMSE train = 0.9818812778049109
Iter 100 : Loss = 0.2127843322363196 RMSE train = 0.9777611200920617
User-based MF: RMSE= 1.0448392755913964


In [7]:
# item-based
rs = MF(rate_train, K = 50, lam = 0.01, print_every = 10, learning_rate = 50, max_iter = 100, user_based = 0)
rs.fit()
RMSE = rs.evaluate_RMSE(rate_test)
print('Item-based MF: RMSE=', RMSE)

Iter 10 : Loss = 0.4645295997241178 RMSE train = 1.1798520265945291
Iter 20 : Loss = 0.32649877913041425 RMSE train = 1.0711271889099436
Iter 30 : Loss = 0.27568453831116296 RMSE train = 1.0262428237866315
Iter 40 : Loss = 0.24978970107934437 RMSE train = 1.0025221378504436
Iter 50 : Loss = 0.2343354365792778 RMSE train = 0.9880238761888938
Iter 60 : Loss = 0.22421131988611998 RMSE train = 0.9783816104798011
Iter 70 : Loss = 0.21715896733764725 RMSE train = 0.9715935046017037
Iter 80 : Loss = 0.21202534297181388 RMSE train = 0.9666102493651005
Iter 90 : Loss = 0.20816117768576173 RMSE train = 0.9628333016336255
Iter 100 : Loss = 0.2051742894145638 RMSE train = 0.9598998049299224
Item-based MF: RMSE= 1.0451463578139952
