In [22]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [23]:
class MF(object):
    def __init__(self, Y_data, K, lam=0.1, Xinit=None, Winit=None, lr=0.5, max_iter=1000, print_every=100, user_based=1):
        self.Y_raw_data = Y_data
        self.K = K
        # regularization parameter
        self.lam = lam
        # learning rate for gradient descent
        self.lr = lr
        # maximum number of iterations
        self.max_iter = max_iter
        # print results after print_every iterations
        self.print_every = print_every
        # user-based or item-based
        self.user_based = user_based
        # number of users, items, and ratings. Remember to add 1 since id starts form 0
        self.n_users = int(np.max(Y_data[:, 0])) + 1
        self.n_items = int(np.max(Y_data[:, 1])) + 1
        self.n_ratings = Y_data.shape[0]
        
        if Xinit is None:
            self.X = np.random.randn(self.n_items, K)
        else: 
            self.X = Xinit
            
        if Winit is None:
            self.W = np.random.randn(K, self.n_users)
        else:
            self.W = Winit
            
        # normalized data, update later in normallized_Y function
        self.Y_data_n = self.Y_raw_data.copy()
        
    def normalize_Y(self):
        if self.user_based:
            user_col = 0
            item_col = 1
            n_objects = self.n_users
            
        # if we want to normalize based on item, just switch first two columns of data
        else:
            user_col = 1
            item_col = 0
            n_objects = self.n_items
            
        users = self.Y_raw_data[:, user_col]
        self.mu = np.zeros((n_objects,))
        for n in range(n_objects):
            # row indices of rating done by user n
            # since indices need to be integers, we need to convert
            ids = np.where(users == n)[0].astype(np.int32)
            # indices of all ratings associated with user n
            item_ids = self.Y_data_n[ids, item_col]
            # and the corresponding ratings
            ratings = self.Y_data_n[ids, 2]
            # take mean
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0 # to avoid empty array and nan value
            self.mu[n] = m
            # normalize
            self.Y_data_n[ids, 2] = ratings - self.mu[n]
            
    def loss(self):
        L = 0
        for i in range(self.n_ratings):
            # user, item, rating
            n, m, rate = int(self.Y_data_n[i, 0]), int(self.Y_data_n[i, 1]), self.Y_data_n[i, 2]
            L += 0.5*(rate - self.X[m, :].dot(self.W[:, n]))**2
            
        # take average
        L /= self.n_ratings
        # regularization
        L += 0.5*self.lam*(np.linalg.norm(self.X, 'fro') + np.linalg.norm(self.W, 'fro'))
        return L
    
    def get_items_rated_by_user(self, user_id):
        """
        get all items which are rated by user user_id, and the corresponding ratings
        """
        ids = np.where(self.Y_data_n[:, 0] == user_id)[0]
        item_ids = self.Y_data_n[ids, 1].astype(np.int32)
        ratings = self.Y_data_n[ids, 2]
        return (item_ids, ratings)
    
    def get_users_who_rate_item(self, item_id):
        """
        get all users who rated item item_id and get the corresponding ratings
        """
        ids = np.where(self.Y_data_n[:, 1] == item_id)[0]
        user_ids = self.Y_data_n[ids, 0].astype(np.int32)
        ratings = self.Y_data_n[ids, 2]
        return (user_ids, ratings)
    
    def updateX(self):
        for m in range(self.n_items):
            user_ids, ratings = self.get_users_who_rate_item(m)
            Wm = self.W[:, user_ids]
            # gradient
            grad_xm = -(ratings - self.X[m, :].dot(Wm)).dot(Wm.T)/self.n_ratings + self.lam*self.X[m, :]
            self.X[m, :] -= self.lr*grad_xm.reshape((self.K,))
            
    def updateW(self):
        for n in range(self.n_users):
            item_ids, ratings = self.get_items_rated_by_user(n)
            Xn = self.X[item_ids, :]
            # gradient
            grad_wm = -Xn.T.dot(ratings - Xn.dot(self.W[:, n]))/self.n_ratings + self.lam*self.W[:, n]
            self.W[:, n] -= self.lr*grad_wm.reshape((self.K,))

    def fit(self):
        self.normalize_Y()
        for it in range(self.max_iter):
            self.updateX()
            self.updateW()
            if (it + 1) % self.print_every == 0:
                rmse_train = self.evaluate_RMSE(self.Y_raw_data)
                print ('iter =', it + 1, ', loss =', self.loss(), ', RMSE train =', rmse_train)
                
    def pred(self, u, i):
        """
        predict the rating of user u for item i
        if you need the un
        """
        u = int(u)
        i = int(i)
        if self.user_based:
            bias = self.mu[u]
        else:
            bias = self.mu[i]
        pred = self.X[i, :].dot(self.W[:, u]) + bias
        # truncate if results are out of range [0, 5]
        if pred < 0:
            return 0
        if pred > 5:
            return 5
        return pred
    
    def pred_for_user(self, user_id):
        """
        predict ratings one user give all unrated items
        """
        ids = np.where(self.Y_data_n[:, 0] == user_id)[0]
        items_rated_by_u = self.Y_data_n[ids, 1].tolist()
        
        y_pred = self.X.dot(self.W[: user_id]) + self.mu[user_id]
        predicted_ratings = []
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                predicted_ratings.append((i, y_pred[i]))
                
        return predicted_ratings
    
    def evaluate_RMSE(self, rate_test):
        n_tests = rate_test.shape[0]
        SE = 0
        for n in range(n_tests):
            pred = self.pred(rate_test[n, 0], rate_test[n, 1])
            SE += (pred - rate_test[n, 2])**2
            
        RMSE = np.sqrt(SE/n_tests)
        return RMSE

In [24]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ml-100k/ub.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ub.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.values
rate_test = ratings_test.values

# indices start from 0
# rate_train[:, :2] -= 1
# rate_test[:, :2] -= 1

In [25]:
rs = MF(rate_train, K=10, lam=.1, print_every=10, lr=0.75, max_iter=100, user_based=1)
rs.fit()

iter = 10 , loss = 5.638501768416784 , RMSE train = 1.2082813651849553
iter = 20 , loss = 2.6318962004957926 , RMSE train = 1.0379857329497597
iter = 30 , loss = 1.3398929281366536 , RMSE train = 1.029529411451851
iter = 40 , loss = 0.7512589476577023 , RMSE train = 1.0292110434546553
iter = 50 , loss = 0.48149196080217505 , RMSE train = 1.0292102143517956
iter = 60 , loss = 0.35778775331372314 , RMSE train = 1.0292132401833765
iter = 70 , loss = 0.30105891092454445 , RMSE train = 1.029214076163222
iter = 80 , loss = 0.2750438513559969 , RMSE train = 1.0292142730230034
iter = 90 , loss = 0.26311371540170514 , RMSE train = 1.0292143179243967
iter = 100 , loss = 0.25764272746308026 , RMSE train = 1.0292143280626591


In [26]:
RMSE = rs.evaluate_RMSE(rate_test)
print('User-based MF, RMSE =', RMSE)

User-based MF, RMSE = 1.0603798989561035


In [28]:
# Chuan hoa dua tren item:
rs = MF(rate_train, K=10, lam=.1, print_every=10, lr=.75, max_iter=100, user_based=0)
rs.fit()

iter = 10 , loss = 5.624306265507605 , RMSE train = 1.1769947635507407
iter = 20 , loss = 2.616851378258404 , RMSE train = 1.0049328619679445
iter = 30 , loss = 1.3239071451008388 , RMSE train = 0.9964453753376745
iter = 40 , loss = 0.7347632074508645 , RMSE train = 0.9961649766499753
iter = 50 , loss = 0.4647469546755554 , RMSE train = 0.9961740516627544
iter = 60 , loss = 0.34092498872904275 , RMSE train = 0.9961792365938088
iter = 70 , loss = 0.2841412877690985 , RMSE train = 0.9961805282165835
iter = 80 , loss = 0.2581008286185066 , RMSE train = 0.9961808205943621
iter = 90 , loss = 0.2461589652529975 , RMSE train = 0.9961808854917167
iter = 100 , loss = 0.2406825695105039 , RMSE train = 0.9961808998148076


In [29]:
RMSE = rs.evaluate_RMSE(rate_test)
print('Item-based MF, RMSE =', RMSE)

Item-based MF, RMSE = 1.0498047474766652


In [30]:
# Khong su dung regularization => lam=0
rs = MF(rate_train, K=10, lam=0, print_every=10, lr=.75, max_iter=100, user_based=0)
rs.fit()
RMSE = rs.evaluate_RMSE(rate_test)
print('Item-based MF, RMSE =', RMSE)

iter = 10 , loss = 5.008111158239367 , RMSE train = 2.1330315600848158
iter = 20 , loss = 4.738749269778628 , RMSE train = 2.1119351809271993
iter = 30 , loss = 4.493455737514643 , RMSE train = 2.0911553101231086
iter = 40 , loss = 4.269347736692988 , RMSE train = 2.0707161198150197
iter = 50 , loss = 4.063969354204126 , RMSE train = 2.050634463976294
iter = 60 , loss = 3.8752170163473574 , RMSE train = 2.0309396271592246
iter = 70 , loss = 3.701279814129225 , RMSE train = 2.011644898692956
iter = 80 , loss = 3.5405913963927556 , RMSE train = 1.9926770447928284
iter = 90 , loss = 3.3917909200809695 , RMSE train = 1.9741479462013052
iter = 100 , loss = 3.2536911470852665 , RMSE train = 1.9560391483595476
Item-based MF, RMSE = 2.0397682685002843


In [18]:
# Apply to MovieLens 1M
r_col = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('archive-2/ratings.dat', sep='::', names=r_cols, encoding='latin-1')
ratings = ratings_base.values

  ratings_base = pd.read_csv('archive-2/ratings.dat', sep='::', names=r_cols, encoding='latin-1')


In [20]:
from sklearn.model_selection import train_test_split

rate_train, rate_test = train_test_split(ratings, test_size=0.33, random_state=42)
print(rate_train.shape, rate_test.shape)

(670140, 4) (330069, 4)


In [21]:
rs = MF(rate_train, K=2, lam=0.1, print_every=2, lr=2, max_iter=10, user_based=0)
rs.fit()
RMSE = rs.evaluate_RMSE(rate_test)
print('Item-based MF, RMSE =', RMSE)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


iter = 2 , loss = 6.755689218032276 , RMSE train = 1.1159295696738227
iter = 4 , loss = 4.323288464167259 , RMSE train = 1.0009970930409624
iter = 6 , loss = 2.832858344479268 , RMSE train = 0.977951188444279
iter = 8 , loss = 1.8901251129678633 , RMSE train = 0.9740304153465151
iter = 10 , loss = 1.2886585001304676 , RMSE train = 0.9733840440896254
Item-based MF, RMSE = 0.9815911974470984
