In [30]:
import pandas as pd
import numpy as np

In [None]:
class MFCF():
    """Matrix-factorisation collaborative filtering with item and user biases.

    The predicted rating of user ``n`` for item ``m`` is::

        X[m] . W[:, n] + b[m] + d[n]

    where ``X`` is ``(n_items, K)``, ``W`` is ``(K, n_users)``, ``b`` is
    ``(n_items,)`` and ``d`` is ``(n_users,)``.  Training alternates between
    gradient steps on ``(W, d)`` with ``(X, b)`` fixed and gradient steps on
    ``(X, b)`` with ``(W, d)`` fixed.

    Parameters
    ----------
    Y : numpy.ndarray
        2-D array with one rating per row: column 0 is the user id, column 1
        the item id, column 2 the rating.  Ids are used as 0-based array
        indices; the number of users/items is ``max(id) + 1``.
    K : int
        Number of latent factors.
    lam : float
        L2 regularisation weight applied to ``X`` and ``W`` (not to biases).
    Xinit, Winit : array-like or None
        Optional initial values for ``X`` / ``W``.  They are copied, so the
        caller's arrays are never modified by training.
    learning_rate : float
        Step size of every gradient step.
    max_iter : int
        Number of outer (alternating) iterations run by :meth:`fit`.
    print_every : int
        :meth:`fit` prints loss and training RMSE every ``print_every``
        outer iterations.
    inner_iter : int
        Number of gradient steps taken per user / per item inside one outer
        iteration (the original implementation hard-coded 30).
    """

    # Bounds used to clamp predictions in `pred` and `evaluate_RMSE`.
    _RATING_MIN = 0
    _RATING_MAX = 5

    def __init__(self, Y, K, lam = 0.1,
                 Xinit = None, Winit = None,
                 learning_rate = 0.5, max_iter = 1000, print_every = 100,
                 inner_iter = 30):
        self.Y = Y
        # Cast the id columns to int so that a float-typed Y (e.g. one that
        # carries fractional ratings) can still be used for array indexing.
        self.user = self.Y[:,0].astype(int)
        self.n_users = int(np.max(self.user)) + 1
        self.item = self.Y[:,1].astype(int)
        self.n_items = int(np.max(self.item)) + 1
        self.rating = self.Y[:,2]
        self.n_ratings = self.Y.shape[0]
        self.K = K
        self.lam = lam
        # np.array(..., dtype=float) copies: training updates self.X / self.W
        # in place and must not write through to the caller's arrays.
        # The draw order X, W, b, d is kept so seeded runs are unchanged.
        self.X = np.random.randn(self.n_items, self.K) if Xinit is None else np.array(Xinit, dtype=float)
        self.W = np.random.randn(self.K, self.n_users) if Winit is None else np.array(Winit, dtype=float)
        self.b = np.random.randn(self.n_items)
        self.d = np.random.randn(self.n_users)
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.print_every = print_every
        self.inner_iter = inner_iter
        # Row indices of Y grouped by user / by item.  They do not change
        # during training, so they are computed once here instead of being
        # searched for again on every update.
        self._rows_by_user = self._group_rows(self.user, self.n_users)
        self._rows_by_item = self._group_rows(self.item, self.n_items)

    @staticmethod
    def _group_rows(keys, n_groups):
        """Return a list whose g-th entry holds the (ascending) positions where ``keys == g``."""
        # A stable sort keeps positions in ascending order inside each group,
        # matching what np.where(keys == g)[0] would return.
        order = np.argsort(keys, kind="stable")
        bounds = np.searchsorted(keys[order], np.arange(n_groups + 1))
        return [order[bounds[g]:bounds[g + 1]] for g in range(n_groups)]

    def _raw_scores(self, users, items):
        """Unclamped predictions for paired integer arrays ``users`` / ``items``."""
        # Row-wise dot product of X[items[i]] with W[:, users[i]].
        dots = np.einsum('ik,ki->i', self.X[items], self.W[:, users])
        return dots + self.b[items] + self.d[users]

    def loss(self):
        """Return the regularised training loss.

        Half the mean squared error over all training ratings plus
        ``0.5 * lam * (||X||^2 + ||W||^2)``.  Ratings are used as stored
        (no integer truncation).
        """
        error = self._raw_scores(self.user, self.item) - self.rating
        L = 0.5 * np.sum(error ** 2) / self.n_ratings
        L += 0.5 * self.lam * (np.sum(self.X ** 2) + np.sum(self.W ** 2))
        return L

    def updateWd(self):
        """Take ``inner_iter`` gradient steps on each user's factors and bias (X, b fixed)."""
        for n in range(self.n_users):
            ids = self._rows_by_user[n]
            item_ids = self.item[ids]
            rating = self.rating[ids]
            # Fancy indexing copies; X and b are constant during this method.
            xn = self.X[item_ids]  # (n_rated, K)
            bn = self.b[item_ids]  # (n_rated,)
            for _ in range(self.inner_iter):
                wn = self.W[:,n]  # (K,)
                dn = self.d[n]    # scalar
                error = xn.dot(wn) + bn + dn - rating  # (n_rated,)
                # Gradients are normalised by the total number of ratings,
                # matching the normalisation used in `loss`.
                grad_wn = xn.T.dot(error) / self.n_ratings + self.lam * wn  # (K,)
                grad_dn = np.sum(error) / self.n_ratings
                self.W[:,n] -= self.learning_rate * grad_wn
                self.d[n] -= self.learning_rate * grad_dn

    def updateXb(self):
        """Take ``inner_iter`` gradient steps on each item's factors and bias (W, d fixed)."""
        for m in range(self.n_items):
            ids = self._rows_by_item[m]
            user_ids = self.user[ids]
            rating = self.rating[ids]
            # Fancy indexing copies; W and d are constant during this method.
            wm = self.W[:,user_ids]  # (K, n_raters)
            dm = self.d[user_ids]    # (n_raters,)
            for _ in range(self.inner_iter):
                xm = self.X[m]  # (K,)
                bm = self.b[m]  # scalar
                error = xm.dot(wm) + bm + dm - rating  # (n_raters,)
                grad_xm = wm.dot(error) / self.n_ratings + self.lam * xm  # (K,)
                grad_bm = np.sum(error) / self.n_ratings
                self.X[m] -= self.learning_rate * grad_xm
                self.b[m] -= self.learning_rate * grad_bm

    def fit(self):
        """Run ``max_iter`` alternating updates, printing progress every ``print_every`` iterations."""
        for it in range(self.max_iter):
            self.updateWd()
            self.updateXb()
            if (it + 1) % self.print_every == 0:
                rmse_train = self.evaluate_RMSE(self.Y)
                print(f"iter = {it + 1}, loss = {self.loss()}, RMSE_train = {rmse_train}")

    def evaluate_RMSE(self, rate_test):
        """Return the RMSE of clamped predictions on ``rate_test``.

        ``rate_test`` has the same column layout as ``Y`` (user, item,
        rating, ...).  Every user/item id must be a valid index into the
        trained model.  Returns NaN when ``rate_test`` has no rows.
        """
        rate_test = np.asarray(rate_test)
        n_tests = rate_test.shape[0]
        if n_tests == 0:
            # RMSE over zero samples is undefined; avoid dividing by zero.
            return float("nan")
        users = rate_test[:,0].astype(int)
        items = rate_test[:,1].astype(int)
        preds = np.clip(self._raw_scores(users, items), self._RATING_MIN, self._RATING_MAX)
        SE = np.sum((preds - rate_test[:,2]) ** 2)
        RMSE = np.sqrt(SE / n_tests)
        return RMSE

    def pred(self, u, i):
        """Predict the rating of user ``u`` for item ``i``, clamped to [0, 5]."""
        u = int(u)
        i = int(i)
        pred = self.X[i,:].dot(self.W[:,u]) + self.b[i] + self.d[u]
        return max(self._RATING_MIN, min(self._RATING_MAX, pred))

In [32]:
# Column names for the tab-separated rating files (they have no header row).
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_base = pd.read_csv("ml-100k/ua.base", sep= '\t', names= r_cols)
ratings_test = pd.read_csv("ml-100k/ua.test", sep= '\t', names= r_cols)

# copy=True guarantees writable arrays: with pandas Copy-on-Write enabled,
# to_numpy() may hand back a read-only view and the in-place shift below
# would raise.
rate_train = ratings_base.to_numpy(copy=True)
rate_test = ratings_test.to_numpy(copy=True)

# Shift user and item ids down by one so they can be used directly as
# array indices (presumably the files use 1-based ids -- confirm).
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

# MFCF draws its initial parameters from the global NumPy RNG; seed it so a
# Restart & Run All reproduces the same numbers.
np.random.seed(0)
rs = MFCF(rate_train, K = 50, lam= .01, print_every= 5, learning_rate= 50, max_iter= 30)
rs.fit()
RMSE = rs.evaluate_RMSE(rate_test)
print(f"Matrix Factorization RMSE = {RMSE}")

iter = 5, loss = 0.4455446571517933, RMSE_train = 0.9437539822928669
iter = 10, loss = 0.4218106071208715, RMSE_train = 0.9182972214702385
iter = 15, loss = 0.41752582521894366, RMSE_train = 0.9136161421828999
iter = 20, loss = 0.41619238991823415, RMSE_train = 0.9121473590945683
iter = 25, loss = 0.41558755852361134, RMSE_train = 0.9114776143270195
iter = 30, loss = 0.4152398220063709, RMSE_train = 0.9110924673617044
Matrix Factorization RMSE = 0.9625437813430984
