In [39]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 

class MF(object):
    """docstring for CF"""
    def __init__(self, Y_data, K, lam = 0.1, Xinit = None, Winit = None, 
                 learning_rate = 0.5, max_iter = 1000, print_every = 100, user_based = 1):
        self.Y_raw_data = Y_data
        self.K = K
        self.lam = lam
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.print_every = print_every
        self.user_based = user_based
        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = int(np.max(Y_data[:, 0])) + 1 
        self.n_items = int(np.max(Y_data[:, 1])) + 1
        
        if Xinit is None: 
            self.X = np.random.randn(self.n_items, K)
        else:
            self.X = Xinit 
        
        if Winit is None: 
            self.W = np.random.randn(K, self.n_users)
        else: 
            self.W = Winit
            
        #self.all_users = self.Y_data[:,0] # all users (may be duplicated)
        self.n_ratings = Y_data.shape[0]
        # normalized data
        self.Y_data_n = self.Y_raw_data.copy()

    def normalize_Y(self):
        if self.user_based:
            user_col = 0
            item_col = 1
            n_objects = self.n_users
        else:
            user_col = 1
            item_col = 0 
            n_objects = self.n_items

        users = self.Y_raw_data[:, user_col] 
        self.mu = np.zeros((n_objects,))
        for n in range(n_objects):
            # row indices of rating done by user n
            # since indices need to be integers, we need to convert
            ids = np.where(users == n)[0].astype(np.int32)
            # indices of all ratings associated with user n
            item_ids = self.Y_data_n[ids, item_col] 
            # and the corresponding ratings 
            ratings = self.Y_data_n[ids, 2]
            # take mean
            m = np.mean(ratings) 
            if np.isnan(m):
                m = 0 # to avoid empty array and nan value
            self.mu[n] = m
            # normalize
            self.Y_data_n[ids, 2] = ratings - self.mu[n]
            
    
    def loss(self):
        L = 0 
        for i in range(self.Y_data_n.shape[0]):
            # user, item, rating
            n, m, rate = int(self.Y_data_n[i, 0]), int(self.Y_data_n[i, 1]), self.Y_data_n[i, 2]
            L += 0.5*(rate - self.X[m, :].dot(self.W[:, n]))**2
            
        # regularization, don't ever forget this 
        L /= self.n_ratings
        L += 0.5*self.lam*(np.linalg.norm(self.X, 'fro') + np.linalg.norm(self.W, 'fro'))
        return L 

    
    def get_items_rated_by_user(self, user_id):
        """
        get all items which are rated by user n, and the corresponding ratings
        """
        # y = self.Y_data_n[:,0] # all users (may be duplicated)
        # item indices rated by user_id
        # we need to +1 to user_id since in the rate_matrix, id starts from 1 
        # while index in python starts from 0
        ids = np.where(self.Y_data_n[:,0] == user_id)[0] 
        item_ids = self.Y_data_n[ids, 1].astype(np.int32) # index starts from 0 
        ratings = self.Y_data_n[ids, 2]
        return (item_ids, ratings)
        
        
    def get_users_who_rate_item(self, item_id):
        """
        get all users who rated item m and get the corresponding ratings
        """
        ids = np.where(self.Y_data_n[:,1] == item_id)[0] 
        user_ids = self.Y_data_n[ids, 0].astype(np.int32)
        ratings = self.Y_data_n[ids, 2]
        return (user_ids, ratings)
        
    def updateX(self):
        for m in range(self.n_items):
            user_ids, ratings = self.get_users_who_rate_item(m)
            Wm = self.W[:, user_ids]
            grad_xm = -(ratings - self.X[m, :].dot(Wm)).dot(Wm.T)/self.n_ratings + \
                                               self.lam*self.X[m, :]
            self.X[m, :] -= self.learning_rate*grad_xm.reshape((self.K,))
    
    def updateW(self):
        for n in range(self.n_users):
            item_ids, ratings = self.get_items_rated_by_user(n)
            Xn = self.X[item_ids, :]
            grad_wn = -Xn.T.dot(ratings - Xn.dot(self.W[:, n]))/self.n_ratings + \
                        self.lam*self.W[:, n]
            self.W[:, n] -= self.learning_rate*grad_wn.reshape((self.K,))
    
    def fit(self):
        self.normalize_Y()
        for it in range(self.max_iter):
            self.updateX()
            self.updateW()
            if (it + 1) % self.print_every == 0:
                rmse_train = self.evaluate_RMSE(self.Y_raw_data)
                print ('iter =', it + 1, ', loss =', self.loss(), ', RMSE train =', rmse_train)
    
    
    def pred(self, u, i):
        """ 
        predict the rating of user u for item i 
        if you need the un
        """
        u = int(u)
        i = int(i)
        
        if self.user_based:
            bias = self.mu[u]
        else: 
            bias = self.mu[i]
        pred = self.X[i, :].dot(self.W[:, u]) + bias 
        if pred < 1:
            return 1 
        if pred > 5: 
            return 5 
        return pred 
        
    
    def pred_for_user(self, user_id):
        ids = np.where(self.Y_data_n[:, 0] == user_id)[0]
        items_rated_by_u = self.Y_data_n[ids, 1].tolist()              
        
        y_pred = self.X.dot(self.W[:, user_id]) + self.mu[user_id]
        predicted_ratings= []
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                predicted_ratings.append((i, y_pred[i]))
        
        return predicted_ratings
    
    def evaluate_RMSE(self, rate_test):
        n_tests = rate_test.shape[0]
        SE = 0 # squared error
        for n in range(n_tests):
            pred = self.pred(rate_test[n, 0], rate_test[n, 1])
            SE += (pred - rate_test[n, 2])**2 

        RMSE = np.sqrt(SE/n_tests)
        return RMSE


In [40]:
r_cols = ['user_id', 'item_id', 'rating']
ratings = pd.read_csv('ex.dat', sep = ' ', names = r_cols, encoding='latin-1')
Y_data = ratings.values


rs = MF(Y_data, K = 2, max_iter = 1000, print_every = 1000)

rs.fit()
rs.pred(6, 1)

iter = 1000 , loss = 0.5084833751056823 , RMSE train = 0.7957886494987098


2.9380199581998134

In [41]:
print (rs.X.dot(rs.W) + rs.mu)

[[4.16625818 4.01680153 1.80733739 0.22160207 1.71830608 1.43294427
  2.37163298]
 [3.65912294 3.28653499 2.21606685 0.81564465 2.14423641 1.47946916
  2.93801996]
 [3.99851773 3.85485551 1.87321182 0.47628506 1.87757691 1.42260168
  2.46573162]
 [2.44219242 1.70055914 3.05196581 2.36277004 3.20474678 1.53732458
  4.10223043]
 [2.25996826 1.32034087 3.30143256 2.49007993 3.3305722  1.59212867
  4.44375663]]


In [42]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('D:/ml-100k/ub.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('D:/ml-100k/ub.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.values
rate_test = ratings_test.values

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1
ratings_base.values

array([[        0,         0,         5, 874965758],
       [        0,         1,         3, 876893171],
       [        0,         2,         4, 878542960],
       ...,
       [      942,      1187,         3, 888640250],
       [      942,      1227,         3, 888640275],
       [      942,      1329,         3, 888692465]], dtype=int64)

In [43]:
rs = MF(rate_train, K = 10, lam = .1, print_every = 10, learning_rate = 0.75, max_iter = 100, user_based = 1)
rs.fit()
# evaluate on test data
RMSE = rs.evaluate_RMSE(rate_test)
print ('\nUser-based MF, RMSE =', RMSE)

iter = 10 , loss = 5.678338783654116 , RMSE train = 1.212792770453722
iter = 20 , loss = 2.646854861555364 , RMSE train = 1.0384051255739977
iter = 30 , loss = 1.3465890835212526 , RMSE train = 1.0295650061152724
iter = 40 , loss = 0.7543184048132169 , RMSE train = 1.0292161825763546
iter = 50 , loss = 0.4828935044551411 , RMSE train = 1.0292111959055392
iter = 60 , loss = 0.35843014621819436 , RMSE train = 1.0292134431493178
iter = 70 , loss = 0.30135340167862323 , RMSE train = 1.0292141188849417
iter = 80 , loss = 0.2751788640016341 , RMSE train = 1.0292142820481764
iter = 90 , loss = 0.2631756154616862 , RMSE train = 1.0292143198322112
iter = 100 , loss = 0.2576711075362805 , RMSE train = 1.0292143284659525

User-based MF, RMSE = 1.0603798992982894


In [11]:
rs = MF(rate_train, K = 10, lam = .1, print_every = 10, learning_rate = 0.75, max_iter = 100, user_based = 0)
rs.fit()
# evaluate on test data
RMSE = rs.evaluate_RMSE(rate_test)
print ('\nItem-based MF, RMSE =', RMSE)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


iter = 10 , loss = 5.613402106219798 , RMSE train = 1.174252401976531
iter = 20 , loss = 2.6143182335726287 , RMSE train = 1.0051282632946323
iter = 30 , loss = 1.322786985815 , RMSE train = 0.9965371086746744
iter = 40 , loss = 0.7342360324082022 , RMSE train = 0.9961865150366449
iter = 50 , loss = 0.46450083159955025 , RMSE train = 0.9961786848963439
iter = 60 , loss = 0.34081089175399826 , RMSE train = 0.9961802165307989
iter = 70 , loss = 0.28408858253732777 , RMSE train = 0.9961807347533593
iter = 80 , loss = 0.25807652258224617 , RMSE train = 0.9961808640940073
iter = 90 , loss = 0.24614776461616436 , RMSE train = 0.9961808946521361
iter = 100 , loss = 0.24067740989160252 , RMSE train = 0.9961809017438302

Item-based MF, RMSE = 1.0486424485398262


In [12]:
rs = MF(rate_train, K = 2, lam = 0, print_every = 10, learning_rate = 1, max_iter = 100, user_based = 0)
rs.fit()
# evaluate on test data
RMSE = rs.evaluate_RMSE(rate_test)
print ('\nItem-based MF, RMSE =', RMSE)

iter = 10 , loss = 1.1937583545413826 , RMSE train = 1.4438476704505294
iter = 20 , loss = 1.1243413104876814 , RMSE train = 1.42685339567949
iter = 30 , loss = 1.0628436068276423 , RMSE train = 1.410700677241748
iter = 40 , loss = 1.008063866699486 , RMSE train = 1.3953791994986933
iter = 50 , loss = 0.9590244283909591 , RMSE train = 1.3808603149680798
iter = 60 , loss = 0.9149225478141115 , RMSE train = 1.3670836748371777
iter = 70 , loss = 0.8750936902904786 , RMSE train = 1.3540111793342933
iter = 80 , loss = 0.8389835832709639 , RMSE train = 1.3415951598800193
iter = 90 , loss = 0.8061267035330398 , RMSE train = 1.3297920830075303
iter = 100 , loss = 0.7761295486681602 , RMSE train = 1.3185599523057439

Item-based MF, RMSE = 1.3690367205520346


In [13]:
RMSE = rs.evaluate_RMSE(rate_train)

In [15]:
print (RMSE)

1.3185599523057439


In [16]:
rs.pred_for_user(1)

[(1, 2.4201974036019442),
 (2, 3.5213768181939935),
 (3, 4.594226476961963),
 (4, 2.7104819257589643),
 (5, 1.8568428734857807),
 (6, 4.4177526208839115),
 (7, 2.3096846147563967),
 (8, 3.5997416353632685),
 (10, 2.7849288014119837),
 (11, 3.9434692557546107),
 (14, 2.9573497190969724),
 (15, 1.3079264272308921),
 (16, 3.4019991903144735),
 (17, 5.461201707669454),
 (19, 2.7410923062172894),
 (20, 1.7656827668222934),
 (21, 2.56663285381765),
 (22, 2.5103773819863613),
 (23, 4.238143223700167),
 (25, 2.8267373645088036),
 (26, 4.610413913398826),
 (27, 2.7382674208587026),
 (28, 3.155798943080894),
 (29, 3.1409855178115724),
 (30, 3.114613897558675),
 (31, 2.330126391950703),
 (32, 2.7727694253137525),
 (33, 4.783431761143771),
 (34, 2.1806504162729095),
 (35, 2.226382140732512),
 (36, 3.4235620791867056),
 (37, 2.7571195411675617),
 (38, 5.596913056286063),
 (39, 2.8820605599247267),
 (40, 4.647013629016586),
 (41, 3.7415706225673455),
 (42, -0.0844844048172737),
 (43, 3.2948096960663

In [36]:
data = rs.pred_for_user(1)
sorted_data = sorted(data, key=lambda x: x[1], reverse=True)
total_movies = len(sorted_data)
percentile = int(0.02 * total_movies)
top_movies = sorted_data[:percentile]
top_movie_ids = [movie[0] for movie in sorted_data[:percentile]]
print(top_movie_ids)

[1526, 1110, 893, 1496, 1624, 536, 1529, 788, 1303, 1542, 1306, 89, 1298, 1412, 1531, 437, 1335, 1629, 1509, 1530, 753, 354, 534, 1072, 1395, 343, 1581, 1611, 588, 1156, 73, 970]


In [37]:
def read_movie_names(file_path):
    movie_names = {}
    with open(file_path, 'r', encoding='iso-8859-1') as file:
        for line in file:
            parts = line.strip().split('|')
            movie_id = int(parts[0])
            movie_name = parts[1]
            movie_names[movie_id] = movie_name
    return movie_names
movie_names_dict = read_movie_names("D:/ml-100k/u.item")
top_movie_names = [movie_names_dict[movie_id] for movie_id in top_movie_ids]

df = pd.DataFrame({'ID': top_movie_ids, 'Tên phim': top_movie_names})

print(df)

      ID                                           Tên phim
0   1526                                     Witness (1985)
1   1110                                   Tank Girl (1995)
2    893                        For Richer or Poorer (1997)
3   1496                                     Carpool (1996)
4   1624                                        Hush (1998)
5    536                                     Ponette (1996)
6   1529                                 Underground (1995)
7    788                               Relative Fear (1994)
8   1303                                Getaway, The (1994)
9   1542                         Scarlet Letter, The (1926)
10  1306                              Delta of Venus (1994)
11    89                                Blade Runner (1982)
12  1298                             Band Wagon, The (1953)
13  1412  Land Before Time III: The Time of the Great Gi...
14  1531  Far From Home: The Adventures of Yellow Dog (1...
15   437            Amityville 1992: It'