In [1]:
import pandas as pd
import numpy as np
import random
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
import scipy.optimize
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols)
n_users = users.shape[0]
print('Number of users:', n_users)
#Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_base = pd.read_csv('ua.base', sep='\t', names=r_cols)
ratings_test = pd.read_csv('ua.test', sep='\t', names=r_cols)
rate_train = ratings_base.values

Number of users: 943


In [3]:
def Myrecommend():
    def normalizeRatings(myY, myR):
        # Tính trung bình các đánh giá
        Ymean = np.sum(myY, axis=1) / np.sum(myR, axis=1)
        Ymean = Ymean.reshape((Ymean.shape[0], -1))
        return myY - Ymean, Ymean

    def flattenParams(myX, myTheta):
        return np.concatenate((myX.flatten(), myTheta.flatten()))

    def reshapeParams(flattened_XandTheta, mynm, mynu, mynf):
        assert flattened_XandTheta.shape[0] == int(mynm * mynf + mynu * mynf)
        reX = flattened_XandTheta[: int(mynm * mynf)].reshape((mynm, mynf))
        reTheta = flattened_XandTheta[int(mynm * mynf) :].reshape((mynu, mynf))
        return reX, reTheta

    def cofiCostFunc(myparams, myY, myR, mynu, mynm, mynf, mylambda=0.0):
        myX, myTheta = reshapeParams(myparams, mynm, mynu, mynf)
        term1 = myX.dot(myTheta.T)
        term1 = np.multiply(term1, myR)
        cost = 0.5 * np.sum(np.square(term1 - myY))
        # Thêm phần regularization
        cost += (mylambda / 2.0) * np.sum(np.square(myTheta))
        cost += (mylambda / 2.0) * np.sum(np.square(myX))
        return cost

    def cofiGrad(myparams, myY, myR, mynu, mynm, mynf, mylambda=0.0):
        myX, myTheta = reshapeParams(myparams, mynm, mynu, mynf)
        term1 = myX.dot(myTheta.T)
        term1 = np.multiply(term1, myR)
        term1 -= myY
        Xgrad = term1.dot(myTheta)
        Thetagrad = term1.T.dot(myX)
        # Thêm phần regularization
        Xgrad += mylambda * myX
        Thetagrad += mylambda * myTheta
        return flattenParams(Xgrad, Thetagrad)

    
    mynu = df_train.user_name.nunique()
    mynm = df_train.product_id.nunique()
    mynf = 10
    Y = np.zeros((mynm, mynu))

    user_to_column = {user_name: idx for idx, user_name in enumerate(df['user_name'].unique())}
    product_to_row = {product: idx for idx, product in enumerate(df['product_id'].unique())}

    for row in df.itertuples():
        Y[product_to_row[row.product_id], user_to_column[row.user_name]] = row.rating

    R = (Y != 0).astype(int)

    Ynorm, Ymean = normalizeRatings(Y, R)
    X = np.random.rand(mynm, mynf)
    Theta = np.random.rand(mynu, mynf)
    myflat = flattenParams(X, Theta)

    result = scipy.optimize.minimize(
        fun=cofiCostFunc,
        x0=myflat,
        args=(Ynorm, R, mynu, mynm, mynf, 3),
        method="TNC",
        jac=cofiGrad,
        options={"maxiter": 300},
    )

    resX, resTheta = reshapeParams(result.x, mynm, mynu, mynf)
    prediction_matrix = resX.dot(resTheta.T)
    return prediction_matrix, Ymean, product_to_row, user_to_column

In [4]:
class uuCF(object):
    def __init__(self, Y_data, k, sim_func=cosine_similarity):
        self.Y_data = np.array(Y_data)
        self.k = k
        self.sim_func = sim_func
        self.Ybar = None
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1

    def fit(self):
        users = self.Y_data[:, 0]
        self.Ybar = self.Y_data.copy()
        self.mu = np.zeros((self.n_users,))
        for n in range(self.n_users):
            ids = np.where(users == n)[0].astype(np.int32)
            ratings = self.Y_data[ids, 2]
            self.mu[n] = np.mean(ratings) if ids.size > 0 else 0
            self.Ybar[ids, 2] = ratings - self.mu[n]
        self.Ybar = sparse.coo_matrix(
            (self.Ybar[:, 2], (self.Ybar[:, 1], self.Ybar[:, 0])), (self.n_items, self.n_users)
        ).tocsr()
        self.S = self.sim_func(self.Ybar.T, self.Ybar.T)

    def pred(self, u, i):
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32)
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        sim = self.S[u, users_rated_i]
        nns = np.argsort(sim)[-self.k :]
        nearest_s = sim[nns]
        r = self.Ybar[i, users_rated_i[nns]]
        eps = 1e-8
        return (r * nearest_s).sum() / (np.abs(nearest_s).sum() + eps) + self.mu[u]

In [5]:
# # Tạo data frame với thông tin người dùng, sản phẩm và đánh giá
# Y_data = df[['user_name', 'product_id', 'rating']].values

# # Chia dữ liệu thành tập huấn luyện và tập kiểm tra
# train_data, test_data = train_test_split(Y_data, test_size=0.2, random_state=42)

# # Tạo DataFrame cho tập huấn luyện
# df_train = pd.DataFrame(train_data, columns=['user_name', 'product_id', 'rating'])

# # Khởi tạo và chạy thử nghiệm cho Myrecommend với tham số tối ưu
# prediction_matrix, Ymean, product_to_row, user_to_column = Myrecommend()

# Tạo Y_pred cho Myrecommend trên tập kiểm tra
# Y_pred_test = np.zeros_like(test_data[:, 2])
# for idx, (user, product_id, rating) in enumerate(test_data):
#     if product_id in product_to_row and user in user_to_column:
#         Y_pred_test[idx] = prediction_matrix[product_to_row[product_id], user_to_column[user]] + Ymean[product_to_row[product_id]]
#     else:
#         Y_pred_test[idx] = np.mean(df_train['rating'])  # Giá trị trung bình nếu sản phẩm hoặc người dùng mới

# # Khởi tạo uuCF và chạy thử nghiệm với k tối ưu
# model = uuCF(train_data, k=10)
# model.fit()

# # Tạo Y_pred cho uuCF trên tập kiểm tra
# Y_pred_uuCF_test = np.zeros_like(test_data[:, 2])
# for idx, (user, product_id, rating) in enumerate(test_data):
#     Y_pred_uuCF_test[idx] = model.pred(user, product_id)

# # Tính sai số cho cả hai phương pháp trên tập kiểm tra
# mae_myrecommend_test = mean_absolute_error(test_data[:, 2], Y_pred_test)
# rmse_myrecommend_test = mean_squared_error(test_data[:, 2], Y_pred_test, squared=False)

# mae_uucf_test = mean_absolute_error(test_data[:, 2], Y_pred_uuCF_test)
# rmse_uucf_test = mean_squared_error(test_data[:, 2], Y_pred_uuCF_test, squared=False)

# print(f"MAE của Myrecommend trên tập kiểm tra: {mae_myrecommend_test}")
# print(f"MSE của Myrecommend trên tập kiểm tra: {rmse_myrecommend_test}")

# print(f"MAE của uuCF trên tập kiểm tra: {mae_uucf_test}")
# print(f"MSE của uuCF trên tập kiểm tra: {rmse_uucf_test}")

In [6]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols)
n_users = users.shape[0]
print('Number of users:', n_users)
#Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_base = pd.read_csv('ua.base', sep='\t', names=r_cols)
ratings_test = pd.read_csv('ua.test', sep='\t', names=r_cols)
rate_train = ratings_base.values
rate_test = ratings_test.values

Number of users: 943


In [7]:
#rate_train[:, :2] -= 1 # since indices start from 0
#rate_test[:, :2] -= 1
rs = uuCF(rate_train, k = 40)
rs.fit()
SE = 0 # squared error
rate_train = rate_train[:, [1, 0, 2]]
rate_test = rate_test[:, [1, 0, 2]]
rs = uuCF(rate_train, k = 40)
rs.fit()
n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in range(n_tests):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1])
    SE += (pred - rate_test[n, 2])**2
testRMSECF = SE/n_tests
SE=0
for n in range(n_tests):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1])
    SE += abs(pred - rate_test[n, 2])
testRMAECF = SE/n_tests
SE=0
n_trains=rate_train.shape[0]
for n in range(n_trains):
    pred = rs.pred(rate_train[n, 0], rate_train[n, 1])
    SE += abs(pred - rate_train[n, 2])
trainRMAECF = SE/n_trains
SE=0
for n in range(n_trains):
    pred = rs.pred(rate_train[n, 0], rate_train[n, 1])
    SE += (pred - rate_train[n, 2])**2
trainRMSECF = SE/n_trains


In [8]:
class MF(object):
    def __init__(self, Y, K, lam = 0.1, Xinit = None, Winit = None,learning_rate = 0.5, max_iter = 1000, print_every = 100):
        self.Y = Y # represents the utility matrix
        self.K = K
        self.lam = lam # regularization parameter
        self.learning_rate = learning_rate # for gradient descent
        self.max_iter = max_iter # maximum number of iterations
        self.print_every = print_every # print loss after each a few iters
        self.n_users = int(np.max(Y[:, 0])) + 1
        self.n_items = int(np.max(Y[:, 1])) + 1
        self.n_ratings = Y.shape[0] # number of known ratings
        self.X = np.random.randn(self.n_items, K) if Xinit is None else Xinit
        self.W = np.random.randn(K, self.n_users) if Winit is None else Winit
        self.b = np.random.randn(self.n_items) # item biases
        self.d = np.random.randn(self.n_users) # user biases]

    def loss(self):
        L = 0
        for i in range(self.n_ratings):
        # user_id, item_id, rating
            n, m, rating = int(self.Y[i,0]), int(self.Y[i,1]), self.Y[i,2]
            L += 0.5*(self.X[m].dot(self.W[:, n])\
            + self.b[m] + self.d[n] - rating)**2
        L /= self.n_ratings
        # regularization, don’t ever forget this
        return L + 0.5*self.lam*(np.sum(self.X**2) + np.sum(self.W**2))
    def updateXb(self):
        for m in range(self.n_items):
            # get all users who rated item m and corresponding ratings
            ids = np.where(self.Y[:, 1] == m)[0] # row indices of items m
            user_ids, ratings=self.Y[ids, 0].astype(np.int32),self.Y[ids, 2]
            Wm, dm = self.W[:, user_ids], self.d[user_ids]
            for i in range(30): # 30 iteration for each sub problem
                xm = self.X[m]
                error = xm.dot(Wm) + self.b[m] + dm - ratings
                grad_xm = error.dot(Wm.T)/self.n_ratings + self.lam*xm
                grad_bm = np.sum(error)/self.n_ratings
                # gradient descent
                self.X[m] -= self.learning_rate*grad_xm.reshape(-1)
                self.b[m] -= self.learning_rate*grad_bm
    def updateWd(self): # and d
        for n in range(self.n_users):
            # get all items rated by user n, and the corresponding ratings
            ids = np.where(self.Y[:,0] == n)[0] #indexes of items rated by n
            item_ids,ratings=self.Y[ids, 1].astype(np.int32), self.Y[ids, 2]
            Xn, bn = self.X[item_ids], self.b[item_ids]
            for i in range(30): # 30 iteration for each sub problem
                wn = self.W[:, n]
                error = Xn.dot(wn) + bn + self.d[n] - ratings
                grad_wn = Xn.T.dot(error)/self.n_ratings + self.lam*wn
                grad_dn = np.sum(error)/self.n_ratings
                # gradient descent
                self.W[:, n] -= self.learning_rate*grad_wn.reshape(-1)
                self.d[n] -= self.learning_rate*grad_dn    

    def fit(self):
        for it in range(self.max_iter):
            self.updateWd()
            self.updateXb()
            if (it + 1) % self.print_every == 0:
                rmse_train = self.evaluate_RMSE(self.Y)
                #print('iter = %d, loss = %.4f, RMSE train = %.4f'%(it + 1,self.loss(), rmse_train))

    def pred(self, u, i):
        """
        predict the rating of user u for item i
        """
        u, i = int(u), int(i)
        pred = self.X[i, :].dot(self.W[:, u]) + self.b[i] + self.d[u]
        return max(0, min(5, pred)) # 5-scale in MoviesLen
    def evaluate_RMSE(self, rate_test):
        n_tests = rate_test.shape[0] # number of test
        SE = 0 # squared error
        for n in range(n_tests):
            pred = self.pred(rate_test[n, 0], rate_test[n, 1])
            SE += (pred - rate_test[n, 2])**2
        RMSE = np.sqrt(SE/n_tests)
        return RMSE

    def evaluate_RMAE(self, rate_test):
        n_tests = rate_test.shape[0] # number of test
        AE = 0 # absolute error
        for n in range(n_tests):
            pred = self.pred(rate_test[n, 0], rate_test[n, 1])
            AE += abs(pred - rate_test[n, 2])
        MAE = AE / n_tests
        return MAE
    
            
            

In [9]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_base = pd.read_csv('ua.base', sep='\t', names=r_cols)
ratings_test = pd.read_csv('ua.test', sep='\t', names=r_cols)
rate_train = ratings_base.values
rate_test = ratings_test.values
# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1
rs = MF(rate_train, K = 50, lam = .01, print_every = 5, learning_rate = 50,
max_iter = 30)
rs.fit()
# evaluate on test data
testRMSEMF = rs.evaluate_RMSE(rate_test)
testRMAEMF=rs.evaluate_RMAE(rate_test)
trainRMSEMF = rs.evaluate_RMSE(rate_train)
trainRMAEMF=rs.evaluate_RMAE(rate_train)



In [10]:
comparison_table = pd.DataFrame({
    'Model': ['MF', 'CF'],
    'Train RMSE': [trainRMSEMF, trainRMSECF],
    'Test RMSE': [testRMSEMF, testRMSECF],
    'Train MAE': [trainRMAEMF, trainRMAECF],
    'Test MAE': [testRMAEMF, testRMAECF]
})

print(comparison_table)

  Model  Train RMSE  Test RMSE  Train MAE  Test MAE
0    MF    0.911096   0.962395   0.718637  0.756910
1    CF    0.530874   0.938663   0.600066  0.771578
