In [None]:
# Loc cong tac san pham va loc cong tac nguoi dung

In [10]:
# Loc cong tac nguoi dung, su dung class uuCF
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

class uuCF(object):
    def __init__(self, Y_data, k, sim_function = cosine_similarity):
        self.Y_data = Y_data # a 2d array of shape (n_users, 3), each row of Y_data has form [user_id, item_id, rating]
        self.k = k # number of neighborhood
        # similarity function default: cosine_similarity
        self.sim_func = sim_function
        self.Ybar = None # normalize data
        # number of users
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1
        # number of items
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1
    
    def fit(self):
        # normalized Y_data -> Ybar
        users = self.Y_data[:, 0] # all users, first column of Y_data
        self.Ybar = self.Y_data.copy()
        self.mu = np.zeros((self.n_users,))

        for n in range(self.n_users):
            # row indices of ratings made by user n
            ids = np.where(users == n)[0].astype(np.int32)
            # indices of all items rated by user n
            item_ids = self.Y_data[ids, 1]
            # ratings made by user n
            ratings = self.Y_data[ids, 2]
            # avoid zero division
            self.mu[n] = np.mean(ratings) if ids.size > 0 else 0
            self.Ybar[ids, 2] = ratings - self.mu[n]
        
        # form the rating matrix as a sparse matrix.
        # see more: https://goo.gl/i2mmT2
        self.Ybar = sparse.coo_matrix((self.Ybar[:, 2], (self.Ybar[:, 1], self.Ybar[:, 0])), (self.n_items, self.n_users)).tocsr()
        self.S = self.sim_func(self.Ybar.T, self.Ybar.T) # similarity matrix between users

    def pred(self, u, i):
        # predict the rating of user u for item i
        # find item i
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32)
        # all user who rated i
        users_rated_i = self.Y_data[ids, 0].astype(np.int32)
        sim = self.S[u, users_rated_i] # sim. of u and those users

        nns = np.argsort(sim)[-self.k:] # most k similar users
        nearest_s = sim[nns] # and the corresponding similarities
        r = self.Ybar[i, users_rated_i[nns]] # the corresponding ratings
        eps = 1e-8 # a small number to avoid zero division
        return (r * nearest_s).sum() / (np.abs(nearest_s).sum() + eps) + self.mu[u]


In [11]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_base = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols)
ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols)

rate_train = np.array(ratings_base)
rate_test = np.array(ratings_test)

rate_train[:, :2] -= 1 # since index start from 0
rate_test[:, :2] -= 1 # since index start from 0

rs = uuCF(rate_train, k = 40)
rs.fit()

n_test = rate_test.shape[0]
SE = 0
for n in range(n_test):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1])
    SE += (pred - rate_test[n, 2]) ** 2

RMSE = np.sqrt(SE / n_test)
print("User - User CF, RMSE = ", RMSE)

User - User CF, RMSE =  0.9766140289287265


In [12]:
# Loc cong tac san pham, transpose ma tran thua la duoc
rate_train = rate_train[:, [1, 0, 2]]
rate_test = rate_test[:, [1, 0, 2]]

rs = uuCF(rate_train, k = 40)
rs.fit()

n_test = rate_test.shape[0]
SE = 0
for n in range(n_test):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1])
    SE += (pred - rate_test[n, 2]) ** 2

RMSE = np.sqrt(SE / n_test)
print("Item - Item CF, RMSE = ", RMSE)

Item - Item CF, RMSE =  0.9688460838682366


In [7]:
a = np.array([1, '222', 3, '44'])
a = np.array([3, 2, 1, 4])
# print(a.astype(np.int32))
print(np.argsort(a))

[2 1 0 3]
