In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [3]:
class CF(object):
    def __init__(self, Y_data, k, sim_func = cosine_similarity, uuCF = 1):
        self.uuCF       = uuCF # user-user (1) or item-item (0) CF
        # Y_data is a 2d array of shape (n_users, 3), each row of Y_data has form [user_id, item_id, rating]
        self.Y_data     = Y_data if uuCF else Y_data[:, [1, 0, 2]]
        self.k          = k # number of neighborhoods
        self.sim_func   = sim_func # sim function, default cosine similarity
        self.Ybar_data  = None # normalized data
        self.n_users    = int(np.max(self.Y_data[:, 0])) + 1 # number of users
        self.n_items    = int(np.max(self.Y_data[:, 1])) + 1 # number of items
    
    def normalize_Y(self):
        users = self.Y_data[:, 0] # all users - first col of the Y_data
        self.Ybar_data = self.Y_data.copy()
        self.mu = np.zeros((self.n_users,))
        for n in range(self.n_users):
            # row indices of ratings made by user n
            ids = np.where(users == n)[0].astype(np.int32)
            ratings = self.Y_data[ids, 2]  # ratings made by user n 
            self.mu[n] = np.mean(ratings) if ids.size > 0 else 0 # avoid zero division 
            self.Ybar_data[ids, 2] = (ratings - self.mu[n]).flatten()
        
        ## form the rating matrix as a sparse matrix
        data = np.asarray(self.Ybar_data[:, 2]).flatten()
        row = np.asarray(self.Ybar_data[:, 1]).flatten()
        col = np.asarray(self.Ybar_data[:, 0]).flatten()
        shape = np.asarray((int(self.n_items),int(self.n_users))).flatten()
        self.Ybar = sparse.coo_matrix((data,(row, col)), shape=shape).tocsr()

    def similarity(self):
        self.S = self.sim_func(self.Ybar.T, self.Ybar.T)

    def fit(self):
        self.normalize_Y()
        self.similarity() 
        
    # predict the rating of user u for item i
    def __pred(self, u, i):
        # find item
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32)
        # all users who rated i
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        # similarity between u and users_rated_i
        sim = self.S[u, users_rated_i].flatten()
        # find k most similarity users
        a = np.argsort(sim)[-self.k:] 
        nearest_s = sim[a] # and the corresponding similarity values
        # the corresponding ratings
        r = self.Ybar[i, users_rated_i[a]]
        return (r*nearest_s).sum()/(np.abs(nearest_s).sum() + 1e-8) + self.mu[u]
    
    def pred(self, u, i):
        if self.uuCF: return self.__pred(u, i)
        return self.__pred(i, u)

    def evaluate_RMSE(self, rate_test):
        n_tests = rate_test.shape[0]
        SE = 0
        for n in range(n_tests):
            pred = self.pred(rate_test[n, 0], rate_test[n, 1])
            SE += (pred - rate_test[n, 2])**2 
        
        RMSE = np.sqrt(SE/n_tests)
        return RMSE

In [4]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = np.asmatrix(ratings_base)
rate_test = np.asmatrix(ratings_test)

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

In [5]:
rs = CF(rate_train, k = 40, uuCF = 1)
rs.fit()
rs_i = CF(rate_train, k = 40, uuCF = 0)
rs_i.fit()

In [6]:
print('User-user CF, RMSE =', rs.evaluate_RMSE(rate_test))
print('Item-item CF, RMSE =', rs_i.evaluate_RMSE(rate_test))

User-user CF, RMSE = 0.9766140289287265
Item-item CF, RMSE = 0.9688460838682366
