In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [11]:
class CF(object):
    def __init__(self, Ydata, k, dist_func=cosine_similarity, uuCF=1):
        self.uuCF = uuCF # user-user (1) or item-item (0) CF
        self.Ydata = Ydata if uuCF else Ydata[:, [1, 0, 2]]
        self.k = k # number of neighbor points
        self.dist_func = dist_func
        self.Ybar_data = None
        # number of users and items. Add 1 cause id starts from 0
        self.n_users = int(np.max(self.Ydata[:, 0])) + 1
        self.n_items = int(np.max(self.Ydata[:, 1])) + 1
        
    def add(self, new_data):
        """
        Update Ydata matrix when new ratings come
        """
        self.Ydata = np.concatenate((self.Ydata, new_data), axis=0)

    def normalize_Y(self):
        users = self.Ydata[:, 0]
        self.Ybar_data = self.Ydata.copy()
        self.mu = np.zeros((self.n_users,))
        for n in range(self.n_users):
            # row indices of rating done by user n
            # since indices need to be integers, we nee to convert
            ids = np.where(users == n)[0].astype(np.int32)
            # indices of all ratings associated with user n
            item_ids = self.Ydata[ids, 1]
            # and the corresponding ratings
            ratings = self.Ydata[ids, 2]
            # take mean
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0 # to avoid empty array and nan value
            # normalize
            self.Ybar_data[ids, 2] = ratings - self.mu[n]

        self.Ybar = sparse.coo_matrix((self.Ybar_data[:, 2], (self.Ybar_data[:, 1], self.Ybar_data[:, 0])), (self.n_items, self.n_users))
        self.Ybar = self.Ybar.tocsr()

    def similarity(self):
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)

    def refresh(self):
        """
        Normalize data and calculate similarity matrix again
        """
        self.normalize_Y()
        self.similarity()

    def fit(self):
        self.refresh()
        
    def __pred(self, u, i, normalized = 1):
        """
        predict the rating of user u for item i (normalized)
        """
        # Step 1: find all users who rated i
        ids = np.where(self.Ydata[:, 1] == i)[0].astype(np.int32)
        # Step 2:
        users_rated_i = (self.Ydata[ids, 0]).astype(np.int32)
        # Step 3: find simularity btw the current user and others who already rated i
        sim = self.S[u, users_rated_i]
        # Step 4: find the k most similarity users
        a = np.argsort(sim)[-self.k:]
        # and the corresponding similarity levels
        nearest_s = sim[a]
        # How did each of 'near' users rated item i
        r = self.Ybar[i, users_rated_i[a]]
        if normalized:
            # add a small number, for instance, 1e-8, to avoid dividing by 0
            return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)

        return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8) + self.mu[u]

    def pred(self, u, i, normalized = 1):
        """
        predict the rating of user u for item i (normalized)
        """
        if self.uuCF: return self.__pred(u, i, normalized)
        return self.__pred(i, u, normalized)

    def recommend(self, u, normalized = 1):
        """
        Determine all items should be recommended for user u (uuCF=1)
        or all users who might have interest on item u (uuCF=0)
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which have not been rated by u yet
        """
        ids = np.where(self.Ydata[:, 0] == u)[0]
        items_rated_by_u = self.Ydata[ids, 1].tolist()
        recommended_items = []
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                rating = self.__pred(u, i)
                if rating > 0:
                    recommended_items.append(i)

        return recommended_items

    def print_recommendation(self):
        """
        print all items which should be recommended for each user
        """
        print('Recommendation: ')
        for u in range(self.n_users):
            recommended_items = self.recommend(u)
            if self.uuCF:
                print('Recommend item(s): ', recommended_items, 'to user', u)
            else:
                print('Recommend item', u, 'to user(s): ', recommended_items)

In [9]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

rating_base = pd.read_csv('ml-100k/ub.base', sep='\t', names=r_cols)
rating_test = pd.read_csv('ml-100k/ub.test', sep='\t', names=r_cols)

rate_train = rating_base.values
rate_test = rating_test.values

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

In [12]:
rs = CF(rate_train, k=30, uuCF=1)
rs.fit()

n_test = rate_test.shape[0]
SE = 0 # square error
for n in range(n_test):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1], normalized=0)
    SE += (pred - rate_test[n, 2])**2
    
RMSE = np.sqrt(SE/n_test)
print("User-user CF, RMSE = ", RMSE)

User-user CF, RMSE =  1.0369740376881258


In [13]:
rs = CF(rate_train, k=30, uuCF=0)
rs.fit()

n_test = rate_test.shape[0]
SE = 0 # square error
for n in range(n_test):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1], normalized=0)
    SE += (pred - rate_test[n, 2])**2
    
RMSE = np.sqrt(SE/n_test)
print("Item-item CF, RMSE = ", RMSE)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Item-item CF, RMSE =  1.0177591959372823
