In [None]:
class NBCF(object):
    def __init__(self, Y_data, k, dist_func = cosine_similarity, uuCF = 1):
        self.uuCF = uuCF # user-user (1) or item-item (0) CF
        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = Y_data.UserID.nunique()
        self.n_items = Y_data.MovieID.nunique()
        
        Y_data = Y_data.as_matrix()
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]]
        self.k = k
        self.dist_func = dist_func
        self.Ybar_data = None

    
    def add(self, new_data):
        """
        Update Y_data matrix when new ratings come.
        For simplicity, suppose that there is no new user or item.
        """
        self.Y_data = np.concatenate((self.Y_data, new_data), axis = 0)
    
    def normalize_Y(self):
        users = self.Y_data[:, 0] # all users - first col of the Y_data
        self.Ybar_data = self.Y_data.copy()
        self.mu = np.zeros((self.n_users,))
        for n in range(self.n_users):
            # row indices of rating done by user n
            # since indices need to be integers, we need to convert
            ids = np.where(users == n)[0].astype(np.int32)
            # indices of all ratings associated with user n
            item_ids = self.Y_data[ids, 1] 
            # and the corresponding ratings 
            ratings = self.Y_data[ids, 2]
            print(ratings)
            # take mean
            m = np.mean(ratings) 
            if np.isnan(m):
                m = 0 # to avoid empty array and nan value
            self.mu[n] = m
            # normalize
            self.Ybar_data[ids, 2] = ratings - self.mu[n]

        ################################################
        # form the rating matrix as a sparse matrix. Sparsity is important 
        # for both memory and computing efficiency. For example, if #user = 1M, 
        # #item = 100k, then shape of the rating matrix would be (100k, 1M), 
        # you may not have enough memory to store this. Then, instead, we store 
        # nonzeros only, and, of course, their locations.
        self.Ybar = sparse.coo_matrix((self.Ybar_data[:, 2],
            (self.Ybar_data[:, 1], self.Ybar_data[:, 0])), (self.n_items, self.n_users))
        self.Ybar = self.Ybar.tocsr()

    def similarity(self):
        eps = 1e-6
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)
    
        
    def refresh(self):
        """
        Normalize data and calculate similarity matrix again (after
        some few ratings added)
        """
        self.normalize_Y()
        self.similarity() 
        
    def fit(self):
        self.refresh()
        
    
    def __pred(self, u, i, normalized = 1):
        """ 
        predict the rating of user u for item i (normalized)
        if you need the un
        """
        # Step 1: find all users who rated i
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32)
        # Step 2: 
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        # Step 3: find similarity btw the current user and others 
        # who already rated i
        sim = self.S[u, users_rated_i]
        # Step 4: find the k most similarity users
        a = np.argsort(sim)[-self.k:] 
        # and the corresponding similarity levels
        nearest_s = sim[a]
        # How did each of 'near' users rated item i
        r = self.Ybar[i, users_rated_i[a]]
        if normalized:
            # add a small number, for instance, 1e-8, to avoid dividing by 0
            return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)

        return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8) + self.mu[u]
    
    def pred(self, u, i, normalized = 1):
        """ 
        predict the rating of user u for item i (normalized)
        if you need the un
        """
        if self.uuCF: return self.__pred(u, i, normalized)
        return self.__pred(i, u, normalized)
            
    
    def recommend(self, u):
        """
        Determine all items should be recommended for user u.
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which 
        have not been rated by u yet. 
        """
        ids = np.where(self.Y_data[:, 0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()              
        recommended_items = []
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                rating = self.__pred(u, i)
                if rating > 0: 
                    recommended_items.append(i)
        
        return recommended_items 
    
    def recommend2(self, u):
        """
        Determine all items should be recommended for user u.
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which 
        have not been rated by u yet. 
        """
        ids = np.where(self.Y_data[:, 0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()              
        recommended_items = []
    
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                rating = self.__pred(u, i)
                if rating > 0: 
                    recommended_items.append(i)
        
        return recommended_items 

    def print_recommendation(self):
        """
        print all items which should be recommended for each user 
        """
        print('Recommendation: ')
        for u in range(self.n_users):
            recommended_items = self.recommend(u)
            if self.uuCF:
                print('    Recommend item(s):', recommended_items, 'for user', u)
            else: 
                print('    Recommend item', u, 'for user(s) : ', recommended_items)

In [227]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 

In [228]:
class NBCF(object):
    def __init__(self, data,user_col,item_col,rating_col, k_nearest_neighbor = 5, dist_func=cosine_similarity):
        self.data = data
        self.user_col = user_col
        self.item_col = item_col
        self.rating_col = rating_col
        
        self.users = self.data[user_col].unique()
        self.items = self.data[item_col].unique()
        self.user_to_idx = dict(zip(list(self.users),list(range(self.users.size))))
        self.item_to_idx = dict(zip(list(self.items),list(range(self.items.size))))
        
        self.k_nearest_neighbor = k_nearest_neighbor
        self.dist_func = dist_func
        
    def normalize(self):
        self.data_norm = self.data.copy()
        self.mu = {}
        for user in self.users:
            user_mean = self.data[self.data[self.user_col] == user][self.rating_col].mean()
            self.mu[user] = user_mean
            self.data_norm.loc[self.data[self.user_col] == user, self.rating_col] -= user_mean
        
        self.matrix = sparse.coo_matrix((self.data_norm[self.rating_col],
            (self.data_norm[self.item_col].apply(lambda x: self.item_to_idx[x]), 
             self.data_norm[self.user_col].apply(lambda x: self.user_to_idx[x]))), 
            (self.items.size, self.users.size))
        
        self.matrix = self.matrix.tocsr()
        
    def similarity(self):
        self.S = self.dist_func(self.matrix.T, self.matrix.T)     
        
    def fit(self):
        self.normalize()
        self.similarity()

    def predict_rating(self, userID, itemID):
#         print(userID,itemID)
        rating_for_item = self.data[self.data[self.item_col] == itemID]
#         print(rating_for_item)
        user_rated_item_idx = np.fromiter(map(self.user_to_idx.__getitem__, rating_for_item[self.user_col]),dtype=np.int)
#         print(user_rated_item_idx)
#         print(self.S.size)
#         print(self.user_to_idx[userID],user_rated_item_idx)
        similarity_with_user = self.S[self.user_to_idx[userID],user_rated_item_idx]
        nearest_users_idx = np.argsort(similarity_with_user)[-self.k_nearest_neighbor:] 
        nearest_distances = similarity_with_user[nearest_users_idx]
        nearest_rating = self.matrix[1, user_rated_item_idx[nearest_users_idx]]
#         print(nearest_rating.size,nearest_distances.size)
#         print(nearest_rating)
#         print(nearest_distances)
#         print(similarity_with_user)
#         print(nearest_rating*nearest_distances)
#         print(np.abs(nearest_distances).sum())

        return (nearest_rating*nearest_distances)[0]/(np.abs(nearest_distances).sum() + 1e-8) + self.mu[userID]

In [219]:
# movielens
train_data = pd.read_csv('ml-100k/ub.base',sep='\t',header=None)

rs = NBCF(train_data,user_col=0,item_col=1,rating_col=2,k_nearest_neighbor=30)
rs.fit()

rate_test = pd.read_csv('ml-100k/ub.test',sep='\t',header=None)
n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in range(n_tests):
#     print(rate_test.iloc[n, 0], rate_test.iloc[n, 1])
    pred = rs.predict_rating(rate_test.iloc[n, 0], rate_test.iloc[n, 1])
#     print(pred,rate_test.iloc[n, 2])
    SE += (pred - rate_test.iloc[n, 2])**2 

RMSE = np.sqrt(SE/n_tests)
print('User-user CF, RMSE =', RMSE)

User-user CF, RMSE = 1.0691429261084866


In [229]:
# data file 
data = pd.read_csv('crawl-ratings/train.csv')

rs = NBCF(data,user_col='UserID',item_col='MovieID',rating_col='Rating',k_nearest_neighbor=30)
rs.fit()


In [236]:
test_data=pd.read_csv('crawl-ratings/test.csv')
output_file = open('crawl-ratings/predict.csv','w')
output_file.write('UserID,MovieID,Rating\n')
SE = 0
for i, row in test_data.iterrows():
    if i>=1000:
        break
    pred = rs.predict_rating(row['UserID'],row['MovieID'])
    output_file.write('{},{},{}\n'.format(row['UserID'],row['MovieID'],pred))
    print('.',end='')
    SE += (pred - row['Rating'])**2 

RMSE = np.sqrt(SE/1000)#test_data.shape[0])
print('RMSE =', RMSE)
output_file.close()

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................