In [1]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 

In [2]:
class NBCF(object):
    def __init__(self, data,user_col,item_col,rating_col, k_nearest_neighbor = 5, dist_func=cosine_similarity):
        self.data = data
        self.user_col = user_col
        self.item_col = item_col
        self.rating_col = rating_col
        
        self.users = self.data[user_col].unique()
        self.items = self.data[item_col].unique()
        self.user_to_idx = dict(zip(list(self.users),list(range(self.users.size))))
        self.item_to_idx = dict(zip(list(self.items),list(range(self.items.size))))
        
        self.k_nearest_neighbor = k_nearest_neighbor
        self.dist_func = dist_func
        
    def normalize(self):
        self.data_norm = self.data.copy()
        self.mu = {}
        for user in self.users:
            user_mean = self.data[self.data[self.user_col] == user][self.rating_col].mean()
            self.mu[user] = user_mean
            self.data_norm.loc[self.data[self.user_col] == user, self.rating_col] -= user_mean
        
        self.matrix = sparse.coo_matrix((self.data_norm[self.rating_col],
            (self.data_norm[self.item_col].apply(lambda x: self.item_to_idx[x]), 
             self.data_norm[self.user_col].apply(lambda x: self.user_to_idx[x]))), 
            (self.items.size, self.users.size))
        
        self.matrix = self.matrix.tocsr()
        
    def similarity(self):
        self.S = self.dist_func(self.matrix.T, self.matrix.T)     
        
    def fit(self):
        self.normalize()
        self.similarity()

    def predict_rating(self, userID, itemID):
#         print(userID,itemID)
        rating_for_item = self.data[self.data[self.item_col] == itemID]
#         print(rating_for_item)
        user_rated_item_idx = np.fromiter(map(self.user_to_idx.__getitem__, rating_for_item[self.user_col]),dtype=np.int)
        similarity_with_user = self.S[self.user_to_idx[userID],user_rated_item_idx]
        nearest_users_idx = np.argsort(similarity_with_user)[-self.k_nearest_neighbor:] 
        nearest_distances = similarity_with_user[nearest_users_idx]
        nearest_rating = self.matrix[1, user_rated_item_idx[nearest_users_idx]]

        return (nearest_rating*nearest_distances)[0]/(np.abs(nearest_distances).sum() + 1e-8) + self.mu[userID]

In [3]:
# test movielens
train_data = pd.read_csv('ml-100k/ub.base',sep='\t',header=None)

rs = NBCF(train_data,user_col=0,item_col=1,rating_col=2,k_nearest_neighbor=30)
rs.fit()

rate_test = pd.read_csv('ml-100k/ub.test',sep='\t',header=None)
n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in range(n_tests):
#     print(rate_test.iloc[n, 0], rate_test.iloc[n, 1])
    pred = rs.predict_rating(rate_test.iloc[n, 0], rate_test.iloc[n, 1])
#     print(pred,rate_test.iloc[n, 2])
    SE += (pred - rate_test.iloc[n, 2])**2 

RMSE = np.sqrt(SE/n_tests)
print('RMSE =', RMSE)

RMSE = 1.0691429261084866


In [5]:
# test our data
data = pd.read_csv('crawl-ratings/train.csv')

rs = NBCF(data,user_col='UserID',item_col='MovieID',rating_col='Rating',k_nearest_neighbor=30)
rs.fit()

In [6]:
test_data=pd.read_csv('crawl-ratings/test.csv')
output_file = open('crawl-ratings/predict.csv','w')
output_file.write('UserID,MovieID,Rating\n')
SE = 0
for i, row in test_data.iterrows():
    pred = rs.predict_rating(row['UserID'],row['MovieID'])
    output_file.write('{},{},{}\n'.format(row['UserID'],row['MovieID'],pred))
    SE += (pred - row['Rating'])**2 

RMSE = np.sqrt(SE/test_data.shape[0])
print('RMSE =', RMSE)
output_file.close()

RMSE = 2.697235202403294
