# Neighborhood Based Recommender Systems
Here we implement a user-based recommender system on the MovieLens 1M dataset.

In [1]:
# Importing packages
import numpy as np
import math
from scipy import sparse

## MovieLens 1M
Here we work with the MovieLens 1M dataset, because the 20M version is a bit too big.

In [2]:
# Number of folds the ratings are split into.
K_FOLD = 5

# Dimensions of the user-item rating matrices: one row per user, one column per item.
USERS, ITEMS = 6040, 3952

In [3]:
# from random import randint

# # This function reads ratings from file and writes each one into a randomly chosen fold of target
# def read_data(file, target):
#     with open(file) as f:
#         line = f.readline()
#         while line:
#             try:
#                 user, item, rating, _ = line.split(sep='::')
#                 target[randint(0,4)][int(user)-1, int(item)-1] = int(rating)
#             except ValueError:
#                 print(line)
#             line = f.readline()

# # Here we read in the ratings in ML-1M and store it in data_1m.npz

# data_1m = [sparse.lil_matrix((USERS, ITEMS), dtype=np.int8) for _ in range(K_FOLD)]

# read_data(".\\ml-1m\\ratings.dat", data_1m)

# for i in range(K_FOLD):
#     sparse.save_npz('data_1m_' + str(i) + ".npz", data_1m[i].tocsr())

In [4]:
# Load the pre-split folds: one sparse matrix per fold, read from the
# data_1m_<fold>.npz files in the working directory.
data_1m = [
    sparse.load_npz("data_1m_" + str(fold) + ".npz")
    for fold in range(K_FOLD)
]

In [5]:
# Build the training / testing split from the stored folds.
def get_training_set(i):
    """Return the element-wise sum of every fold in data_1m except fold i."""
    return sum(data_1m[fold] for fold in range(K_FOLD) if fold != i)


# Fold 0 is held out for testing; the remaining folds form the training matrix.
test_data_0 = data_1m[0]
train_data_0 = get_training_set(0)

In [6]:
# Average training rating of each user, indexed by user row.
def _mean_rating(ratings_row):
    """Sum of a sparse row divided by its number of non-zero entries."""
    return ratings_row.sum() / ratings_row.count_nonzero()


averages = list(map(_mean_rating, train_data_0))

In [7]:
def rated_items(user):
    """Return the set of item (column) indices with a non-zero training rating for user."""
    _, item_columns = train_data_0[user].nonzero()
    return set(item_columns)

In [25]:
import heapq

# Neighbourhood settings for the user-based predictor.
N_NEIGHBORS = 5         # number of most-similar users that vote on each prediction
MIN_COMMON_ITEMS = 10   # user pairs sharing this many items or fewer count as dissimilar

# Bounds applied to every prediction. A zero entry means "no rating" in the sparse
# matrices, so the lower bound is the smallest real MovieLens rating (1) rather than 0;
# a prediction clamped to 0 would be indistinguishable from a missing one.
MIN_RATING = 1
MAX_RATING = 5

# Cache of pairwise similarities. 0.0 doubles as the "not computed yet" marker, so the
# matrix must start zero-filled; np.empty would leave arbitrary values that the cache
# check below could return as if they were real similarities.
pearson_matrix = np.zeros((USERS, USERS), dtype=np.float64)


def _cache_similarity(user1, user2, value):
    """Store a similarity symmetrically in pearson_matrix and return it."""
    pearson_matrix[user1, user2] = value
    pearson_matrix[user2, user1] = value
    return value


def _centered_ratings(user, items):
    """Training ratings of user on items, shifted by that user's average rating."""
    ratings = train_data_0.getrow(user).toarray()[0][items]
    # Cast before subtracting so the int8 storage type never limits the result.
    return ratings.astype(np.float64) - averages[user]


# This implements the Pearson metric as found in equation (2.2)
def pearson(user1, user2, rated_items_1):
    """Pearson correlation between two users over the items both have rated.

    Each user's ratings are centred on that user's entry in averages. Results are
    cached symmetrically in pearson_matrix.

    Args:
        user1: row index of the first user.
        user2: row index of the second user.
        rated_items_1: set of item indices rated by user1, passed in so the caller
            computes it once per user rather than once per pair.

    Returns:
        1 if both indices are the same user; -1 if the users share MIN_COMMON_ITEMS
        items or fewer, or if either user's centred ratings on the shared items are
        all zero; otherwise the correlation value.
    """
    cached = pearson_matrix[user1, user2]
    if cached != 0.0:
        return cached

    if user1 == user2:
        return _cache_similarity(user1, user2, 1)

    intersection = list(rated_items_1.intersection(rated_items(user2)))
    if len(intersection) <= MIN_COMMON_ITEMS:
        return _cache_similarity(user1, user2, -1)

    user1_ratings = _centered_ratings(user1, intersection)
    user2_ratings = _centered_ratings(user2, intersection)

    bottom1 = math.sqrt(np.sum(np.power(user1_ratings, 2)))
    bottom2 = math.sqrt(np.sum(np.power(user2_ratings, 2)))
    if bottom1 * bottom2 == 0:
        # One user rated every shared item exactly at their own average.
        return _cache_similarity(user1, user2, -1)

    top = np.dot(user1_ratings, user2_ratings)
    return _cache_similarity(user1, user2, top / (bottom1 * bottom2))


pred_val = sparse.lil_matrix((USERS, ITEMS))

# Column-oriented copy of the training matrix, converted once instead of once per
# test item; it is only used to look up which users rated a given item.
train_data_0_csc = train_data_0.tocsc()

for user1 in range(USERS):
    print(user1)
    rated_items_1 = rated_items(user1)

    for item in test_data_0[user1].nonzero()[1]:
        rated_users = sparse.find(train_data_0_csc.getcol(item))[0]

        # Candidate neighbours: users who rated this item and are not negatively
        # correlated with user1.
        pearson_user = []
        for user2 in rated_users:
            pcc = pearson(user1, user2, rated_items_1)
            if pcc >= 0:
                pearson_user.append((pcc, user2))

        K_closest_users = heapq.nlargest(N_NEIGHBORS, pearson_user)

        # Built-in sum: calling np.sum on a generator is deprecated.
        top = sum(pear * (train_data_0[user2, item] - averages[user2])
                  for pear, user2 in K_closest_users)
        bot = sum(abs(pear) for pear, _ in K_closest_users)

        if bot == 0:
            # No neighbours, or only neighbours with zero similarity: fall back to
            # the user's own average instead of dividing by zero.
            pred_val[user1, item] = averages[user1]
            continue

        r = averages[user1] + top / bot
        pred_val[user1, item] = min(max(r, MIN_RATING), MAX_RATING)
print(pred_val)
        

0


  top = np.sum(pear * (train_data_0[user2,item] - averages[user2]) for pear, user2 in K_closest_users)
  bot = np.sum(abs(pear) for pear, _ in K_closest_users)


1
2
3
4
5
6
7
8
9
  (0, 0)	5.062764778825982
  (0, 149)	4.635955450706121
  (0, 587)	4.799661605317113
  (0, 719)	5.366736538411148
  (0, 782)	4.3394999604319064
  (0, 1196)	5.114346377065084
  (0, 1544)	4.374111692461498
  (0, 2017)	4.977497631764379
  (0, 2027)	4.757058659655483
  (0, 2339)	3.5864991351820166
  (0, 2686)	4.280012320510714
  (0, 2691)	4.899545744228838
  (0, 2790)	4.557414504960032
  (0, 2796)	4.456814464281303
  (0, 2917)	4.912379717923426
  (0, 3104)	4.12369223630969
  (1, 291)	3.4694555509251535
  (1, 355)	4.602268131241613
  (1, 1197)	4.673378433003522
  (1, 1384)	3.6487405927907894
  (1, 1551)	4.151628357112327
  (1, 1596)	4.029945929882513
  (1, 1686)	3.2938342026542573
  (1, 1791)	3.6184083292459017
  (1, 1833)	3.8690886303154324
  (1, 1944)	3.727427763374916
  (1, 2005)	4.432002780120445
  (1, 2027)	4.262977685648167
  (1, 2267)	4.033073207776188
  (1, 2395)	4.061608659538027
  (1, 2489)	3.9358677725780815
  (1, 2716)	3.203605942810455
  (1, 2915)	3.7740626588

In [None]:
# # Here we read in the ratings in u1.test
# u1_test_data = np.zeros((USERS,ITEMS))
# read_data(".\\ml-100k\\u1.test", u1_test_data)

In [None]:
# RMSE as found in equation 7.5
# Baseline: every held-out rating is predicted by that user's average training rating.

# Only the rated (non-zero) entries of the held-out fold are scored. sparse.find
# returns their row indices, column indices and values, so no dense USERS x ITEMS
# scan is needed.
test_users, test_item_ids, test_ratings = sparse.find(test_data_0)

baseline_errors = np.asarray(averages, dtype=np.float64)[test_users] - test_ratings
squared_error_avg = float(np.sum(baseline_errors ** 2))
count = len(test_ratings)

RMSE_avg = math.sqrt(squared_error_avg / count)
print(RMSE_avg)

In [None]:
from itertools import combinations

def user_kendall_coef(user, test_data=None, predictions=None):
    """Kendall rank correlation between one user's held-out and predicted ratings.

    Every unordered pair of items the user rated in the test data scores +1 when the
    predictions order the pair the same way as the true ratings, -1 when they order
    it the opposite way, and 0 when either side is tied.

    Args:
        user: row index of the user.
        test_data: 2-D indexable of held-out ratings where a value of 0 or less
            means "not rated". Defaults to the notebook-level u1_test_data.
        predictions: 2-D indexable of predicted ratings aligned with test_data.
            Defaults to the notebook-level u1_pred.

    Returns:
        The summed pair scores divided by the number of pairs.

    Raises:
        ZeroDivisionError: if the user has fewer than two held-out ratings, so no
            pair exists. Callers catch this to skip the user.
    """
    if test_data is None:
        test_data = u1_test_data
    if predictions is None:
        predictions = u1_pred

    actual = test_data[user]
    predicted = predictions[user]
    test_items = [k for k in range(len(actual)) if actual[k] > 0]

    if len(test_items) <= 1:
        raise ZeroDivisionError("user has fewer than two held-out ratings")

    credit = 0
    for item1, item2 in combinations(test_items, 2):
        # The product is positive when both differences have the same sign.
        val = (actual[item1] - actual[item2]) * (predicted[item1] - predicted[item2])
        if val > 0:
            credit += 1
        elif val < 0:
            credit -= 1

    return credit / (len(test_items) * (len(test_items) - 1) / 2)

In [None]:
# Iterate through a range of neighborhood sizes
# NOTE(review): r_hat and u1_test_data are not defined in any visible cell of this
# notebook; this cell presumably expects them from an earlier version. Confirm
# before running.
for K in range(2, 21):
    # Predict ratings for each user-item pair in the test data, accumulating the
    # squared error in the same pass.
    testing_size = 0      # number of test ratings that received a prediction
    squared_error = 0
    u1_pred = np.zeros((USERS, ITEMS))
    for user in range(USERS):
        for item in range(ITEMS):
            if u1_test_data[user][item] == 0:
                continue
            try:
                # Named "prediction" so the pred_val matrix built in the earlier
                # cell is not overwritten by a scalar.
                prediction = r_hat(user, item, K)
            except ZeroDivisionError:
                # r_hat could not form a prediction for this pair; skip it.
                continue
            u1_pred[user][item] = prediction
            squared_error += (prediction - u1_test_data[user][item]) ** 2
            testing_size += 1

    # calculate the RMSE for each neighborhood size
    RMSE = math.sqrt(squared_error / testing_size) if testing_size else float("nan")

    # calculate the average Kendall rank correlation coefficient for each neighborhood size
    count = 0
    total_ken = 0
    for user in range(USERS):
        try:
            total_ken += user_kendall_coef(user)
            count += 1
        except ZeroDivisionError:
            # Raised by user_kendall_coef for users with fewer than two test ratings.
            continue

    mean_ken = total_ken / count if count else float("nan")
    print(K, RMSE, mean_ken)