# Neighborhood Based Recommender Systems
Here we implement a user-based recommender system on the MovieLens 100k Dataset.

In [1]:
# Importing packages
import numpy as np
import math
from scipy import sparse

## Movie Lens 1M
Here we work with the Movie Lens 1M data set, because the 20M one is a bit too big.

In [2]:
# setting up some constants
K_FOLD = 5

# numbers of users and items

ITEMS = 3952
USERS = 6040

In [3]:
# from random import randint

# # This function reads data from file and writes it to 
# def read_data(file, target):
#     with open(file) as f:
#         line = f.readline()
#         while line:
#             try:
#                 user, item, rating, _ = line.split(sep='::')
#                 target[randint(0,4)][int(user)-1, int(item)-1] = int(rating)
#             except ValueError:
#                 print(line)
#             line = f.readline()

# # Here we read in the ratings in ML-1M and store it in data_1m.npz

# data_1m = [sparse.lil_matrix((USERS, ITEMS), dtype=np.int8) for _ in range(K_FOLD)]

# read_data(".\\ml-1m\\ratings.dat", data_1m)

# for i in range(K_FOLD):
#     sparse.save_npz('data_1m_' + str(i) + ".npz", data_1m[i].tocsr())

In [4]:
# here we load the presplit datasets
data_1m = [sparse.csr_matrix((USERS, ITEMS), dtype=np.int8) for _ in range(K_FOLD)]

for i in range(K_FOLD):
    data_1m[i] = sparse.load_npz("data_1m_" + str(i) + ".npz")

In [5]:
# split the data into training and testing sets
def get_training_set(i):
    return sum([data_1m[k] for k in range(K_FOLD) if k != i])

train_data_0 = get_training_set(0)
test_data_0 = data_1m[0]

In [6]:
# Generate the average rating for each user

averages = [user.sum() / user.count_nonzero() for user in train_data_0]

In [7]:
def rated_items(user):
    return set(train_data_0[user].nonzero()[1])

In [28]:
import heapq

MAX_NEIGHBORS = 25

pearson_matrix = sparse.lil_matrix((USERS, USERS), dtype=np.float64)
pred_val = [sparse.lil_matrix((USERS,ITEMS)) for _ in range(MAX_NEIGHBORS)]

# This implements the Pearson metric as found in equation (2.2)
def pearson(user1, user2, rated_items_1):
    if user1 > user2:
        temp = user1
        user1 = user2
        user2 = temp
    
    if pearson_matrix[user1, user2] != 0.0:
        return pearson_matrix[user1, user2]

    if user1 == user2:
        pearson_matrix[user1, user2] = 1
        return 1

    intersection = list(rated_items_1.intersection(rated_items(user2)))

    if len(intersection) <= 10:
        pearson_matrix[user1, user2] = -1
        pearson_matrix[user2, user1] = -1
        return -1

    user1_ratings = train_data_0.getrow(user1).toarray()[0][intersection]
    user2_ratings = train_data_0.getrow(user2).toarray()[0][intersection]
    ones = np.ones_like(user1_ratings)
    user1_ratings = user1_ratings - ones * averages[user1]
    user2_ratings = user2_ratings - ones * averages[user2]
    bottom1 = math.sqrt(np.sum(np.power(user1_ratings, 2)))
    bottom2 = math.sqrt(np.sum(np.power(user2_ratings, 2)))

    bottom1 = math.sqrt( sum([(train_data_0[user1,k] - averages[user1]) ** 2 for k in intersection]))
    bottom2 = math.sqrt( sum([(train_data_0[user2,k] - averages[user2]) ** 2 for k in intersection]))

    if bottom1 * bottom2 == 0:
        pearson_matrix[user1, user2] = -1
        return -1

    top = np.dot(user1_ratings, user2_ratings)
    pearson_val = top / (bottom1 * bottom2)
    pearson_matrix[user1, user2] = pearson_val

    return pearson_val

for user1 in range(USERS):

    if user1 % 100 == 0:
        print(user1)

    rated_items_1 = rated_items(user1)

    for item in test_data_0[user1].nonzero()[1]:
        rated_users = sparse.find(train_data_0.tocsc().getcol(item))[0]
        
        pearson_user = []

        for user2 in rated_users:
            pcc = pearson(user1, user2, rated_items_1)
            if pcc >= 0:
                
                heapq.heappush(pearson_user, (-1 * pcc , user2))
        
        top = 0
        bot = 0
        avg = averages[user1]

        for k in range(MAX_NEIGHBORS):

            if len(pearson_user) == 0:
                try:
                    pred_val[k][user1, item] = avg if k == 0 else avg + top/bot
                except ZeroDivisionError:
                    print(user1, item, k)
                    pred_val[k][user1, item] = avg
                continue
            pear, user2 = heapq.heappop(pearson_user)
            
            # top = np.sum(pear * (train_data_0[user2,item] - averages[user2]) for pear, user2 in K_closest_users)
            # bot = np.sum(abs(pear) for pear, _ in K_closest_users)

            top += pear * (averages[user2] - train_data_0[user2,item])
            bot += abs(pear)

            try:
                pred_val[k][user1, item] = avg + top/bot
            except ZeroDivisionError:
                print(user1, item, k)
                pred_val[k][user1, item] = avg

            if r > 5:
                r = 5
            elif r < 0:
                r = 0

            pred_val[k][user1,item] = r
print(pred_val)
        

0
45 1101 1
45 1101 2
45 1101 3
45 1101 4
45 1101 5
45 1101 6
45 1101 7
45 1101 8
45 1101 9
45 1101 10
45 1101 11
45 1101 12
45 1101 13
45 1101 14
45 1101 15
45 1101 16
45 1101 17
45 1101 18
45 1101 19
45 1101 20
45 1101 21
45 1101 22
45 1101 23
45 1101 24
86 825 1
86 825 2
86 825 3
86 825 4
86 825 5
86 825 6
86 825 7
86 825 8
86 825 9
86 825 10
86 825 11
86 825 12
86 825 13
86 825 14
86 825 15
86 825 16
86 825 17
86 825 18
86 825 19
86 825 20
86 825 21
86 825 22
86 825 23
86 825 24
[<6040x3952 sparse matrix of type '<class 'numpy.float64'>'
	with 2683 stored elements in List of Lists format>, <6040x3952 sparse matrix of type '<class 'numpy.float64'>'
	with 2683 stored elements in List of Lists format>, <6040x3952 sparse matrix of type '<class 'numpy.float64'>'
	with 2683 stored elements in List of Lists format>, <6040x3952 sparse matrix of type '<class 'numpy.float64'>'
	with 2683 stored elements in List of Lists format>, <6040x3952 sparse matrix of type '<class 'numpy.float64'>'
	wit

In [26]:
# # Here we re
# ad in the ratings in u1.test
for k in range(MAX_NEIGHBORS):
    sparse.save_npz('pred_val_0_' + str(k) + '.npz', pred_val[k].tocsr())
# pred_val = sparse.load_npz("pred_val_0_5.npz")

In [16]:
# RMSE as found in equation 7.5
# Calculated using the average ratings of each user as a baseline

squared_error_avg = 0
count = 0

# for user in range(USERS):
#     for item in range(ITEMS):
#         if test_data_0[user, item] == 0:
#             continue
#         squared_error_avg += (pred_val[user, item] - test_data_0[user,item]) ** 2
#         count += 1

squared_error_avg = (pred_val - test_data_0).power(2).sum()
RMSE_avg = math.sqrt(squared_error_avg / test_data_0.count_nonzero())
print(RMSE_avg)

0.9650211325929156


In [24]:
from itertools import combinations

def user_kendall_coef(user):
    credit = 0
    i = 0
    test_items = set(test_data_0[user].nonzero()[1])

    if len(test_items) <= 1:
        raise ZeroDivisionError

    for item1, item2 in combinations(test_items, 2):
        val = (test_data_0[user,item1] - test_data_0[user,item2]) * (pred_val[user,item1] - pred_val[user,item2])
        if val > 0:
            i += 1
            credit += 1
        elif val < 0:
            i += 1
            credit -= 1
    
    return credit / i

count = 0
total_ken = 0
for user in range(100):
    if user % 100 == 0:
        print(user)
    try:
        total_ken += user_kendall_coef(user)
        count += 1
    except ZeroDivisionError:
        print(user, " error")
        continue

print(total_ken / count)

0
0.36599757481862794


In [None]:
# try variety of neighborhood sizes
# try hitrate

In [34]:
squared_error_avg = 0

for user in range(USERS):
    if user % 100 == 0:
        print(user)
    for item in test_data_0[user].indices:
        
        if test_data_0[user, item] != 0:
            squared_error_avg += (averages[user] - test_data_0[user, item]) ** 2

RMSE_avg = math.sqrt(squared_error_avg / test_data_0.count_nonzero())

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000


In [35]:
print(RMSE_avg)

1.034051748055273


In [None]:
# Iterate through a range of neighborhood sizes
for K in range(2,21):
    # Predict ratings for each user-item pair in the test data.
    testing_size = 0
    error_count = 0
    u1_pred = np.zeros((USERS,ITEMS))
    for user in range(USERS):
        for item in range(ITEMS):
            if u1_test_data[user][item] == 0:
                continue
            try:
                pred_val = r_hat(user, item, K)
                u1_pred[user][item] = pred_val
                testing_size += 1
            except ZeroDivisionError:
                continue
            
    count = 0
    squared_error = 0

    # calculate the RMSE for each neighborhood size
    for user in range(USERS):
        for item in range(ITEMS):
            if u1_pred[user][item] == 0:
                continue
            squared_error += (u1_pred[user][item] - u1_test_data[user][item]) ** 2
            count += 1
    RMSE = math.sqrt(squared_error / count)

    # calculate the average Kendall rank correlation coefficient for each neighborhood size
    count = 0
    total_ken = 0
    for user in range(USERS):
        try:
            total_ken += user_kendall_coef(user)
            count += 1
        except ZeroDivisionError:
            continue

    print(K, RMSE, total_ken / count)