# Neighborhood Based Recommender Systems
Here we implement a user-based recommender system on the MovieLens 100k Dataset.

In [1]:
# Importing packages
import numpy as np
import math

In [2]:
# numbers of users and items from u.info
# Hi Jason - Olivia
ITEMS = 1682
USERS = 943

## u1 data set
Here we work with the u1 data set.

In [3]:
# This function reads data from file and writes it to target, a numpy array
def read_data(file, target):
    with open(file) as f:
        line = f.readline()
        while line:
            user, item, rating, _ = line.split()
            target[int(user)-1][int(item)-1] = int(rating)
            line = f.readline()

In [4]:
# Here we read in the ratings in u1.base

u1_train_data = np.zeros((USERS, ITEMS))

read_data(".\\ml-100k\\u1.base", u1_train_data)


In [5]:
# Generate the average rating for each user

averages = [np.average(user, weights=user.astype(bool)) for user in u1_train_data]

# Find which items are rated by each user and which users rated each item
rated_items = [{k for k in range(ITEMS) if u1_train_data[i][k] > 0} for i in range(USERS)]
rated_users = [{k for k in range(USERS) if u1_train_data[k][i] > 0} for i in range(ITEMS)]

Now that most of the prep work is done, it's time to calculate the matrix of Pearson correlation values.

In [6]:
# This implements the Pearson metric as found in equation (2.2)

def pearson(user1, user2):
    intersection = rated_items[user1].intersection(rated_items[user2])
    if len(intersection) <= 1:
        return 0

    bottom1 = math.sqrt( sum([(u1_train_data[user1][k] - averages[user1]) ** 2 for k in intersection]))
    bottom2 = math.sqrt( sum([(u1_train_data[user2][k] - averages[user2]) ** 2 for k in intersection]))

    if bottom1 * bottom2 == 0:
        return 0
        
    top = sum([(u1_train_data[user1][k] - averages[user1]) * (u1_train_data[user2][k] - averages[user2]) for k in intersection])

    return top / (bottom1 * bottom2)

In [7]:
# I don't know if this is the way to go about this.
# It takes a long time to run. However, I think it'set
# necessary because we'll have to calculate these values
# sooner or later anyways.

pearson_matrix = np.zeros((USERS,USERS))
for user1 in range(USERS):
    # since the matrix is symmetric, we can save time by only calculating half the values
    for user2 in range(USERS):
        if user1 == user2:
            pearson_matrix[user1][user2] = 1
            continue
        pm = pearson(user1,user2) 
        if pm > 1:
            pm = 1
        pearson_matrix[user1][user2] = pm
        pearson_matrix[user1][user2] = pm
np.savetxt("pearsonmatrix.csv", pearson_matrix, delimiter=', ', newline='\n')

In [8]:
# Of the users who rated item, return the k with highest pearson correlation value with a specified user

def k_closest(user, item, k):
    pearson_sorted = sorted((i for i in range(USERS) if i in rated_users[item]), key=lambda x: pearson_matrix[user][x])[-k:]
    return pearson_sorted

In [9]:
# Implement r-hat as in equation 2.4

def r_hat(user, item, K):
    K_closest_users = k_closest(user, item, K)
    # Come up with something to do if the k_closest is empty
    if len(K_closest_users) == 0:
        raise ZeroDivisionError
    top = sum(pearson_matrix[user][user2] * (u1_train_data[user2][item] - averages[user2]) for user2 in K_closest_users)
    bot = sum(abs(pearson_matrix[user][user2]) for user2 in K_closest_users)

    return averages[user] + top/bot

In [10]:
# Here we read in the ratings in u1.test
u1_test_data = np.zeros((USERS,ITEMS))
read_data(".\\ml-100k\\u1.test", u1_test_data)

In [11]:
# RMSE as found in equation 7.5
# Calculated using the average ratings of each user as a baseline

squared_error_avg = 0
count = 0

for user in range(USERS):
    for item in range(ITEMS):
        if u1_train_data[user][item] == 0:
            continue
        squared_error_avg += (averages[user] - u1_test_data[user][item]) ** 2
        count += 1

RMSE_avg = math.sqrt(squared_error_avg / count)
print(RMSE_avg)

3.5566819767721967


In [25]:
from itertools import combinations

def user_kendall_coef(user):
    credit = 0
    test_items = {k for k in range(ITEMS) if u1_test_data[user][k] > 0}

    if len(test_items) <= 1:
        raise ZeroDivisionError

    for item1, item2 in combinations(test_items, 2):
        val = (u1_test_data[user][item1] - u1_test_data[user][item2]) * (u1_pred[user][item1] - u1_pred[user][item2])
        if val > 0:
            credit += 1
        elif val < 0:
            credit -= 1
    
    return credit / (len(test_items) * (len(test_items) - 1) / 2)

In [28]:
# Iterate through a range of neighborhood sizes
for K in range(2,21):
    # Predict ratings for each user-item pair in the test data.
    testing_size = 0
    error_count = 0
    u1_pred = np.zeros((USERS,ITEMS))
    for user in range(USERS):
        for item in range(ITEMS):
            if u1_test_data[user][item] == 0:
                continue
            try:
                pred_val = r_hat(user, item, K)
                u1_pred[user][item] = pred_val
                testing_size += 1
            except ZeroDivisionError:
                continue
            
    count = 0
    squared_error = 0

    # calculate the RMSE for each neighborhood size
    for user in range(USERS):
        for item in range(ITEMS):
            if u1_pred[user][item] == 0:
                continue
            squared_error += (u1_pred[user][item] - u1_test_data[user][item]) ** 2
            count += 1
    RMSE = math.sqrt(squared_error / count)

    # calculate the average Kendall rank correlation coefficient for each neighborhood size
    count = 0
    total_ken = 0
    for user in range(USERS):
        try:
            total_ken += user_kendall_coef(user)
            count += 1
        except ZeroDivisionError:
            continue

    print(K, RMSE, total_ken / count)

2 1.1471512227418752 0.14975463921548315
3 1.0832787727110045 0.17233290808699941
4 1.0454038909248713 0.19277338469043617
5 1.0239996622010517 0.20134443328930837
6 1.008972984940028 0.2104509200040167
7 0.9999452805411727 0.2190403578107192
8 0.9927131264171871 0.21913688307398607
9 0.9883334713150379 0.2261951630381763
10 0.9840243851860677 0.22588468894444083
11 0.9808089208383695 0.22726524222891462
12 0.977973031548584 0.22707379750826054
13 0.9751579727472001 0.22948748301322777
14 0.9735193959369345 0.23074415362878617
15 0.9716044487861084 0.23744638171501078
16 0.9695178634572917 0.23854863699821183
17 0.9685811441261292 0.24060656424976593
18 0.9676203482949787 0.24284339650480505
19 0.9665196173511043 0.2479393505259485
20 0.9655757162162828 0.2504336496396134
