# Neighborhood Based Recommender Systems
Here we implement a user-based recommender system on the MovieLens 100k Dataset.

In [1]:
# Importing packages
import numpy as np
import math

In [2]:
# Dimensions of the rating matrix: the numbers of users and items from u.info.
USERS = 943
ITEMS = 1682

# Neighborhood size: how many of the most similar users are consulted
# when predicting a single rating.
K = 5

## u1 data set
Here we work with the u1 data set.

In [3]:
def read_data(file, target):
    """Read ratings from a text file and write them into ``target``.

    Every non-blank line must contain exactly four whitespace-separated
    fields: user id, item id, rating, and a fourth field that is ignored.
    Ids in the file are 1-based and are shifted to 0-based indices, so a
    line ``"3 7 4 ..."`` sets ``target[2][6] = 4``.

    Args:
        file: Path of the ratings file to read.
        target: Two-level indexable container (e.g. a 2-D numpy array)
            addressed as ``target[user][item]``. It is modified in place;
            cells not mentioned in the file are left untouched.

    Raises:
        ValueError: If a non-blank line does not have exactly four fields,
            or if the user, item or rating field is not an integer.
        IndexError: If an id is larger than the matching dimension of
            ``target``.
    """
    with open(file) as f:
        # Iterating the handle streams the file one line at a time.
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines (for example a trailing newline at the end of
                # the file) carry no rating, so skip them instead of failing
                # on the four-way unpack below.
                continue
            user, item, rating, _ = fields
            target[int(user) - 1][int(item) - 1] = int(rating)

In [4]:
# Here we read in the ratings in u1.base (the training split).
# The matrix has one row per user and one column per item; a cell that stays
# at 0 means that user has no rating for that item.
u1_train_data = np.zeros((USERS, ITEMS))

# Forward slashes are understood on Windows as well as on POSIX systems,
# whereas the backslash-separated form only resolves on Windows.
read_data("./ml-100k/u1.base", u1_train_data)


In [5]:
# Generate the average rating for each user.
# Only observed (non-zero) cells count towards the mean. Unrated cells are 0,
# so a plain row sum already equals the sum of the observed ratings.
rating_counts = (u1_train_data != 0).sum(axis=1)
rating_sums = u1_train_data.sum(axis=1)
# A user with no training ratings has no defined mean (np.average would raise
# ZeroDivisionError for all-zero weights); use 0.0 as a neutral placeholder.
averages = [
    total / count if count else 0.0
    for total, count in zip(rating_sums, rating_counts)
]

# Find which items are rated by each user and which users rated each item.
# Each entry is a set of 0-based indices, built from a boolean mask rather
# than by testing every matrix cell in a Python-level loop.
rated_items = [set(np.flatnonzero(row > 0).tolist()) for row in u1_train_data]
rated_users = [set(np.flatnonzero(col > 0).tolist()) for col in u1_train_data.T]

Now that most of the prep work is done, it's time to calculate the matrix of Pearson correlation values.

In [6]:
def pearson(user1, user2):
    """Pearson correlation between two users, as found in equation (2.2).

    The sums run over the items both users have rated. Each rating is
    centred on that user's entry in ``averages`` (the mean over all of the
    user's training ratings, not only the co-rated ones).

    Returns 0 when the users share at most one rated item, or when either
    user's centred ratings over the shared items are all zero (the
    denominator would vanish).
    """
    common = rated_items[user1].intersection(rated_items[user2])
    if len(common) < 2:
        return 0

    row1, row2 = u1_train_data[user1], u1_train_data[user2]
    mean1, mean2 = averages[user1], averages[user2]

    # Centred ratings over the shared items, in one consistent item order.
    dev1 = [row1[k] - mean1 for k in common]
    dev2 = [row2[k] - mean2 for k in common]

    norm1 = math.sqrt(sum(d ** 2 for d in dev1))
    norm2 = math.sqrt(sum(d ** 2 for d in dev2))
    denominator = norm1 * norm2
    if denominator == 0:
        return 0

    covariance = sum(a * b for a, b in zip(dev1, dev2))
    return covariance / denominator

In [7]:
# Precompute the correlation for every pair of users. This takes a long time
# to run, but the predictions below look these values up repeatedly, so they
# would have to be calculated sooner or later anyway.

pearson_matrix = np.zeros((USERS, USERS))
for user1 in range(USERS):
    # Every user correlates perfectly with themselves.
    pearson_matrix[user1][user1] = 1
    # Since the matrix is symmetric, we save time by computing each unordered
    # pair only once and mirroring the value into the other triangle.
    for user2 in range(user1 + 1, USERS):
        # Floating-point rounding can push the ratio marginally outside the
        # valid range, so clamp it to [-1, 1].
        pm = max(-1, min(1, pearson(user1, user2)))
        pearson_matrix[user1][user2] = pm
        pearson_matrix[user2][user1] = pm
# Keep a copy on disk so the matrix can be inspected without recomputing it.
np.savetxt("pearsonmatrix.csv", pearson_matrix, delimiter=', ', newline='\n')

In [8]:
def k_closest(user, item, k):
    """Return the ``k`` users most correlated with ``user`` among those who rated ``item``.

    Candidates are the members of ``rated_users[item]`` other than ``user``
    itself, ranked by ``pearson_matrix[user][candidate]``.

    Args:
        user: 0-based index of the target user.
        item: 0-based index of the item.
        k: Maximum number of neighbours to return.

    Returns:
        A list of at most ``k`` user indices in ascending order of
        correlation (the most similar user is last). Among equally
        correlated candidates the higher indices are kept. The list is empty
        when ``k <= 0`` or when no other user rated the item.
    """
    if k <= 0:
        # A slice of [-0:] would return the whole list rather than nothing.
        return []
    # The target user is never their own neighbour, even if they already
    # rated the item. Start from ascending index order so that the stable
    # sort below breaks ties deterministically.
    candidates = sorted(other for other in rated_users[item] if other != user)
    candidates.sort(key=lambda other: pearson_matrix[user][other])
    return candidates[-k:]

In [9]:
def r_hat(user, item):
    """Predict the rating of ``item`` by ``user``, as in equation 2.4.

    The prediction is the user's own average rating plus a weighted average
    of how far each of the ``K`` closest neighbours rated the item above or
    below their own average. The weights are the Pearson correlations, and
    the sum is normalised by the total of their absolute values.

    Args:
        user: 0-based index of the target user.
        item: 0-based index of the item.

    Returns:
        The predicted rating. It is not restricted to the rating scale.

    Raises:
        ZeroDivisionError: If no neighbour rated the item, or if every
            selected neighbour has zero correlation with ``user`` so that
            the normalising sum is zero.
    """
    K_closest_users = k_closest(user, item, K)
    # TODO: come up with a fallback prediction for when there are no neighbours.
    if len(K_closest_users) == 0:
        raise ZeroDivisionError("no neighbour has rated this item")

    top = sum(
        pearson_matrix[user][user2] * (u1_train_data[user2][item] - averages[user2])
        for user2 in K_closest_users
    )
    bot = sum(abs(pearson_matrix[user][user2]) for user2 in K_closest_users)

    # NumPy scalars do not raise on division by zero; they emit a warning and
    # produce nan/inf. Raise explicitly so callers catching ZeroDivisionError
    # see this case as well.
    if bot == 0:
        raise ZeroDivisionError("all selected neighbours have zero correlation")

    return averages[user] + top / bot

In [10]:
# Here we read in the ratings in u1.test (the held-out split).
# Same layout as the training matrix: a cell left at 0 means "no rating".
u1_test_data = np.zeros((USERS,ITEMS))
# Forward slashes are understood on Windows as well as on POSIX systems,
# whereas the backslash-separated form only resolves on Windows.
read_data("./ml-100k/u1.test", u1_test_data)

In [12]:
# Predict ratings for each user-item pair in the test data.
testing_size = 0  # test pairs that received a prediction
error_count = 0   # test pairs for which no usable prediction could be made
# A cell left at 0 means "no prediction was made for this pair".
u1_pred = np.zeros((USERS,ITEMS))
for user in range(USERS):
    for item in range(ITEMS):
        if u1_test_data[user][item] == 0:
            # Not part of the test set.
            continue
        try:
            pred_val = r_hat(user, item)
        except ZeroDivisionError:
            # r_hat could not form a neighbourhood for this pair; count it
            # rather than dropping it silently.
            error_count += 1
            continue
        if not math.isfinite(pred_val):
            # A nan/inf prediction would poison every later aggregate.
            error_count += 1
            continue
        u1_pred[user][item] = pred_val
        testing_size += 1

In [13]:
# RMSE as found in equation 7.5, taken over the pairs that were predicted.

squared_error = 0
count = 0

# Walk the prediction and test matrices row by row, cell by cell.
for predicted_row, actual_row in zip(u1_pred, u1_test_data):
    for predicted, actual in zip(predicted_row, actual_row):
        # A prediction of exactly 0 marks a pair that was never predicted.
        if predicted != 0:
            squared_error += (predicted - actual) ** 2
            count += 1

RMSE = math.sqrt(squared_error / count)
print(RMSE)

1.0239996622010517
