# Neighborhood Based Recommender Systems
Here we implement a user-based recommender system on the MovieLens 100k Dataset.

In [1]:
# Importing packages
import numpy as np
import math
from scipy import sparse

## Movie Lens 20M
Here we work with the Movie Lens 20M data set.

In [2]:
# numbers of users and items

ITEMS = 131262
USERS = 138493

In [5]:
# This function reads data from file and writes it to target, a numpy array
def read_data(file, target):
    with open(file) as f:
        line = f.readline()
        line = f.readline()
        while line:
            try:
                user, item, rating, _ = line.split(sep=',')
                target[int(user)-1, int(item)-1] = int(float(rating) * 2)
            except ValueError:
                print(line) 
            line = f.readline()



In [6]:
# Here we read in the ratings in 20M

#data_20m = np.zeros((USERS, ITEMS), dtype=int)

data_20m = sparse.lil_matrix((USERS, ITEMS), dtype=np.int8)

read_data(".\\ml-20M\\rating.csv", data_20m)


In [7]:
print(data_20m.count_nonzero())

20000263


In [10]:
data_20m = data_20m.tocsr()

In [11]:
sparse.save_npz("data_20m", data_20m)

In [12]:
data_20m_loaded = sparse.load_npz("data_20m.npz")


In [13]:
print(data_20m_loaded.count_nonzero())

20000263


In [15]:
print(type(data_20m_loaded))

<class 'scipy.sparse.csr.csr_matrix'>


In [None]:
# Todo: split into train and testing sets

In [5]:
# Generate the average rating for each user

averages = [np.average(user, weights=user.astype(bool)) for user in u1_train_data]

# Find which items are rated by each user and which users rated each item
rated_items = [{k for k in range(ITEMS) if u1_train_data[i][k] > 0} for i in range(USERS)]
rated_users = [{k for k in range(USERS) if u1_train_data[k][i] > 0} for i in range(ITEMS)]

Now that most of the prep work is done, it's time to calculate the matrix of Pearson correlation values.

In [6]:
# This implements the Pearson metric as found in equation (2.2)

def pearson(user1, user2):
    intersection = rated_items[user1].intersection(rated_items[user2])
    if len(intersection) <= 1:
        return 0

    bottom1 = math.sqrt( sum([(u1_train_data[user1][k] - averages[user1]) ** 2 for k in intersection]))
    bottom2 = math.sqrt( sum([(u1_train_data[user2][k] - averages[user2]) ** 2 for k in intersection]))

    if bottom1 * bottom2 == 0:
        return 0
        
    top = sum([(u1_train_data[user1][k] - averages[user1]) * (u1_train_data[user2][k] - averages[user2]) for k in intersection])

    return top / (bottom1 * bottom2)

In [7]:
# I don't know if this is the way to go about this.
# It takes a long time to run. However, I think it'set
# necessary because we'll have to calculate these values
# sooner or later anyways.

pearson_matrix = np.zeros((USERS,USERS))
for user1 in range(USERS):
    # since the matrix is symmetric, we can save time by only calculating half the values
    for user2 in range(USERS):
        if user1 == user2:
            pearson_matrix[user1][user2] = 1
            continue
        pm = pearson(user1,user2) 
        if pm > 1:
            pm = 1
        pearson_matrix[user1][user2] = pm
        pearson_matrix[user1][user2] = pm
np.savetxt("pearsonmatrix.csv", pearson_matrix, delimiter=', ', newline='\n')

In [8]:
# Of the users who rated item, return the k with highest pearson correlation value with a specified user

def k_closest(user, item, k):
    pearson_sorted = sorted((i for i in range(USERS) if i in rated_users[item]), key=lambda x: pearson_matrix[user][x])[-k:]
    return pearson_sorted

In [9]:
# Implement r-hat as in equation 2.4

def r_hat(user, item, K):
    K_closest_users = k_closest(user, item, K)
    # Come up with something to do if the k_closest is empty
    if len(K_closest_users) == 0:
        raise ZeroDivisionError
    top = sum(pearson_matrix[user][user2] * (u1_train_data[user2][item] - averages[user2]) for user2 in K_closest_users)
    bot = sum(abs(pearson_matrix[user][user2]) for user2 in K_closest_users)

    r = averages[user] + top/bot
    if r < 0:
        return 0
    elif r > 5:
        return 5
    else:
        return r

In [10]:
# Here we read in the ratings in u1.test
u1_test_data = np.zeros((USERS,ITEMS))
read_data(".\\ml-100k\\u1.test", u1_test_data)

In [11]:
# RMSE as found in equation 7.5
# Calculated using the average ratings of each user as a baseline

squared_error_avg = 0
count = 0

for user in range(USERS):
    for item in range(ITEMS):
        if u1_train_data[user][item] == 0:
            continue
        squared_error_avg += (averages[user] - u1_test_data[user][item]) ** 2
        count += 1

RMSE_avg = math.sqrt(squared_error_avg / count)
print(RMSE_avg)

3.5566819767721967


In [12]:
from itertools import combinations

def user_kendall_coef(user):
    credit = 0
    test_items = {k for k in range(ITEMS) if u1_test_data[user][k] > 0}

    if len(test_items) <= 1:
        raise ZeroDivisionError

    for item1, item2 in combinations(test_items, 2):
        val = (u1_test_data[user][item1] - u1_test_data[user][item2]) * (u1_pred[user][item1] - u1_pred[user][item2])
        if val > 0:
            credit += 1
        elif val < 0:
            credit -= 1
    
    return credit / (len(test_items) * (len(test_items) - 1) / 2)

In [13]:
# Iterate through a range of neighborhood sizes
for K in range(2,21):
    # Predict ratings for each user-item pair in the test data.
    testing_size = 0
    error_count = 0
    u1_pred = np.zeros((USERS,ITEMS))
    for user in range(USERS):
        for item in range(ITEMS):
            if u1_test_data[user][item] == 0:
                continue
            try:
                pred_val = r_hat(user, item, K)
                u1_pred[user][item] = pred_val
                testing_size += 1
            except ZeroDivisionError:
                continue
            
    count = 0
    squared_error = 0

    # calculate the RMSE for each neighborhood size
    for user in range(USERS):
        for item in range(ITEMS):
            if u1_pred[user][item] == 0:
                continue
            squared_error += (u1_pred[user][item] - u1_test_data[user][item]) ** 2
            count += 1
    RMSE = math.sqrt(squared_error / count)

    # calculate the average Kendall rank correlation coefficient for each neighborhood size
    count = 0
    total_ken = 0
    for user in range(USERS):
        try:
            total_ken += user_kendall_coef(user)
            count += 1
        except ZeroDivisionError:
            continue

    print(K, RMSE, total_ken / count)

2 1.135317748187173 0.14915125223746123
3 1.0774527032409333 0.1722115588470126
4 1.0415041854560423 0.19237317078129587
5 1.0212206141077282 0.20129084569458713
6 1.0066203436931822 0.21073500614459229
7 0.9980322496770825 0.21890897376472332
8 0.9911461830144686 0.21925571038164918
9 0.9869460418144786 0.22635889273919402
10 0.9827357832321977 0.2261315872402635
11 0.9796143637402012 0.2272606610514393
12 0.9768292341119547 0.22709403090628605
13 0.9740905525995786 0.22933660745408352
14 0.9724874663265057 0.2305910698876082
15 0.970593944100322 0.2371814474145983
16 0.9685490967601817 0.2384216194078346
17 0.9676524465805032 0.24051305383545296
18 0.9666947287195105 0.24279247944553908
19 0.9656160746038881 0.2481937308628679
20 0.9646901658999121 0.2506024992788578
