In [1]:
import pandas as pd
import pickle
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import os
from sklearn.utils import shuffle
from datetime import datetime
from sortedcontainers import SortedList

## Load Data

In [2]:
# Read the ratings CSV from a path relative to this notebook's directory.
# Later cells use its userId, movieId, rating and timestamp columns.
df = pd.read_csv('../../data/movielens/rating.csv')

## Preprocessing Data

User IDs are sequential from 1 to 138493. We will re-index them to be 0-based.

In [3]:
# Shift every user id down by one so numbering starts at 0.
# NOTE: not idempotent - running this cell twice shifts the ids twice.
df['userId'] = df['userId'] - 1

Movie IDs range from 1 to 131262, but not all IDs are used. We will create a new 0-based index for movies.

In [4]:
# Build a dense 0-based index for movies: every distinct raw movieId gets the
# next free integer, in the iteration order of the id set.
unique_movie_ids = set(df.movieId.values)
movie2idx = {movie_id: idx for idx, movie_id in enumerate(unique_movie_ids)}

# Add the dense index to the data frame.
# Series.map does the dictionary lookup column-wise; the previous row-wise
# df.apply(..., axis=1) built a Series object per row, which is extremely slow
# on a table of this size.
df['movie_idx'] = df.movieId.map(movie2idx)

In [5]:
# Drop the timestamp column; nothing below uses it.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
df = df.drop(columns=['timestamp'], errors='ignore')

## Taking a Sample

We're going to work with the 5K users who have rated the most movies and the 2K movies that have received the most ratings.

In [6]:
# (rows, columns) of the ratings table before any sampling is applied
df.shape

(20000263, 4)

In [7]:
# How many of the most frequent users / movies to keep
n = 5000
m = 2000

# Sizes of the full id spaces (ids are 0-based, hence max + 1)
N = 1 + df.userId.max()
M = 1 + df.movie_idx.max()

# Tally how many ratings each user gave and each movie received
user_ids_count = Counter(df.userId)
movie_ids_count = Counter(df.movie_idx)

# Ids of the n most frequent users and m most frequent movies,
# ordered from most to least frequent
user_ids = [uid for uid, _ in user_ids_count.most_common(n)]
movie_ids = [mid for mid, _ in movie_ids_count.most_common(m)]

# Keep a rating only when both its user and its movie were selected;
# copy so later column assignments do not touch the full table
in_top_users = df.userId.isin(user_ids)
in_top_movies = df.movie_idx.isin(movie_ids)
keep_mask = in_top_users & in_top_movies
df_small = df[keep_mask].copy()

In [8]:
# The kept user and movie ids are no longer contiguous, so renumber them
# 0..n-1 and 0..m-1 in order of decreasing rating count.
new_user_id_map = {old: new for new, old in enumerate(user_ids)}
i = len(user_ids)
print("i:", i)

new_movie_id_map = {old: new for new, old in enumerate(movie_ids)}
j = len(movie_ids)
print("j:", j)

# Series.map performs the lookup column-wise; the previous row-wise
# df_small.apply(..., axis=1) did the same work one Python-level row at a time.
# NOTE: not idempotent - run this cell once per freshly built df_small.
df_small['userId'] = df_small['userId'].map(new_user_id_map)
df_small['movie_idx'] = df_small['movie_idx'].map(new_movie_id_map)

print("max user id:", df_small.userId.max())
print("max movie id:", df_small.movie_idx.max())
print("small dataframe size:", len(df_small))

i: 5000
j: 2000
max user id: 4999
max movie id: 1999
small dataframe size: 3399948


## Create Train/Test Split

In [10]:
# Split the *sampled* ratings (df_small) 80/20 into train and test.
# The previous version shuffled `df`, which on a fresh Restart & Run All is
# still the full ratings table - the sample built above was never used.
N = df_small.userId.max() + 1 # number of users
M = df_small.movie_idx.max() + 1 # number of movies

# Fixed seed so the split (and every metric below) is reproducible.
# `df` is rebound to the shuffled sample, as the original cell rebound it.
df = shuffle(df_small, random_state=42)
cutoff = int(0.8*len(df))

df_train = df.iloc[:cutoff]
df_test = df.iloc[cutoff:]

In [None]:
# map users -> list of movies they rated
user2movie = {}

# map movies -> list of users who have rated them
movie2user = {}

# map (user, movie) pairs -> rating
usermovie2rating = {}


def update_user2movie_and_movie2user(row):
    """Record one training rating in the three lookup tables.

    `row` is any object exposing `userId`, `movie_idx` and `rating`
    attributes (a DataFrame row or an `itertuples` tuple). The ids are cast
    to int so the dictionary keys are plain integers.
    """
    i = int(row.userId)
    j = int(row.movie_idx)
    user2movie.setdefault(i, []).append(j)
    movie2user.setdefault(j, []).append(i)
    usermovie2rating[(i, j)] = row.rating


# Iterate with itertuples rather than df_train.apply(..., axis=1): apply builds
# a Series per row (much slower) and, being used only for its side effects,
# left a Series of None as the cell's output.
count = 0
for count, row in enumerate(df_train.itertuples(index=False), start=1):
    update_user2movie_and_movie2user(row)
    if count % 100000 == 0:
        print("processed: %.3f" % (float(count)/len(df_train)))

In [None]:
# map (user, movie) pairs -> rating, for the held-out test split
usermovie2rating_test = {}


def update_usermovie2rating_test(row):
    """Record one test rating, keyed by its (user, movie) pair.

    `row` is any object exposing `userId`, `movie_idx` and `rating`
    attributes. The ids are cast to int so the keys are plain integers.
    """
    i = int(row.userId)
    j = int(row.movie_idx)
    usermovie2rating_test[(i, j)] = row.rating


# itertuples avoids the per-row Series construction of apply(axis=1) and the
# Series of None that apply would leave behind as cell output.
count = 0
for count, row in enumerate(df_test.itertuples(index=False), start=1):
    update_usermovie2rating_test(row)
    if count % 100000 == 0:
        print("processed: %.3f" % (float(count)/len(df_test)))

## Computing Item Similarity Weights

In [13]:
# User ids start at 0, so the user count is the largest training user id + 1
N = 1 + np.max(list(user2movie))

# A movie can show up in the test split without appearing in training,
# so take the largest movie index seen in either split
m1 = np.max(list(movie2user))
m2 = np.max([movie for (_, movie) in usermovie2rating_test])

M = 1 + max(m1, m2)
print("N:", N, "M:", M)

N: 5000 M: 2000


In [None]:
# Item-item collaborative filtering: for every movie, find the K movies whose
# rating deviations correlate most strongly with its own.
# An all-pairs comparison is O(M^2 * N) work; in the "real world" you'd want to
# parallelize this.
# note: w_ij is symmetric, so strictly only half of the pairs need computing.

# number of neighbors we'd like to keep per movie
K = 20

# two movies are only compared when MORE than this many users rated both
limit = 5

# neighbors[i]: SortedList of (-weight, j) for movie i, strongest first
neighbors = []

# averages[i]: mean training rating of movie i, for later use
averages = []

# deviations[i]: {user: rating - averages[i]} for movie i, for later use
deviations = []

# sigmas[i]: Euclidean norm of movie i's deviations
sigmas = []

# raters[i]: set of users who rated movie i in the training split
raters = []

# Fallback average for movies that only appear in the test split and so have
# no training ratings of their own.
global_avg = np.mean(list(usermovie2rating.values()))

# Pass 1: per-movie statistics, computed once per movie. Previously the
# statistics of movie j were rebuilt from scratch for every (i, j) pair.
for i in range(M):
    # .get: a movie index below M may be absent from the training split
    users_i = movie2user.get(i, [])

    # calculate avg and deviation
    ratings_i = { user:usermovie2rating[(user, i)] for user in users_i }
    avg_i = np.mean(list(ratings_i.values())) if ratings_i else global_avg
    dev_i = { user:(rating - avg_i) for user, rating in ratings_i.items() }
    dev_i_values = np.array(list(dev_i.values()), dtype=float)

    # save these for later use
    sigmas.append(np.sqrt(dev_i_values.dot(dev_i_values)))
    averages.append(avg_i)
    deviations.append(dev_i)
    raters.append(set(users_i))

# Pass 2: correlate every ordered pair of movies and keep the K best per movie
for i in range(M):
    dev_i = deviations[i]
    sigma_i = sigmas[i]
    users_i_set = raters[i]

    sl = SortedList()
    # A zero sigma (no ratings, or all ratings identical) makes the
    # correlation 0/0 = NaN, which would corrupt the sorted order; such
    # movies get no neighbors and are never used as one.
    if sigma_i > 0:
        for j in range(M):
            if i == j or sigmas[j] == 0:
                continue
            common_users = users_i_set & raters[j] # intersection
            if len(common_users) > limit:
                dev_j = deviations[j]

                # calculate correlation coefficient
                numerator = sum(dev_i[user]*dev_j[user] for user in common_users)
                w_ij = numerator / (sigma_i * sigmas[j])

                # insert into sorted list and truncate
                # negate weight, because list is sorted ascending
                # maximum value (1) is "closest"
                sl.add((-w_ij, j))
                if len(sl) > K:
                    del sl[-1]

    # store the neighbors
    neighbors.append(sl)

    # progress indicator (every 100 movies rather than every single one)
    if i % 100 == 0:
        print(i)

## Make Recommendations

In [17]:
def predict(i, u):
    """Predict user ``u``'s rating of movie ``i`` from the movie's neighbors.

    The estimate is movie ``i``'s average rating plus the weighted mean of
    ``u``'s rating deviations on the neighboring movies, where each weight is
    the stored similarity. Neighbors that ``u`` has not rated are skipped; if
    none contribute, the movie average alone is used. The result is clamped
    to the range [0.5, 5].
    """
    weighted_sum = 0
    weight_total = 0
    for neg_w, j in neighbors[i]:
        # a single lookup: None means user u has no deviation for movie j
        dev = deviations[j].get(u)
        if dev is None:
            continue
        # weights are stored negated, so flip the sign back
        weighted_sum += -neg_w * dev
        weight_total += abs(neg_w)

    if weight_total == 0:
        estimate = averages[i]
    else:
        estimate = weighted_sum / weight_total + averages[i]

    # ratings live between 0.5 and 5
    return max(0.5, min(5, estimate))

In [18]:
# Predict every rating in the training split. Keys and values of a dict
# iterate in the same order, so predictions and targets stay aligned.
train_predictions = [predict(movie, user) for (user, movie) in usermovie2rating]
train_targets = list(usermovie2rating.values())

# Same again for the held-out test split
test_predictions = [predict(movie, user) for (user, movie) in usermovie2rating_test]
test_targets = list(usermovie2rating_test.values())

In [19]:
# mean squared error between predictions and targets
def mse(p, t):
    """Return the mean of the squared element-wise differences of p and t."""
    residuals = np.array(p) - np.array(t)
    return np.mean(residuals ** 2)

# Report the mean squared error on each split
for label, preds, targets in (
    ('train mse:', train_predictions, train_targets),
    ('test mse:', test_predictions, test_targets),
):
    print(label, mse(preds, targets))

train mse: 0.5212058103034772
test mse: 0.5540593313259357
