In [2]:
from numpy import linalg as LA
import numpy as np
import io
import random

In [1]:
# ---- SGD training settings ----
epochs = 2      # number of full passes made over the training ratings
alpha = 0.02    # step size applied to each SGD update

# ---- Factor matrix sizes ----
u = 138493      # number of rows allocated in the user factor matrix
m = 27278       # number of rows allocated in the movie factor matrix

# ---- Input files ----
ratings_file = "ml-20m/ratings.csv"
movies_mapping_file = "ml-20m/movies.csv"

# Three independent random train/test splits of the ratings; each fold is a
# (train file, test file) pair.
train_ratings_file_1, test_ratings_file_1 = "train_data.csv", "test_data.csv"
train_ratings_file_2, test_ratings_file_2 = "train_data-2.csv", "test_data-2.csv"
train_ratings_file_3, test_ratings_file_3 = "train_data-3.csv", "test_data-3.csv"

# ---- Model settings ----
# This regularisation weight / factor count pair gave the lowest average RMSE.
lambd = 0.05
rank = 16

In [3]:
# Map every actual movie id to a dense, zero-based index derived from the
# position of its row in the movies file.
def build_movies_dict(movies_file):
    """Build a lookup from actual movie id to zero-based row index.

    The first line of ``movies_file`` is treated as a header and ignored.
    For every following line, the text before the first comma is parsed as
    the integer movie id and mapped to the line's position among the data
    rows (first data row -> 0, second -> 1, ...).

    Parameters
    ----------
    movies_file : str
        Path to a UTF-8 encoded, comma separated movies file.

    Returns
    -------
    dict
        ``{movie_id (int): row_index (int)}``. If an id occurs more than
        once, the index of its last occurrence is kept.

    Raises
    ------
    ValueError
        If the first field of a data row cannot be parsed as an integer.
    """
    with io.open(movies_file, 'r', encoding="utf8") as handle:
        return {
            int(row.split(',')[0]): line_no - 1
            for line_no, row in enumerate(handle)
            if line_no > 0  # line 0 is the header row
        }

In [4]:
def read_data(input_file, movies_dict, has_header=False):
    """Load a ratings file into a sparse ``{(user_index, movie_index): rating}`` dict.

    Each data line must have exactly four comma separated fields:
    ``user,movie_id,rating,timestamp``. The user id is shifted to be
    zero-based (``int(user) - 1``), the movie id is translated through
    ``movies_dict``, and the timestamp is ignored.

    Parameters
    ----------
    input_file : str
        Path to the ratings file.
    movies_dict : dict
        Mapping from actual movie id (int) to movie index, e.g. the result
        of ``build_movies_dict``.
    has_header : bool, optional
        When True the first line of the file is skipped. Defaults to False,
        which reads every line as data.

    Returns
    -------
    dict
        ``{(user_index, movie_index): rating (float)}``. If a (user, movie)
        pair occurs more than once, the last rating is kept.

    Raises
    ------
    ValueError
        If a non-blank data line does not have exactly four fields, or a
        field cannot be parsed as a number.
    KeyError
        If a movie id is not present in ``movies_dict``.
    """
    X = {}

    with open(input_file, 'r') as f:
        for line_no, line in enumerate(f):
            if has_header and line_no == 0:
                continue

            # Tolerate blank lines (typically a trailing newline at end of
            # file) instead of failing on the four-way unpack below.
            line = line.strip()
            if not line:
                continue

            user, movie_id, rating, _timestamp = line.split(',')
            X[(int(user) - 1, movies_dict[int(movie_id)])] = float(rating)

    return X

In [5]:
# NOTE: this cell only runs when the movies file and the six train/test files
# named in the configuration cell exist at those paths relative to this notebook.
# They are large, so they were left out of the submission; they can be
# emailed or submitted separately if required.

movies_dict = build_movies_dict(movies_mapping_file)

# Every fold is loaded as a (train, test) pair, train file first.
train_data_dict_1, test_data_dict_1 = [
    read_data(path, movies_dict)
    for path in (train_ratings_file_1, test_ratings_file_1)
]
print("Fold1 loaded")

train_data_dict_2, test_data_dict_2 = [
    read_data(path, movies_dict)
    for path in (train_ratings_file_2, test_ratings_file_2)
]
print("Fold2 loaded")

train_data_dict_3, test_data_dict_3 = [
    read_data(path, movies_dict)
    for path in (train_ratings_file_3, test_ratings_file_3)
]
print("Fold3 loaded")

print("Data loading complete into 3 sets of train-test data")

Fold1 loaded
Fold2 loaded
Fold3 loaded
Data loading complete into 3 sets of train-test data


In [11]:
def matrix_factor_sgd(X, n_users=None, n_items=None, n_factors=None,
                      n_epochs=None, learning_rate=None, reg=None, seed=None):
    """Factorise a sparse rating dict into user and movie factors with SGD.

    For every observed rating the prediction is ``dot(V[i], W[j])`` and both
    factor rows are moved along the gradient of the squared error with an L2
    penalty. The two rows are updated simultaneously: both new rows are
    computed from the old values before either is stored.

    Parameters
    ----------
    X : dict
        ``{(user_index, movie_index): rating}`` training data.
    n_users, n_items : int, optional
        Number of rows of ``V`` and ``W``. Default to the module-level
        ``u`` and ``m``.
    n_factors : int, optional
        Number of latent factors. Defaults to the module-level ``rank``.
    n_epochs : int, optional
        Number of passes over ``X``. Defaults to the module-level ``epochs``.
    learning_rate : float, optional
        SGD step size. Defaults to the module-level ``alpha``.
    reg : float, optional
        L2 regularisation weight. Defaults to the module-level ``lambd``.
    seed : int, optional
        When given, the factors are initialised from a dedicated
        ``np.random.RandomState(seed)`` so the result is reproducible.
        When None, the global numpy random state is used.

    Returns
    -------
    (V, W) : tuple of numpy.ndarray
        ``V`` has shape ``(n_users, n_factors)`` and ``W`` has shape
        ``(n_items, n_factors)``.
    """
    # Anything not passed explicitly comes from the notebook configuration.
    # The globals are only read when needed, so fully-specified calls do not
    # depend on them.
    n_users = u if n_users is None else n_users
    n_items = m if n_items is None else n_items
    n_factors = rank if n_factors is None else n_factors
    n_epochs = epochs if n_epochs is None else n_epochs
    learning_rate = alpha if learning_rate is None else learning_rate
    reg = lambd if reg is None else reg

    # Uniform [0, 1) initialisation; V is drawn before W.
    rand = np.random.rand if seed is None else np.random.RandomState(seed).rand
    V = rand(n_users, n_factors)
    W = rand(n_items, n_factors)

    for _ in range(n_epochs):
        for key, rating in X.items():
            i = key[0]
            j = key[1]
            eij = rating - np.dot(V[i], W[j])

            # Compute both updates from the current rows before storing
            # either, so W's update does not see the already-updated V[i].
            V_update = V[i] + learning_rate * (2.0 * eij * W[j] - (reg * 2.0 * V[i]))
            W_update = W[j] + learning_rate * (2.0 * eij * V[i] - (reg * 2.0 * W[j]))

            V[i] = V_update
            W[j] = W_update

    return V, W

In [7]:
def calc_rmse(X_test, V, W):
    """Root-mean-square error of the factor model on a set of ratings.

    Parameters
    ----------
    X_test : dict
        ``{(user_index, movie_index): rating}`` held-out ratings.
    V, W : array-like
        User and movie factors; the prediction for ``(i, j)`` is
        ``dot(V[i], W[j])``.

    Returns
    -------
    float
        ``sqrt(mean((rating - prediction) ** 2))`` over all entries.

    Raises
    ------
    ZeroDivisionError
        If ``X_test`` is empty.
    """
    squared_error_total = 0
    for key, actual in X_test.items():
        user_idx, movie_idx = key[0], key[1]
        residual = actual - np.dot(V[user_idx], W[movie_idx])
        squared_error_total += np.power(residual, 2)

    mean_squared_error = squared_error_total / len(X_test)
    return np.sqrt(mean_squared_error)

In [8]:
def calc_mrr(X_test, V, W):
    """Mean reciprocal rank of well-rated test movies under the factor model.

    For every user, the movies that user has in ``X_test`` are ranked by
    predicted rating ``dot(V[i], W[j])``, highest first (rank 1). Movies whose
    actual rating is >= 3 are treated as relevant. A user's score is the mean
    of ``1 / rank`` over that user's relevant movies. The returned value is
    the sum of the user scores divided by the number of users that appear in
    ``X_test``; users without any relevant movie add nothing to the sum but
    are still counted in that divisor.

    Parameters
    ----------
    X_test : dict
        ``{(user_index, movie_index): rating}`` held-out ratings.
    V, W : array-like
        User and movie factors.

    Returns
    -------
    float
        The averaged score, or 0.0 when ``X_test`` is empty.
    """
    # Single pass: group predictions and relevant movies per user, keeping
    # the iteration order of X_test.
    scored_by_user = {}    # user index -> [(movie index, predicted rating), ...]
    relevant_by_user = {}  # user index -> [movie index, ...] with rating >= 3
    for key, actual in X_test.items():
        i, j = key[0], key[1]
        scored_by_user.setdefault(i, []).append((j, np.dot(V[i], W[j])))
        if actual >= 3:
            relevant_by_user.setdefault(i, []).append(j)

    no_of_users = len(scored_by_user)
    if no_of_users == 0:
        # Nothing to rank; avoid dividing by zero.
        return 0.0

    mrr = 0
    for user, scored in scored_by_user.items():
        relevant = relevant_by_user.get(user)
        if not relevant:
            continue

        # sorted() is stable, so movies with equal predictions keep their
        # X_test order.
        ranking = sorted(scored, key=lambda tup: tup[1], reverse=True)

        # One-based rank of every movie, looked up by movie index instead of
        # searching the list for a (movie, recomputed float) tuple.
        position = {movie: pos for pos, (movie, _) in enumerate(ranking, start=1)}

        reciprocal_total = 0
        for movie in relevant:
            reciprocal_total += 1.0 / position[movie]

        mrr += reciprocal_total / len(relevant)

    # averaged over every user present in the test data
    return mrr / no_of_users

In [13]:
def _evaluate_fold(train_dict, test_dict):
    """Train on one fold and score it; returns (V, W, rmse, mrr)."""
    V, W = matrix_factor_sgd(train_dict)
    fold_rmse = calc_rmse(test_dict, V, W)
    fold_mrr = calc_mrr(test_dict, V, W)
    return V, W, fold_rmse, fold_mrr


estimated_V_1, estimated_W_1, rmse1, mrr1 = _evaluate_fold(train_data_dict_1, test_data_dict_1)
print("Calculation done for fold 1")

estimated_V_2, estimated_W_2, rmse2, mrr2 = _evaluate_fold(train_data_dict_2, test_data_dict_2)
print("Calculation done for fold 2")

estimated_V_3, estimated_W_3, rmse3, mrr3 = _evaluate_fold(train_data_dict_3, test_data_dict_3)
print("Calculation done for fold 3")

# Report the metrics averaged over the three folds: RMSE first, then MRR.
print("Overall results:")
print(str((rmse1 + rmse2 + rmse3) / 3))
print(str((mrr1 + mrr2 + mrr3) / 3))

Calculation done for fold 1
Calculation done for fold 2
Calculation done for fold 3
Overall results:
0.948972089747
0.18880896755651078
