In [159]:
from numpy import linalg as LA
import numpy as np
import io
import random

In [33]:
# build movie dictionary with line no as numpy movie id ,its actual movie id as the key.
def build_movies_dict(movies_file):
    i = 0
    movie_id_dict = {}
    with io.open(movies_file, 'r', encoding="utf8") as f:
        for line in f:
            if i == 0:
                i = i+1
            else:
                movieId = line.split(',')[0]
                movie_id_dict[int(movieId)] = i-1
                i = i+1
    return movie_id_dict

In [None]:
def read_data_grid_search(input_file, movies_dict):
    # creating a dictionary, key: (user_id, movie_id), value: rating
    X = {}
    
    # because we don't have a header row now
    i = 1

    with open(input_file,'r') as f:
        for line in f:

            # to escape the header row
            if i == 0:
                i += 1
            else:
                user, movie_id, rating, timestamp = line.split(',')
                m_id = movies_dict[int(movie_id)]

                X[(int(user)-1, m_id)] = float(rating)
                
                u_id = int(user)-1
                
                i += 1
    return X

In [155]:
def matrix_factor_sgd(X, rank, lambd):
  
    V = np.random.rand(u, rank)
    W = np.random.rand(m, rank)
    
    for epoch in range(epochs):
        print(epoch)
        for key, value in X.items():
            i = key[0]
            j = key[1]
            rating = value
            eij = rating - np.dot(V[i], W[j])

            V_update = V[i] + alpha * (2.0 * eij * W[j] - (lambd*2.0 * V[i]))
            W_update = W[j] + alpha * (2.0 * eij * V[i] - (lambd*2.0 * W[j]))

            V[i] = V_update
            W[j] = W_update

    return V, W

In [115]:
def calc_rmse(X_test, V, W):
    e = 0
    for key, value in X_test.items():
        i = key[0]
        j = key[1]
        rating = value
        e += np.power(rating - np.dot(V[i],W[j]), 2)  # square error

    e = e/len(X_test) # mean
    rmse = np.sqrt(e) # root
    return rmse

In [111]:
def calc_mrr(X_test, V, W):
    mrr = 0

    # dict of only >= 3 ratings
    user_high_rated_movies_dict = {} # key: user_id, value: list of movie_ids
    for key, value in X_test.items():
        i = key[0]
        j = key[1]
        if value >= 3:
            if i in user_high_rated_movies_dict:
                user_high_rated_movies_dict[i].append(j)
            else:
                user_high_rated_movies_dict[i] = []
                user_high_rated_movies_dict[i].append(j)

    # creating a ranking dict, key: user index, value: (j(movie_index), predicted_rating)
    predicted_ratings = {}
    for key, value in X_test.items():
        i = key[0]
        j = key[1]
        predicted_rating = np.dot(V[i], W[j])

        if i in predicted_ratings:
            predicted_ratings[i].append((j, predicted_rating))
        else:
            predicted_ratings[i] = []
            predicted_ratings[i].append((j, predicted_rating))

    no_of_users = len(predicted_ratings)

    # going over each user and creating a ranking
    for key in predicted_ratings:
        list_of_ratings = predicted_ratings[key]
        sorted_list_of_ratings = sorted(list_of_ratings, key=lambda tup: tup[1], reverse=True)
        predicted_ratings[key] = sorted_list_of_ratings
        
        if key in user_high_rated_movies_dict:
            list_high_rated_movies_per_user = user_high_rated_movies_dict[key]
            count = len(list_high_rated_movies_per_user)
            sum = 0
            for movie in list_high_rated_movies_per_user:
                rank = sorted_list_of_ratings.index((movie, np.dot(V[key],W[movie]))) + 1  # index will return 0
                sum += 1/rank

            mrr += sum/count

    # mrr averaged over number of users
    return mrr/no_of_users

In [157]:
# below are the parameters used for grid-search
rank = [1, 8, 16, 32, 64, 128]
lambd = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
epochs = 2
alpha = 0.02
u = 138493
m = 27278
ratings_file = "ml-20m/ratings.csv"
train_ratings_file_1 = "train_data.csv"
test_ratings_file_1 = "test_data.csv"

train_ratings_file_2 = "train_data-2.csv"
test_ratings_file_2 = "test_data-2.csv"

train_ratings_file_3 = "train_data-3.csv"
test_ratings_file_3 = "test_data-3.csv"

movies_mapping_file = "ml-20m/movies.csv"

In [152]:
movies_dict = build_movies_dict(movies_mapping_file)
train_data_dict_1 = read_data_grid_search(train_ratings_file_1, movies_dict)
test_data_dict_1 = read_data_grid_search(test_ratings_file_1, movies_dict)

print(len(train_data_dict_1))
print(len(test_data_dict_1))

train_data_dict_2 = read_data_grid_search(train_ratings_file_2, movies_dict)
test_data_dict_2 = read_data_grid_search(test_ratings_file_2, movies_dict)

print(len(train_data_dict_2))
print(len(test_data_dict_2))


train_data_dict_3 = read_data_grid_search(train_ratings_file_3, movies_dict)
test_data_dict_3 = read_data_grid_search(test_ratings_file_3, movies_dict)

print(len(train_data_dict_3))
print(len(test_data_dict_3))

13287329
6712561
13380393
6619497
13332058
6667832


In [158]:
for r in rank:
    for l in lambd:
        print("rank: " + str(r))
        print("lambda: " + str(l))
        
        estimated_V_1, estimated_W_1 = matrix_factor_sgd(train_data_dict_1, r, l)
        print("rmse_1:" + str(calc_rmse(test_data_dict_1, estimated_V_1, estimated_W_1)))
        print("mrr_1:" + str(calc_mrr(test_data_dict_1, estimated_V_1, estimated_W_1)))
        
        estimated_V_2, estimated_W_2 = matrix_factor_sgd(train_data_dict_2, r, l)
        print("rmse_2:" + str(calc_rmse(test_data_dict_2, estimated_V_2, estimated_W_2)))
        print("mrr_2:" + str(calc_mrr(test_data_dict_2, estimated_V_2, estimated_W_2)))
        
        estimated_V_3, estimated_W_3 = matrix_factor_sgd(train_data_dict_3, r, l)
        print("rmse_fold3:" + str(calc_rmse(test_data_dict_3, estimated_V_3, estimated_W_3)))
        print("mrr_fold3:" + str(calc_mrr(test_data_dict_3, estimated_V_3, estimated_W_3)))
        
    print("------------------------------------------------------------------")

rank: 1
lambda: 0.01
0
1
rmse_1:1.01943421864
mrr_1:0.18565102186173052
0
1
rmse_2:1.59832940579
mrr_2:0.19322715862511108
0
1
rmse_fold3:1.02988178982
mrr_fold3:0.18980753132247966
rank: 1
lambda: 0.02
0
1
rmse_1:1.01217439257
mrr_1:0.1856364066641733
0
1
rmse_2:1.51362477696
mrr_2:0.19320435716172582
0
1
rmse_fold3:1.00981504687
mrr_fold3:0.1896894459253265
rank: 1
lambda: 0.05
0
1
rmse_1:1.00527347757
mrr_1:0.18558185742947544
0
1
rmse_2:1.36429555074
mrr_2:0.19313984136736373
0
1
rmse_fold3:0.980800307627
mrr_fold3:0.18954204315459933
rank: 1
lambda: 0.1
0
1
rmse_1:1.00650052466
mrr_1:0.18554760276765322
0
1
rmse_2:1.21106094906
mrr_2:0.19315052250627854
0
1
rmse_fold3:0.964307244542
mrr_fold3:0.18943832302966473
rank: 1
lambda: 0.2
0
1
rmse_1:1.02268223923
mrr_1:0.18551531268935345
0
1
rmse_2:1.1127241209
mrr_2:0.1933489389601274
0
1
rmse_fold3:0.960876593358
mrr_fold3:0.1893558625647149
rank: 1
lambda: 0.5
0
1
rmse_1:1.12729728951
mrr_1:0.1855693896080119
0
1
rmse_2:1.15867956363

  from ipykernel import kernelapp as app
  
  
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app


1
rmse_1:nan


ValueError: (4034, nan) is not in list