# Latent Factor Model with Regularization

### This model is copied from the HW1 solution posted by Kyra and extended to include regularization of the beta parameters

In [30]:
# Import required packages
import csv
import numpy as np
from sklearn.metrics import mean_squared_error as mse
from tqdm import tqdm_notebook as tqdm

In [31]:
from collections import defaultdict

def readFile(f):
  """Lazily yield one parsed record per non-blank line of the text file ``f``.

  Parameters:
    f: path of a text file in which every line is a Python literal
       expression (the notebook uses one dict per line).

  Yields:
    The object obtained by evaluating each line.

  The file is opened in a ``with`` block so the handle is released as soon
  as the generator is exhausted or closed. Blank lines are skipped instead
  of crashing ``eval`` with a SyntaxError.
  """
  with open(f) as handle:
    for l in handle:
      if not l.strip():
        continue
      # NOTE(security): eval executes arbitrary Python found in the file.
      # Only use this on trusted data; for pure literals consider
      # ast.literal_eval as a safer drop-in.
      yield eval(l)

In [57]:
# Switch between the final run and the validation run.
# run_final = True  -> train on the full dataset (train_percent = 1); the
#                      prediction cells only write their Kaggle files in this mode.
# run_final = False -> train on 80% of the data (train_percent = 0.8) so the
#                      testing mse/rmse are computed on the remaining rows.
run_final = True

In [58]:
# Fraction of the rows used for training (everything when producing the final model).
train_percent = 1 if run_final else 0.8

# The arrays below are preallocated with this many rows, so the input file
# must not contain more than total_sample ratings.
total_sample = 200000
train_sample = int(train_percent * total_sample)

allRatings = np.zeros(total_sample)   # store all ratings, in file order
train_user_diction = {}   # userID -> list of [itemID, rating] (training rows)
train_item_diction = {}   # itemID -> list of [userID, rating] (training rows)
test_user_diction = {}    # userID -> list of [itemID, rating] (held-out rows)
test_item_diction = {}    # itemID -> list of [userID, rating] (held-out rows)
data = np.zeros((total_sample, 3), dtype=object) # (userID, itemID, rating) per row

index = 0
for l in readFile("../data/train.json"):
    # Parse each field once instead of re-reading the record in every branch.
    user, item, rating = l['reviewerID'], l['itemID'], int(l['rating'])

    allRatings[index] = rating
    data[index] = user, item, rating

    # The first train_sample rows are training data, the remainder is held out.
    if index < train_sample:
        train_user_diction.setdefault(user, []).append([item, rating])
        train_item_diction.setdefault(item, []).append([user, rating])
    else:
        # Fix: the per-user held-out entries previously stored the reviewerID
        # in the first slot; they now store the itemID like the training dict.
        test_user_diction.setdefault(user, []).append([item, rating])
        test_item_diction.setdefault(item, []).append([user, rating])
    index += 1

In [59]:
# model 1: alpha and beta_u
# prediction(user) = alpha + beta_u[user]
# (the printed labels and the *_2 variable names refer to this as "model 2")

cutoff_user = 10   # users with fewer ratings than this get their bias shrunk
# initialize alpha and beta
train_user_list = list(train_user_diction.keys())
# The rating vectors never change while fitting, so build them once instead
# of re-converting every user's list on every pass.
train_user_ratings = {
    user: np.array([rating for _, rating in train_user_diction[user]], dtype=int)
    for user in train_user_list
}
alpha_2 = 0
beta_u = {}
delta = 100  # difference between iterations
while delta >= 10**-5: # set the error tolerance
    # we first update alpha: mean residual after removing the current user biases
    # (a user without a bias yet, i.e. on the first pass, contributes 0)
    local_alpha = 0
    for user in train_user_list:
        local_alpha += np.sum(train_user_ratings[user] - beta_u.get(user, 0))
    local_alpha = local_alpha/train_sample

    # we then update beta: mean residual per user, scaled by
    # min(1, n/cutoff_user), which equals dividing the residual sum by
    # max(n, cutoff_user) and so shrinks the bias of rarely-seen users.
    for user in train_user_list:
        ratings = train_user_ratings[user]
        beta_u[user] = min(1,len(ratings)/cutoff_user)*np.sum(ratings - local_alpha)/len(ratings)

    delta = abs(local_alpha - alpha_2)  # calculate the difference of alphas between epoch
    alpha_2 = local_alpha # update global alpha


##### calculate training mse and testing mse #######################################################
test_user_list = list(test_user_diction.keys())
train_label = np.zeros(train_sample)
train_prediction = np.zeros(train_sample)
test_label = np.zeros(total_sample-train_sample)
test_prediction = np.zeros(total_sample - train_sample)

# Labels and predictions are laid out user by user; each user fills one slice.
index = 0
for user in train_user_list:
    ratings = train_user_ratings[user]
    train_label[index: index+len(ratings)] = ratings
    train_prediction[index: index+len(ratings)] = alpha_2 + beta_u[user]
    index += len(ratings)

index = 0
for user in test_user_list:
    user_rating = np.array([rating for _, rating in test_user_diction[user]], dtype=int)
    test_label[index: index+len(user_rating)] = user_rating
    # beta_u holds exactly the training users; a dict lookup replaces the
    # linear scan of train_user_list that ran for every test user.
    if user in beta_u:
        test_prediction[index: index+len(user_rating)] = alpha_2 + beta_u[user]
    else:
        # unseen user: fall back to the global average
        test_prediction[index: index+len(user_rating)] = alpha_2
    index += len(user_rating)

train_error_2 = mse(train_label, train_prediction)
print('model 2 train mse', train_error_2)
print('model 2 train rmse', np.sqrt(train_error_2))
if not run_final:
    # the held-out arrays are empty when run_final is True, so skip them then
    test_error_2 = mse(test_label, test_prediction)
    print('model 2 testing mse', test_error_2)
    print('model 2 testing rmse', np.sqrt(test_error_2))

model 2 train mse 0.9266872608584007
model 2 train rmse 0.9626459685982177


In [35]:
# Write the alpha + beta_u predictions for every pair in pairs_Rating.txt.
# NOTE: beta_u and cutoff_user are rebound by the later alpha/beta_u/beta_i
# cell, so run this cell right after the alpha + beta_u model cell.
if run_final:
    # Context managers close both files even if a line fails to parse.
    with open("predictions_Rating_alpha_betau_reg{}.txt".format(cutoff_user), 'w') as predictions, \
         open("../data/pairs_Rating.txt") as pairs:
        for l in pairs:
            if l.startswith("reviewerID"):
                predictions.write(l) # header
                continue
            u,i = l.strip().split('-')
            # beta_u is keyed by the training users, so the dict lookup
            # replaces a linear scan of train_user_list on every line.
            if u in beta_u:
                predictions.write(u + '-' + i + ',' + str(alpha_2 + beta_u[u]) + '\n')
            else:
                # user never seen in training: predict the global average
                predictions.write(u + '-' + i + ',' + str(alpha_2) + '\n')

In [52]:
# model 2: alpha and beta_i
# prediction(item) = alpha + beta_i[item]
# (the printed labels and the *_3 variable names refer to this as "model 3")

cutoff_item = 15   # items with fewer ratings than this get their bias shrunk
# initialize alpha and beta
train_item_list = list(train_item_diction.keys())
# The rating vectors never change while fitting, so build them once instead
# of re-converting every item's list on every pass.
train_item_ratings = {
    item: np.array([rating for _, rating in train_item_diction[item]], dtype=int)
    for item in train_item_list
}
alpha_3 = 0
beta_i = {}
delta = 100
while delta >= 10**-5:
    # we first update alpha: mean residual after removing the current item biases
    # (an item without a bias yet, i.e. on the first pass, contributes 0)
    local_alpha = 0
    for item in train_item_list:
        local_alpha += np.sum(train_item_ratings[item] - beta_i.get(item, 0))
    local_alpha = local_alpha/train_sample

    # we then update beta: mean residual per item, scaled by
    # min(1, n/cutoff_item), which equals dividing the residual sum by
    # max(n, cutoff_item) and so shrinks the bias of rarely-rated items.
    for item in train_item_list:
        ratings = train_item_ratings[item]
        beta_i[item] = min(1,len(ratings)/cutoff_item)*np.sum(ratings - local_alpha)/len(ratings)
    delta = abs(local_alpha - alpha_3)
    alpha_3 = local_alpha

test_item_list = list(test_item_diction.keys())
train_label = np.zeros(train_sample)
train_prediction = np.zeros(train_sample)
test_label = np.zeros(total_sample - train_sample)
test_prediction = np.zeros(total_sample - train_sample)

# Labels and predictions are laid out item by item; each item fills one slice.
index = 0
for item in tqdm(train_item_list, total = len(train_item_list)):
    ratings = train_item_ratings[item]
    train_label[index: index+len(ratings)] = ratings
    train_prediction[index: index+len(ratings)] = alpha_3 + beta_i[item]
    index += len(ratings)

index = 0
for item in tqdm(test_item_list, total = len(test_item_list)):
    item_rating = np.array([rating for _, rating in test_item_diction[item]], dtype=int)
    test_label[index: index+len(item_rating)] = item_rating
    # beta_i holds exactly the training items; a dict lookup replaces the
    # linear scan of train_item_list that ran for every test item.
    if item in beta_i:
        test_prediction[index: index+len(item_rating)] = alpha_3 + beta_i[item]
    else:
        # unseen item: fall back to the global average
        test_prediction[index: index+len(item_rating)] = alpha_3
    index += len(item_rating)

train_error_3 = mse(train_label, train_prediction)
print('model 3 train mse', train_error_3)
print('model 3 train rmse', np.sqrt(train_error_3))
if not run_final:
    # the held-out arrays are empty when run_final is True, so skip them then
    test_error_3 = mse(test_label, test_prediction)
    print('model 3 testing mse', test_error_3)
    print('model 3 testing rmse', np.sqrt(test_error_3))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=19914.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


model 3 train mse 1.0709415550690493
model 3 train rmse 1.0348630610225922


In [54]:
# Write the alpha + beta_i predictions for every pair in pairs_Rating.txt.
# NOTE: beta_i and cutoff_item are rebound by the later alpha/beta_u/beta_i
# cell, so run this cell right after the alpha + beta_i model cell.
if run_final:
    # Context managers close both files even if a line fails to parse.
    with open("predictions_Rating_alpha_betai_reg{}.txt".format(cutoff_item), 'w') as predictions, \
         open("../data/pairs_Rating.txt") as pairs:
        for l in pairs:
            if l.startswith("reviewerID"):
                predictions.write(l) # header
                continue
            u,i = l.strip().split('-')
            # beta_i is keyed by the training items, so the dict lookup
            # replaces a linear scan of train_item_list on every line.
            if i in beta_i:
                predictions.write(u + '-' + i + ',' + str(alpha_3 + beta_i[i]) + '\n')
            else:
                # item never seen in training: predict the global average
                predictions.write(u + '-' + i + ',' + str(alpha_3) + '\n')

In [55]:
# model 3: alpha and beta_u and beta_i
# prediction(user, item) = alpha + beta_u[user] + beta_i[item]
# (the printed labels and the *_4 variable names refer to this as "model 4")
cutoff_user = 10
cutoff_item = 15
# initialize alpha and beta
alpha_4 = 0
beta_u = {}
beta_i = {}
delta = 100
epoch = 0
train_rows = data[:train_sample]   # (user, item, rating) rows used for fitting
while delta >= 10**-5:
    local_alpha = 0
    local_beta_u = {}
    local_beta_i = {}

    # 1) update alpha: mean residual after removing the current biases.
    #    A missing bias (first epoch) contributes 0.
    for user, item, rating in train_rows:
        local_alpha += (int(rating) - beta_u.get(user, 0) - beta_i.get(item, 0))/train_sample
    delta = abs(alpha_4 - local_alpha)
    alpha_4 = local_alpha

    # 2) update beta_u: per-user mean residual using the new alpha and the
    #    item biases of the previous epoch.
    for user, item, rating in train_rows:
        residual = int(rating) - alpha_4 - beta_i.get(item, 0)
        local_beta_u[user] = local_beta_u.get(user, 0) + residual/len(train_user_diction[user])

    # shrink the bias of users with fewer than cutoff_user ratings.
    # Iterating over local_beta_u (every training user) avoids depending on
    # train_user_list, which is defined in a different cell.
    for user in local_beta_u:
        local_beta_u[user] = min(1,len(train_user_diction[user])/cutoff_user)*local_beta_u[user]

    # Fix: measure the largest change over ALL users. The previous check only
    # compared the first user's bias, so the loop could stop while other
    # biases were still moving.
    if epoch != 0:
        beta_u_change = max((abs(local_beta_u[user] - beta_u[user]) for user in local_beta_u), default=0)
        delta = max(delta, beta_u_change)
    beta_u = local_beta_u

    # 3) update beta_i: per-item mean residual using the new alpha and the
    #    freshly updated user biases.
    for user, item, rating in train_rows:
        residual = int(rating) - alpha_4 - beta_u[user]
        local_beta_i[item] = local_beta_i.get(item, 0) + residual/len(train_item_diction[item])

    # shrink the bias of items with fewer than cutoff_item ratings
    for item in local_beta_i:
        local_beta_i[item] = min(1,len(train_item_diction[item])/cutoff_item)*local_beta_i[item]

    # Same fix as for the users: largest change over ALL items.
    if epoch != 0:
        beta_i_change = max((abs(local_beta_i[item] - beta_i[item]) for item in local_beta_i), default=0)
        delta = max(delta, beta_i_change)

    beta_i = local_beta_i
    epoch += 1


train_prediction = np.zeros(train_sample)
test_prediction = np.zeros(total_sample - train_sample)
for index, (user, item, rating) in tqdm(enumerate(train_rows), total=train_sample):
    train_prediction[index] = alpha_4 + beta_u[user] + beta_i[item]

for index, (user, item, rating) in tqdm(enumerate(data[train_sample:]), total=total_sample-train_sample):
    # users/items unseen in training contribute no bias
    test_prediction[index] = alpha_4 + beta_u.get(user, 0) + beta_i.get(item, 0)


train_error_4 = mse(allRatings[:train_sample], train_prediction)
print('model 4 train mse', train_error_4)
print('model 4 train rmse', np.sqrt(train_error_4))

if not run_final:
    # the held-out slice is empty when run_final is True, so skip it then
    test_error_4 = mse(allRatings[train_sample:], test_prediction)
    print('model 4 testing mse', test_error_4)
    print('model 4 testing rmse', np.sqrt(test_error_4))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=200000.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


model 4 train mse 0.8069862430497315
model 4 train rmse 0.8983241302835695


In [56]:
# Write the alpha + beta_u + beta_i predictions for every pair in pairs_Rating.txt.
if run_final:
    # Context managers close both files even if a line fails to parse.
    with open("predictions_Rating_alpha_betau_betai_reguser{}_regitem{}.txt".format(cutoff_user, cutoff_item), 'w') as predictions, \
         open("../data/pairs_Rating.txt") as pairs:
        for l in pairs:
            if l.startswith("reviewerID"):
                predictions.write(l) # header
                continue
            u,i = l.strip().split('-')
            # beta_u / beta_i are keyed by the training users / items, so
            # dict lookups replace linear scans of train_user_list and
            # train_item_list on every line. An unseen user or item simply
            # contributes no bias (0).
            rating = alpha_4 + beta_i.get(i, 0) + beta_u.get(u, 0)
            predictions.write(u + '-' + i + ',' + str(rating) + '\n')