In [36]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
import string
from sklearn import linear_model
import csv

In [37]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [38]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    for l in c:
        d = dict(zip(header,l))
        yield d['user_id'],d['recipe_id'],d

In [39]:
allRatings = []
userRatings = defaultdict(list)
itemRatings = defaultdict(list)
whole_data = []
ratingDict = {}
validRatingDict = {}
recipesPerUser = defaultdict(list)
usersPerRecipe = defaultdict(list)

for user,recipe,d in readCSV("assignment1/trainInteractions.csv.gz"):
    whole_data.append((user, recipe, int(d['rating'])))

train = whole_data[:400000]
test = whole_data[400000:]

for u,r,rate in whole_data:
    allRatings.append(rate)
    userRatings[u].append(rate)
    itemRatings[r].append(rate)
    ratingDict[(u,r)] = rate
    recipesPerUser[u].append(r)
    usersPerRecipe[r].append(u)
    
for u,r,rate in test:
    validRatingDict[(u,r)] = rate

In [40]:
def iterate(alpha, beta_u, beta_i, L): 
    Ntrain = len(train)
    alpha_num = 0
    for u,i,d in train:
            alpha_num += ratingDict[(u,i)] - (beta_u[u] + beta_i[i])
    alpha_new = alpha_num/Ntrain
    beta_u_new = {}
    beta_i_new = {}
    
    for u in recipesPerUser:
        beta_u_denom = L + len(recipesPerUser[u]) 
        beta_u_num = 0
        for i in recipesPerUser[u]:
            beta_u_num += ratingDict[(u,i)] - (alpha_new + beta_i[i])
        beta_u_new[u] = beta_u_num/beta_u_denom
    for i in usersPerRecipe:
        beta_i_denom = L + len(usersPerRecipe[i]) 
        beta_i_num = 0
        for u in usersPerRecipe[i]:
            beta_i_num += ratingDict[(u,i)] - (alpha_new + beta_u[u])
        beta_i_new[i] = beta_i_num/beta_i_denom
    return alpha_new, beta_u_new, beta_i_new

In [41]:
def initialize(): 
    alpha = 3 
    beta_u = {} 
    beta_i = {}
    for u in recipesPerUser: 
        beta_u[u] = 3
    for i in usersPerRecipe: 
        beta_i[i] = 3
    return alpha, beta_u, beta_i

In [42]:
def getValidMSE(alpha, beta_u, beta_i): 
    se = 0
    for u,i in validRatingDict: 
        f_ui = alpha
        if u in beta_u:
            f_ui += beta_u[u]
        if i in beta_i:
            f_ui += beta_i[i]
        se += (validRatingDict[(u,i)] - f_ui)**2 
        rmse = se/len(validRatingDict)
    return rmse

def getTrainMSE(alpha, beta_u, beta_i): 
    se = 0
    for u,i in ratingDict:
        f_ui = alpha + beta_u[u] + beta_i[i] 
        se += (ratingDict[(u,i)] - f_ui)**2

    rmse = se/len(ratingDict) 
    return rmse

In [43]:
def iterateWrapper(L):
    alpha, beta_u, beta_i = initialize()
    mse = getTrainMSE(alpha, beta_u, beta_i) 
    diff = 500
    while diff >= 0.0001:
        alpha_new, beta_u_new, beta_i_new = iterate(alpha, beta_u, beta_i, L)
        mse_new = getTrainMSE(alpha_new, beta_u_new, beta_i_new)
        diff = abs(mse - mse_new)
        mse = mse_new
        print("MSE: ", mse)
        validMSE = getValidMSE(alpha_new, beta_u_new, beta_i_new)
        print("Validation MSE: ", validMSE)
        alpha, beta_u, beta_i = alpha_new, beta_u_new, beta_i_new
    return alpha, beta_u, beta_i, mse, validMSE

In [46]:
alpha, beta_u, beta_i, mse, validMSE = iterateWrapper(L=11)

MSE:  6.287727589149299
Validation MSE:  6.279083287838342
MSE:  2.0739771595118595
Validation MSE:  2.070305972738066
MSE:  1.1726468147935722
Validation MSE:  1.172659570584092
MSE:  0.9588857905911344
Validation MSE:  0.961081714679422
MSE:  0.8505280968146863
Validation MSE:  0.8539737555670859
MSE:  0.7982677343606122
Validation MSE:  0.8025290548472335
MSE:  0.7639259817244417
Validation MSE:  0.768742271427195
MSE:  0.7422340504651502
Validation MSE:  0.7474542760373222
MSE:  0.7271774671876241
Validation MSE:  0.732703759747769
MSE:  0.7168402213108115
Validation MSE:  0.7226040319306197
MSE:  0.7095524278344598
Validation MSE:  0.7155060117474009
MSE:  0.7044282617178887
Validation MSE:  0.7105345122117739
MSE:  0.7008020731237313
Validation MSE:  0.7070333658400045
MSE:  0.698240960967449
Validation MSE:  0.7045748919502791
MSE:  0.696432490026909
Validation MSE:  0.7028515422756826
MSE:  0.6951595091781869
Validation MSE:  0.701649245485844
MSE:  0.6942665472219941
Validatio

In [47]:
predictions = open("predictions_Rated.txt", 'w') 
for l in open("assignment1/stub_Rated.txt"):
    if l.startswith("user_id"): #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-') 
    f_ui = alpha
    if u in beta_u:
        f_ui += beta_u[u]

    if i in beta_i:
        f_ui += beta_i[i]
    predictions.write(u + '-' + i + ',' + str(f_ui) + '\n') 
predictions.close()