In [73]:
import numpy as np
import pandas as pd
import gzip
from collections import defaultdict
from sklearn import linear_model
import csv
import random
import scipy
import scipy.optimize

In [74]:
##reading data and forming validation set

def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

def readCSV(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    for l in c:
        d = dict(zip(header,l))
        yield d['user_id'],d['recipe_id'],d['rating']

In [75]:
totalCooked = 0
train_set = []
validation_set = []
allRatings = []
userRatings = defaultdict(list)
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)

for user,recipe,rate in readCSV("trainInteractions.csv.gz"):
    totalCooked += 1
    if totalCooked < 400000:
        usersPerItem[recipe].add(user)
        itemsPerUser[user].add(recipe)
        r = int(rate)
        allRatings.append(r)
        train_set.append((user,recipe,rate))
        userRatings[user].append(r)
    if totalCooked >= 400000:
        validation_set.append((user,recipe,rate))


globalAverage = sum(allRatings) / len(allRatings)

In [76]:
nUsers = len(itemsPerUser)
nItems = len(usersPerItem)
users = list(itemsPerUser.keys())
items = list(usersPerItem.keys())

In [77]:
userBiases = defaultdict(float)
itemBiases = defaultdict(float)

In [78]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [79]:
def prediction(user, item):
    return alpha + userBiases[user] + itemBiases[item]

In [80]:
def unpack(theta):
    global alpha
    global userBiases
    global itemBiases
    alpha = theta[0]
    userBiases = dict(zip(users, theta[1:nUsers+1]))
    itemBiases = dict(zip(items, theta[1+nUsers:]))

In [81]:
def cost(theta, labels, lamb):
    unpack(theta)
    predictions = [prediction(d[0], d[1]) for d in train_set]
    cost = MSE(predictions, labels)
    print("MSE = " + str(cost))
    for u in userBiases:
        cost += lamb*userBiases[u]**2
    for i in itemBiases:
        cost += lamb*itemBiases[i]**2
    return cost

In [82]:
def derivative(theta, labels, lamb):
    unpack(theta)
    N = len(train_set)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    for d in train_set:
        u,i = d[0], d[1]
        pred = prediction(u, i)
        diff = pred - int(d[2])
        dalpha += 2/N*diff
        dUserBiases[u] += 2/N*diff
        dItemBiases[i] += 2/N*diff
    for u in userBiases:
        dUserBiases[u] += 2*lamb*userBiases[u]
    for i in itemBiases:
        dItemBiases[i] += 2*lamb*itemBiases[i]
    dtheta = [dalpha] + [dUserBiases[u] for u in users] + [dItemBiases[i] for i in items]
    return np.array(dtheta)

In [83]:
x,f,d = scipy.optimize.fmin_l_bfgs_b(cost, [globalAverage] + [0.0]*(nUsers+nItems),
                             derivative, args = (allRatings, 1))

MSE = 0.8987331675050297
MSE = 0.8856375816231887
MSE = 0.898597087332774
MSE = 0.8985970389661542


In [86]:
alpha = x[0]
beta_u = x[1:nUsers+1]
beta_i = x[nUsers+1:]
ItemBiases = defaultdict(float)
UserBiases = defaultdict(float)
for i in range(len(users)):
    UserBiases[users[i]] = beta_u[i]
for i in range(len(items)):
    UserBiases[items[i]] = beta_i[i]
    
validation_rates = [int(d[2]) for d in validation_set]
    
predicted = []
for i in range(len(validation_set)):
    user = validation_set[i][0]
    item = validation_set[i][1]
    rate = validation_set[1][2]
    predicted.append(alpha + UserBiases[user] + ItemBiases[item])
print("MSE on validation set = " + str(MSE(predicted,validation_rates)))    

MSE on validation set = 0.9094351485419185


In [87]:
print("Item with maximum bias = " + str(max(ItemBiases, key=ItemBiases.get)))
print("User with maximum bias = " + str(max(UserBiases, key=UserBiases.get)))

Item with maximum bias = 76256724
User with maximum bias = 32445558


In [88]:
lambda_max = 0
min_MSE = 1000
for lambda_ in [0.001,0.01,0.1,1,10,100]:
    x,f,d = scipy.optimize.fmin_l_bfgs_b(cost, [globalAverage] + [0.0]*(nUsers+nItems),
                             derivative, args = (allRatings, lambda_))
    alpha = x[0]
    beta_u = x[1:nUsers+1]
    beta_i = x[nUsers+1:]
    ItemBiases = defaultdict(float)
    UserBiases = defaultdict(float)
    for i in range(len(users)):
        UserBiases[users[i]] = beta_u[i]
    for i in range(len(items)):
        UserBiases[items[i]] = beta_i[i]

    validation_rates = [int(d[2]) for d in validation_set]
    predicted = []
    for i in range(len(validation_set)):
        user = validation_set[i][0]
        item = validation_set[i][1]
        rate = validation_set[1][2]
        predicted.append(alpha + UserBiases[user] + ItemBiases[item])
    MSE_lambda = MSE(predicted,validation_rates)
    if MSE_lambda < min_MSE:
        min_MSE = MSE_lambda
        lambda_max = lambda_
print(min_MSE, lambda_max)    

MSE = 0.8987331675050297
MSE = 0.8856375816231887
MSE = 1.065441094811514
MSE = 0.8838503954709337
MSE = 0.8783967111041078
MSE = 0.8774773515849977
MSE = 0.8739924581177699
MSE = 0.8608699675914993
MSE = 0.8570189699934744
MSE = 0.8538148221884256
MSE = 0.8536452605109054
MSE = 0.8537555501089433
MSE = 0.8537338733728912
MSE = 0.8536777631428745
MSE = 0.8536423013761935
MSE = 0.8536449724100961
MSE = 0.8536446846701096
MSE = 0.8536533746241003
MSE = 0.8536447977761387
MSE = 0.8987331675050297
MSE = 0.8856375816231887
MSE = 0.9657174700227784
MSE = 0.8854124142437872
MSE = 0.8896050307721269
MSE = 0.888668918753176
MSE = 0.8887459295308636
MSE = 0.8887418924805084
MSE = 0.8987331675050297
MSE = 0.8856375816231887
MSE = 0.8974328882737885
MSE = 0.8975045743914509
MSE = 0.8974282740439955
MSE = 0.897428028896924
MSE = 0.8974261102781619
MSE = 0.8987331675050297
MSE = 0.8856375816231887
MSE = 0.898597087332774
MSE = 0.8985970389661542
MSE = 0.8987331675050297
MSE = 0.8856375816231887
MSE 

Lambda value with minimum MSE is 0.001 and the MSE on validation set is 0.88

In [89]:
allRatings = []
userRatings = defaultdict(list)
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)


for user,recipe,rate in readCSV("trainInteractions.csv.gz"):
    usersPerItem[recipe].add(user)
    itemsPerUser[user].add(recipe)
    r = int(rate)
    allRatings.append(r)

globalAverage = sum(allRatings) / len(allRatings)

nUsers = len(itemsPerUser)
nItems = len(usersPerItem)
users = list(itemsPerUser.keys())
items = list(usersPerItem.keys())
    
x,f,d = scipy.optimize.fmin_l_bfgs_b(cost, [globalAverage] + [0.0]*(nUsers+nItems),
                             derivative, args = (allRatings, 0.001))

alpha = x[0]
beta_u = x[1:nUsers+1]
beta_i = x[nUsers+1:]
ItemBiases = defaultdict(float)
UserBiases = defaultdict(float)
for i in range(len(users)):
    UserBiases[users[i]] = beta_u[i]
for i in range(len(items)):
    UserBiases[items[i]] = beta_i[i]
    



MSE = 0.8987331675335475
MSE = 0.885655711462093
MSE = 1.0514168085665376
MSE = 0.8837539762754325
MSE = 0.8783964283627522
MSE = 0.877529926025342
MSE = 0.874233655552501
MSE = 0.8610729138272688
MSE = 0.8571565345207578
MSE = 0.853888986826966
MSE = 0.8536986822582134
MSE = 0.8537676121195551
MSE = 0.8537162316403992
MSE = 0.8536691856933226
MSE = 0.8536405044548784
MSE = 0.8536393477099931
MSE = 0.8536447848480913
MSE = 0.8536399918702993
MSE = 0.8536441752771732
MSE = 0.853644647974439
MSE = 0.8536446551261031


In [90]:
predictions = open("predictions_Rated.txt", 'w')
for l in open("stub_Rated.txt"):
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    if (u not in users) and (i not in items):
        predictions.write(u + '-' + i + ',' + str(alpha) + '\n')
    elif (u not in users):
        predictions.write(u + '-' + i + ',' + str(alpha + ItemBiases[i]) + '\n')
    elif (i not in items):
        predictions.write(u + '-' + i + ',' + str(alpha + UserBiases[u]) + '\n')
    elif (i in items) and (u in users):
        predictions.write(u + '-' + i + ',' + str(alpha + UserBiases[u] + ItemBiases[i]) + '\n')

    

predictions.close()