# Baseline

In [1]:
import gzip
from collections import defaultdict
from sklearn import linear_model
import csv

def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

def readCSV(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    for l in c:
        d = dict(zip(header,l))
        yield d['user_id'],d['recipe_id'],d

In [2]:
reviews = []

for _, _, review in readCSV("data/trainInteractions.csv.gz"):
  reviews.append(review)

train_reviews = reviews[:400000]
valid_reviews = reviews[400000:500000]

In [3]:
valid_dict = defaultdict(int)
for valid_review in valid_reviews:
    user = valid_review['user_id']
    recipe = valid_review['recipe_id']
    rating = valid_review['rating']
    valid_dict[(user, recipe)] =  int(rating)

In [8]:
users_per_recipe = defaultdict(set)
recipes_per_user = defaultdict(set)
train_dict = defaultdict(int)
for train_review in reviews:
    user = train_review['user_id']
    recipe = train_review['recipe_id']
    rating = train_review['rating']
    train_dict[(user, recipe)] =  int(rating)
    recipes_per_user[user].add(recipe)
    users_per_recipe[recipe].add(user)

In [11]:
N = len(train_dict)

In [12]:
def calculate_alpha(data_dict, beta_user_dict, beta_item_dict, N):
    sum = 0
    for (user, item), rating in data_dict.items():
        sum += rating - (beta_user_dict[user] + beta_item_dict[item])
    return sum/N

def calculate_beta_user(data_dict, items_per_user, alpha, beta_item_dict, lamb):
    beta_user_dict = {}
    for user, items in items_per_user.items():
        sum = 0
        for item in items:
            rating = data_dict[(user, item)]
            beta_item = beta_item_dict[item]
            sum += rating - (alpha +beta_item)
        beta_user = sum / (lamb + len(items_per_user[user]))
        beta_user_dict[user] = beta_user
    return beta_user_dict

def calculate_beta_item(data_dict, users_per_item, alpha, beta_user_dict, lamb):
    beta_item_dict = {}
    for item, users in users_per_item.items():
        sum = 0
        for user in users:
            rating = data_dict[(user, item)]
            beta_user = beta_user_dict[user]
            sum += rating - (alpha + beta_user)
        beta_item = sum/(lamb + len(users_per_item[item]))
        beta_item_dict[item] = beta_item
    return beta_item_dict

In [13]:
def MSE(predicted, validation):
    mse = 0
    for (user, recipe), review in predicted.items():
        mse += (review - validation[(user, recipe)])**2
    return mse/len(predicted)

In [14]:
lamb = 10.7
beta_user_dict = {user:0 for user in recipes_per_user.keys()}
beta_recipe_dict = {recipe:0 for recipe in users_per_recipe.keys()}

for i in range(100):
    alpha = calculate_alpha(train_dict, beta_user_dict, beta_recipe_dict, N)
    beta_user_dict = calculate_beta_user(train_dict, recipes_per_user, alpha, beta_recipe_dict, lamb)
    beta_recipe_dict = calculate_beta_item(train_dict, users_per_recipe, alpha, beta_user_dict, lamb)

pred_valid_dict = defaultdict(int)
for user, recipe in valid_dict.keys():
    beta_user = beta_user_dict[user] if user in beta_user_dict else 0
    beta_recipe = beta_recipe_dict[recipe] if recipe in beta_recipe_dict else 0
    prediction = alpha + beta_user + beta_recipe
    if prediction > 5:
        pred_valid_dict[(user, recipe)] = 5
    elif prediction < 0:
        pred_valid_dict[(user, recipe)] = 0
    else:
        pred_valid_dict[(user, recipe)] = prediction


mse = MSE(pred_valid_dict, valid_dict)

print(f"MSE on the validation set with lambda={10.5}: {mse}")

MSE on the validation set with lambda=10.5: 0.6971074688023815


In [32]:
def predict(test_list):
    pred_test_dict = defaultdict(int)
    for (user, recipe) in test_list:
        beta_user = beta_user_dict[user] if user in beta_user_dict else 0
        beta_recipe = beta_recipe_dict[recipe] if recipe in beta_recipe_dict else 0
        prediction = alpha + beta_user + beta_recipe
        if prediction > 5:
            pred_test_dict[(user, recipe)] = 5
        elif prediction < 0:
            pred_test_dict[(user, recipe)] = 0
        elif abs(round(prediction)-prediction) < 0.008:
            pred_test_dict[(user, recipe)] = round(prediction)
        else:
            pred_test_dict[(user, recipe)] = prediction
    return pred_test_dict

In [31]:
header = ""
test_list = []
for line in open("data/stub_Rated.txt"):
    if line.startswith("user_id"):
        header = line
        continue
    user, recipe = line.strip().split('-')
    test_list.append((user, recipe))


pred_test_dict = predict(test_list)

predictions = open("data/predictions_Rated.txt", 'w')
predictions.write(header)
for key, pred in pred_test_dict.items():
    user, recipe = key
    predictions.write(f"{user}-{recipe},{pred}\n")


## Surprise


In [64]:
from surprise import SVD, Reader, Dataset
from surprise.model_selection import train_test_split
import pandas as pd

df = pd.read_csv("data/trainInteractions.csv.gz")

reader = Reader(line_format='user item rating', sep='\t')
data = Dataset.load_from_df(df[['user_id','recipe_id', 'rating']], reader=reader)

In [192]:
model = SVD()
trainset, validationset = train_test_split(data, test_size=0.2)

In [201]:
model.fit(trainset)
predictions = model.test(validationset)

In [203]:
sse = 0
for p in predictions:
    sse += (p.r_ui - p.est)**2

print(sse / len(predictions))

0.8313158910940747


In [69]:
header = ""
test_list = []
for line in open("data/stub_Rated.txt"):
    if line.startswith("user_id"):
        header = line
        continue
    user, recipe = line.strip().split('-')
    test_list.append((user, recipe))

testset = {(int(user_id), int(recipe_id), 0) for (user_id, recipe_id) in test_list}

pred_test = model.test(testset)

predictions = open("data/predictions_Rated.txt", 'w')
predictions.write(header)
for elem in pred_test:
    user, recipe, pred = str(elem.uid), str(elem.iid), elem.est
    user = (8 - len(user))*"0" + user
    recipe = (8 - len(recipe))*"0" + recipe
    predictions.write(f"{user}-{recipe},{pred}\n")

## Latent factor model (Tensorflow)

In [70]:
def parse(path):
    g = gzip.open(path, 'r')
    for l in g:
        yield eval(l)

In [85]:
df = pd.read_csv("data/trainInteractions.csv.gz")

users = [d for d in df['user_id']]
recipes = [d for d in df['recipe_id']]
ratings = [d for d in df['rating']]

userIDs = {}
recipeIDs = {}
interactions = []

for user, recipe, rating in zip(users, recipes, ratings):
    if not user in userIDs: userIDs[user] = len(userIDs)
    if not recipe in recipeIDs: recipeIDs[recipe] = len(recipeIDs)
    interactions.append((user,recipe,rating))

In [87]:
import random
random.shuffle(interactions)
len(interactions)

500000

In [88]:
train_interactions = interactions[:400000]
valid_interactions = interactions[400000:]

In [89]:
itemsPerUser = defaultdict(list)
usersPerItem = defaultdict(list)
for u,i,r in train_interactions:
    itemsPerUser[u].append(i)
    usersPerItem[i].append(u)

In [90]:
mu = sum([r for _,_,r in train_interactions]) / len(train_interactions)

In [91]:
import tensorflow as tf
optimizer = tf.keras.optimizers.Adam(0.1)

2021-11-12 22:58:11.522213: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-12 22:58:11.522242: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [98]:
class LatentFactorModel(tf.keras.Model):
    def __init__(self, mu, K, lamb):
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu)
        # Initialize to small random values
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)],stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([len(recipeIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(recipeIDs),K],stddev=0.001))
        self.lamb = lamb

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, i):
        p = self.alpha + self.betaU[u] + self.betaI[i] +\
            tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.reduce_sum(self.betaU**2) +\
                            tf.reduce_sum(self.betaI**2) +\
                            tf.reduce_sum(self.gammaU**2) +\
                            tf.reduce_sum(self.gammaI**2))
    
    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        pred = self.alpha + beta_u + beta_i +\
               tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return pred
    
    # Loss
    def call(self, sampleU, sampleI, sampleR):
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)

In [99]:
modelLFM = LatentFactorModel(mu, 5, 0.00001)

In [100]:
def trainingStep(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleR = [], [], []
        for _ in range(Nsamples):
            u,i,r = random.choice(interactions)
            sampleU.append(userIDs[u])
            sampleI.append(recipeIDs[i])
            sampleR.append(r)

        loss = model(sampleU,sampleI,sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [101]:
for i in range(100):
    obj = trainingStep(modelLFM, train_interactions)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))

iteration 10, objective = 0.4000813
iteration 20, objective = 0.3919067
iteration 30, objective = 0.39466333
iteration 40, objective = 0.39993954
iteration 50, objective = 0.40726262
iteration 60, objective = 0.40791407
iteration 70, objective = 0.39146918
iteration 80, objective = 0.40565452
iteration 90, objective = 0.39377075
iteration 100, objective = 0.3961101


In [107]:
pred_valid_dict = defaultdict(int)
i = 0
for user, recipe, rating in valid_interactions:
    pred = modelLFM.predict(userIDs[user], recipeIDs[recipe]).numpy()
    pred_valid_dict[(user, recipe)] = pred
    i+=1



10000
20000
30000
40000
50000
60000
70000
80000
90000
100000


In [108]:
valid_dict = defaultdict(int)
for user, recipe, rating in valid_interactions:
    valid_dict[(user, recipe)] = rating

In [109]:
mse = MSE(pred_valid_dict, valid_dict)
mse

0.8485252041790833

In [110]:
factors = [4, 6, 7, 8]
rates = [0.000001 ,0.000005, 0.000015, 0.00002]

for factor in factors:
    for rate in rates:
        modelLFM = LatentFactorModel(mu, factor, rate)

        for i in range(100):
            obj = trainingStep(modelLFM, train_interactions)

        pred_valid_dict = defaultdict(int)
        for user, recipe, rating in valid_interactions:
            pred = modelLFM.predict(userIDs[user], recipeIDs[recipe]).numpy()
            pred_valid_dict[(user, recipe)] = pred

        mse = MSE(pred_valid_dict, valid_dict)

        print(f"MSE with factor={factor} and rate={rate}: {mse}")

MSE with factor=4 and rate=1e-06: 1.0251667164897809
MSE with factor=4 and rate=5e-06: 0.8960743558739334
MSE with factor=4 and rate=1.5e-05: 0.8397884594729242
MSE with factor=4 and rate=2e-05: 0.8363532631079387
MSE with factor=6 and rate=1e-06: 1.0182774160346437
MSE with factor=6 and rate=5e-06: 0.903168768575282
MSE with factor=6 and rate=1.5e-05: 0.8443399480144281
MSE with factor=6 and rate=2e-05: 0.8395638914215693
MSE with factor=7 and rate=1e-06: 1.0346710795690381
MSE with factor=7 and rate=5e-06: 0.9012724151087284
MSE with factor=7 and rate=1.5e-05: 0.8508002665567341
MSE with factor=7 and rate=2e-05: 0.842742691194159
MSE with factor=8 and rate=1e-06: 1.0586551410210274
MSE with factor=8 and rate=5e-06: 0.9027758789650924
MSE with factor=8 and rate=1.5e-05: 0.853021465336339
MSE with factor=8 and rate=2e-05: 0.8478977345741846


In [112]:
factors = [4]
rates = [0.000025]

for factor in factors:
    for rate in rates:
        modelLFM = LatentFactorModel(mu, factor, rate)

        for i in range(100):
            obj = trainingStep(modelLFM, train_interactions)

        pred_valid_dict = defaultdict(int)
        for user, recipe, rating in valid_interactions:
            pred = modelLFM.predict(userIDs[user], recipeIDs[recipe]).numpy()
            pred_valid_dict[(user, recipe)] = pred

        mse = MSE(pred_valid_dict, valid_dict)

        print(f"MSE with factor={factor} and rate={rate}: {mse}")

MSE with factor=4 and rate=2.5e-05: 0.844711193449904


## Complete latent factor model

In [217]:
df = pd.read_csv("data/trainInteractions.csv.gz")

users = [d for d in df['user_id']]
recipes = [d for d in df['recipe_id']]
ratings = [d for d in df['rating']]

rating_mean = sum(ratings)/len(ratings)
always_predict_mean = [rating_mean]*len(train_reviews)

In [218]:
import gzip
from collections import defaultdict
from sklearn import linear_model
import csv

def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

def readCSV(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    for l in c:
        d = dict(zip(header,l))
        yield d['user_id'],d['recipe_id'],d

In [219]:
reviews = []

for _, _, review in readCSV("data/trainInteractions.csv.gz"):
  reviews.append(review)

train_reviews = reviews[:400000]
valid_reviews = reviews[400000:500000]

In [220]:
userBiases = defaultdict(float)
itemBiases = defaultdict(float)

userGamma = {}
itemGamma = {}

In [221]:
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

for review in train_reviews:
    user,recipe = review['user_id'], review['recipe_id']
    reviewsPerUser[user].append(review)
    reviewsPerItem[recipe].append(review)

In [222]:
K = 2

for u in reviewsPerUser:
    userGamma[u] = [random.random() * 0.1 - 0.05 for k in range(K)]

for i in reviewsPerItem:
    itemGamma[i] = [random.random() * 0.1 - 0.05 for k in range(K)]

In [223]:
nUsers = len(reviewsPerUser)
nItems = len(reviewsPerItem)
users = list(reviewsPerUser.keys())
items = list(reviewsPerItem.keys())

In [224]:
def unpack(theta):
    global alpha
    global userBiases
    global itemBiases
    global userGamma
    global itemGamma
    index = 0
    alpha = theta[index]
    index += 1
    userBiases = dict(zip(users, theta[index:index+nUsers]))
    index += nUsers
    itemBiases = dict(zip(items, theta[index:index+nItems]))
    index += nItems
    for u in users:
        userGamma[u] = theta[index:index+K]
        index += K
    for i in items:
        itemGamma[i] = theta[index:index+K]
        index += K

In [225]:
def inner(x, y):
    return sum([a*b for a,b in zip(x,y)])

In [226]:
def pred(user, item):
    return alpha + userBiases[user] + itemBiases[item] + inner(userGamma[user], itemGamma[item])

In [227]:
#predictions = [pred(d['user_id'], d['recipe_id']) for d in train_reviews]


In [228]:
def cost(theta, labels, lamb):
    unpack(theta)
    predictions = [pred(d['user_id'], d['recipe_id']) for d in train_reviews]
    cost = MSE(predictions, labels)
    print("MSE = " + str(cost))
    for u in users:
        cost += lamb*userBiases[u]**2
        for k in range(K):
            cost += lamb*userGamma[u][k]**2
    for i in items:
        cost += lamb*itemBiases[i]**2
        for k in range(K):
            cost += lamb*itemGamma[i][k]**2
    return cost

In [229]:
import numpy

def derivative(theta, labels, lamb):
    unpack(theta)
    N = len(train_reviews)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    dUserGamma = {}
    dItemGamma = {}
    for u in reviewsPerUser:
        dUserGamma[u] = [0.0 for k in range(K)]
    for i in reviewsPerItem:
        dItemGamma[i] = [0.0 for k in range(K)]
    for d in train_reviews:
        u,i = d['user_id'], d['recipe_id']
        pred = prediction(u, i)
        diff = pred - int(d['rating'])
        dalpha += 2/N*diff
        dUserBiases[u] += 2/N*diff
        dItemBiases[i] += 2/N*diff
        for k in range(K):
            dUserGamma[u][k] += 2/N*itemGamma[i][k]*diff
            dItemGamma[i][k] += 2/N*userGamma[u][k]*diff
    for u in userBiases:
        dUserBiases[u] += 2*lamb*userBiases[u]
        for k in range(K):
            dUserGamma[u][k] += 2*lamb*userGamma[u][k]
    for i in itemBiases:
        dItemBiases[i] += 2*lamb*itemBiases[i]
        for k in range(K):
            dItemGamma[i][k] += 2*lamb*itemGamma[i][k]
    dtheta = [dalpha] + [dUserBiases[u] for u in users] + [dItemBiases[i] for i in items]
    for u in users:
        dtheta += dUserGamma[u]
    for i in items:
        dtheta += dItemGamma[i]
    return numpy.array(dtheta)

In [230]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [231]:

MSE(always_predict_mean, labels)

0.8987313600348047

In [233]:
import scipy

labels = [int(d['rating']) for d in train_reviews]

scipy.optimize.fmin_l_bfgs_b(cost, [alpha] + # Initialize alpha
                                   [0.0]*(nUsers+nItems) + # Initialize beta
                                   [random.random() * 0.1 - 0.05 for k in range(K*(nUsers+nItems))], # Gamma
                             derivative, args = (labels, 10.5))

MSE = 0.9000736097431962
MSE = 0.9000565408868471
MSE = 0.8999895138533285
MSE = 0.8996174282145899
MSE = 0.8994542761162203
MSE = 0.8989648773961573
MSE = 0.8987183355329932
MSE = 0.898718373357114
MSE = 0.8987183391339414


(array([ 4.58078847e+00, -8.19672958e-06, -7.68353151e-07, ...,
        -1.52378259e-14,  2.30762908e-14, -1.43155992e-13]),
 0.89872484856411,
 {'grad': array([ 4.64529462e-07,  7.54267441e-11, -3.60914257e-12, ...,
         -3.19936130e-13,  4.84600660e-13, -3.00627249e-12]),
  'task': 'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL',
  'funcalls': 9,
  'nit': 5,
  'warnflag': 0})