In [1]:
import gzip
from collections import defaultdict
import scipy
import scipy.optimize
import numpy
import random
import pandas as pd
import itertools
from matplotlib import pyplot as plt

In [2]:
def readGz(path):
    for l in gzip.open(path, "rt"):
        yield eval(l)

def readCSV(path):
    f = gzip.open(path, "rt")
    f.readline()
    for l in f:
        yield l.strip().split(",")

In [3]:
data = []

for user, book, rating in readCSV("train_Interactions.csv.gz"):
    data.append([user, book, rating])

train_data = data[:190000]
valid_data = data[190000:]

In [4]:
ratingsPerUser = defaultdict(list)
ratingsPerBook = defaultdict(list)

for user, book, rating in train_data:
    ratingsPerUser[user].append(rating)
    ratingsPerBook[book].append(rating)

In [5]:
N = len(train_data)
nUsers = len(ratingsPerUser)
nBooks = len(ratingsPerBook)
users = list(ratingsPerUser.keys())
books = list(ratingsPerBook.keys())

In [6]:
ratingMean = sum([int(d[2]) for d in train_data]) / N

alpha = ratingMean

In [7]:
userBiases = defaultdict(float)
bookBiases = defaultdict(float)

In [8]:
def MSE(predictions, labels):
    J = sum((predictions - labels) ** 2)/ len(labels)
    return J

In [9]:
def prediction(user, book):
    if user in userBiases and book in bookBiases:
        return alpha + userBiases[user] + bookBiases[book]
    elif user in bookBiases and book not in bookBiases:
        return alpha + userBiases[user]
    elif user not in bookBiases and book in bookBiases:
        return alpha + bookBiases[book]
    else:
        return alpha

In [10]:
def unpack(theta):
    global alpha
    global userBiases
    global bookBiases
    alpha = theta[0]
    userBiases = dict(zip(users, theta[1 : nUsers + 1]))
    bookBiases = dict(zip(books, theta[1 + nUsers :]))

In [11]:
def cost(theta, labels, lamb, l1, data):
    unpack(theta)
    predictions = [prediction(user, book) for user, book, _ in data]
    cost = MSE(numpy.array(predictions), numpy.array(labels))
    print("MSE = " + str(cost))
    #for u in userBiases:
     #   cost += lamb * userBiases[u] ** 2
    #for i in bookBiases:
    #    cost += lamb * bookBiases[i] ** 2
    cost += lamb*(1 - l1)*(sum(numpy.array(list(userBiases.values())) ** 2) + sum(numpy.array(list(bookBiases.values())) ** 2))
    cost += lamb * l1 * (sum(numpy.abs(numpy.array(list(userBiases.values())))) + sum(numpy.abs(numpy.array(list(bookBiases.values())))))
    return cost

In [12]:
sign = lambda x : numpy.copysign(1, x)

In [13]:
def derivative(theta, labels, lamb, l1, data):
    unpack(theta)
    N = len(data)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dBookBiases = defaultdict(float)
    for user, book, rating in data:
        pred = prediction(user, book)
        diff = pred - float(rating)
        dalpha += 2 / N * diff
        dUserBiases[user] += 2 / N * diff
        dBookBiases[book] += 2 / N * diff
    
    for user in userBiases:
        dUserBiases[user] += 2 * lamb * (1 - l1) * userBiases[user] + lamb * l1 * sign(userBiases[user])
        
    for book in bookBiases:
        dBookBiases[book] += 2 * lamb * (1 - l1) * bookBiases[book] + lamb * l1 * sign(bookBiases[book])
    
    dtheta =  [dalpha] + [dUserBiases[u] for u in users] + [dBookBiases[b] for b in books]
        
    return numpy.array(dtheta)

In [14]:
train_labels = [float(d[2]) for d in train_data]
train_alwaysPredictMean = [float(alpha) for d in train_data]

In [15]:
MSE(numpy.array(train_alwaysPredictMean), numpy.array(train_labels))

1.4735475011336192

In [20]:
theta = scipy.optimize.fmin_l_bfgs_b(cost,[alpha] + [0.0] * (nUsers + nBooks), derivative,args=(train_labels, 0.00001, 0.25, train_data))

MSE = 1.4814337548468803
MSE = 2.293363042210713
MSE = 1.4733842256204268
MSE = 1.473227103721692
MSE = 1.4725990813230208
MSE = 1.4700944348665983
MSE = 1.46019493925109
MSE = 1.422502400155635
MSE = 1.1920321823433524
MSE = 1.0652601296486968
MSE = 1.0556963211312334
MSE = 1.0271727365580325
MSE = 1.0100593263853004
MSE = 0.974624655744501
MSE = 0.9614706154715367
MSE = 0.9610117515322717
MSE = 0.9609499302934326
MSE = 0.9607453651108362
MSE = 0.9600801807950957
MSE = 0.9584255663030808
MSE = 0.9546043429224209
MSE = 0.9484265146077718
MSE = 2.443036789965412
MSE = 0.9485763956595791
MSE = 0.9429558474438441
MSE = 0.9408678320692607
MSE = 0.9407037166985477
MSE = 0.9407123173393946
MSE = 0.9403278124625067
MSE = 1.5795306684235078
MSE = 0.9409995931607942
MSE = 0.9400813076391846
MSE = 0.9936147202541333
MSE = 0.9371427041881008
MSE = 0.9328843868809991
MSE = 0.9288125865276382
MSE = 0.9276648001549389
MSE = 0.9281207843889614
MSE = 0.9276840862399007
MSE = 0.9275826037372401
MSE = 0

In [21]:
unpack(theta[0])
valid_labels = [float(d[2]) for d in valid_data]
valid_predictions = [prediction(u, b) for u,b,_ in valid_data]
print(MSE(numpy.array(valid_predictions), numpy.array(valid_labels)))
theta_model1 = theta[0]

1.1100949524615094


In [23]:
max_ub = (max(userBiases, key=userBiases.get), max(userBiases.values()))
min_ub = (min(userBiases, key=userBiases.get), min(userBiases.values()))
max_bb = (max(bookBiases, key=bookBiases.get), max(bookBiases.values()))
min_bb = (min(bookBiases, key=bookBiases.get), min(bookBiases.values()))

summary_table = [max_ub, min_ub, max_bb, min_bb]

pd.DataFrame(
        summary_table,
        columns=["ID", "Value"],
        index=["Max.User Bias", "Min.User Bias", "Max.Book Bias", "Min.Book Bias"],
)

Unnamed: 0,ID,Value
Max.User Bias,u81539151,1.228056
Min.User Bias,u48313610,-3.671815
Max.Book Bias,b19925500,1.309103
Min.Book Bias,b84091840,-1.644747


In [None]:
def parameter_tuning(hyperparameter_list):
    global alpha
    global userBiases
    global bookBiases
    alpha = ratingMean
    userBiases = defaultdict(float)
    bookBiases = defaultdict(float)
    training_MSE = []
    valid_MSE = []
    for lamb, l1 in hyperparameter_list:
        print("For hyperparameter")
        theta = scipy.optimize.fmin_l_bfgs_b(cost, [alpha] + [0.0]*(nUsers+nBooks),
                derivative, args = (train_labels, lamb, l1, train_data))
        unpack(theta[0])
        training_MSE.append(theta[1])
        valid_labels = [float(d[2]) for d in valid_data]
        valid_predictions = [prediction(u, b) for u,b,_ in valid_data]
        valid_MSE.append(MSE(numpy.array(valid_predictions), numpy.array(valid_labels)))
    MSE_table = {'Hyperparameter': hyperparameter_list, 'Training MSE': training_MSE, 'Validation MSE': valid_MSE}
    return pd.DataFrame(MSE_table)

In [None]:
hyperparameter_list = [0.000010, 0.000005, 0.000020, 0.000015]
l1_strengths = [0]

thresholds_criteria = list(itertools.product(hyperparameter_list, l1_strengths))

parameter_tuning(thresholds_criteria)

In [24]:
userGamma = {}
bookGamma = {}

K = 5
for u in ratingsPerUser:
    userGamma[u] = [random.random() * 0.1 - 0.05 for k in range(K)]
for b in ratingsPerBook:
    bookGamma[b] = [random.random() * 0.1 - 0.05 for k in range(K)]

In [25]:
def prediction(user, book):
    if user in userBiases and book in bookBiases:
        return alpha + userBiases[user] + bookBiases[book] + inner(userGamma[user], bookGamma[book])
    elif user in bookBiases and book not in bookBiases:
        return alpha + userBiases[user]
    elif user not in bookBiases and book in bookBiases:
        return alpha + bookBiases[book]
    else:
        return alpha

In [26]:
def unpack(theta):
    global alpha
    global userBiases
    global bookBiases
    global userGamma
    global bookGamma
    index = 0
    alpha = theta[index]
    index += 1
    userBiases = dict(zip(users, theta[index : index + nUsers]))
    index += nUsers
    bookBiases = dict(zip(books, theta[index : index + nBooks]))
    index += nBooks
    for u in users:
        userGamma[u] = theta[index : index + K]
        index += K
    for b in books:
        bookGamma[b] = theta[index : index + K]
        index += K

In [27]:
def inner(x, y):
    return sum([a * b for a, b in zip(x, y)])

In [28]:
def cost(theta, labels, lamb1, lamb2, l1, data):
    unpack(theta)
    predictions = [prediction(user, book) for user, book, _ in data]
    cost = MSE(numpy.array(predictions), numpy.array(labels))
    print("MSE = " + str(cost))
    #for u in users:
     #   cost += lamb * userBiases[u] ** 2
      #  for k in range(K):
       #     cost += lamb * userGamma[u][k] ** 2
    #for b in books:
     #   cost += lamb * bookBiases[b] ** 2
      #  for k in range(K):
       #     cost += lamb * bookGamma[b][k] ** 2
    cost += lamb1 * (1 - l1)* (sum(numpy.array(list(userBiases.values())) ** 2) + sum(numpy.array(list(bookBiases.values())) ** 2))
    cost += lamb1 * l1 * (sum(numpy.abs(numpy.array(list(userBiases.values())))) + sum(numpy.abs(numpy.array(list(bookBiases.values())))))
    cost += lamb2 * (sum(sum(numpy.abs(numpy.array(list(userGamma.values()))))) + sum(sum(numpy.abs(numpy.array(list(bookGamma.values()))))))
    return cost

In [30]:
def derivative(theta, labels, lamb1, lamb2, l1, data):
    unpack(theta)
    N = len(data)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dBookBiases = defaultdict(float)
    dUserGamma = {}
    dBookGamma = {}
    for u in ratingsPerUser:
        dUserGamma[u] = [0.0 for k in range(K)]
    for b in ratingsPerBook:
        dBookGamma[b] = [0.0 for k in range(K)]
    for user, book, rating in data:
        pred = prediction(user, book)
        diff = pred - float(rating)
        dalpha += 2 / N * diff
        dUserBiases[user] += 2 / N * diff
        dBookBiases[book] += 2 / N * diff
        for k in range(K):
            dUserGamma[user][k] += 2 / N * bookGamma[book][k] * diff
            dBookGamma[book][k] += 2 / N * userGamma[user][k] * diff
    for u in userBiases:
        dUserBiases[u] += 2 * lamb1 * (1 - l1) * userBiases[u] + lamb1 * l1 * sign(userBiases[u])
        for k in range(K):
            dUserGamma[u][k] +=  lamb2 * sign(userGamma[u][k])
    for b in bookBiases:
        dBookBiases[b] += 2 * lamb1 * (1 - l1) * bookBiases[b] + lamb1 * l1 * sign(bookBiases[b])
        for k in range(K):
            dBookGamma[b][k] += lamb2 * sign(bookGamma[b][k])
    dtheta = (
        [dalpha] + [dUserBiases[u] for u in users] + [dBookBiases[b] for b in books]
    )
    for u in users:
        dtheta += dUserGamma[u]
    for b in books:
        dtheta += dBookGamma[b]
    return numpy.array(dtheta)

In [31]:
train_labels = [float(d[2]) for d in train_data]
train_alwaysPredictMean = [float(alpha) for d in train_data]
MSE(numpy.array(train_alwaysPredictMean), numpy.array(train_labels))

1.4790475125391764

In [None]:
theta = scipy.optimize.fmin_l_bfgs_b(
    cost,
    list(theta_model1)
    + [random.random() * 0.1 - 0.05 for k in range(K * (nUsers + nBooks))],
    derivative,
    args=(train_labels, 0.000009, 0.000008, 0, train_data), maxfun = 40, maxiter = 40
)

MSE = 0.908652222207209
MSE = 0.9085914689283121


In [None]:
unpack(theta[0])
valid_labels = [float(d[2]) for d in valid_data]
valid_predictions = [prediction(u, b) for u, b, _ in valid_data]
MSE(numpy.array(valid_predictions), numpy.array(valid_labels))

In [None]:
def parameter_tuning(hyperparameter_list):
    global alpha
    global userBiases
    global bookBiases
    global userGamma
    global bookGamma
    
    alpha = ratingMean
    userBiases = defaultdict(float)
    bookBiases = defaultdict(float)
    userGamma = {}
    bookGamma = {}
    
    K = 2
    for u in ratingsPerUser:
        userGamma[u] = [random.random() * 0.1 - 0.05 for k in range(K)]
    for b in ratingsPerBook:
        bookGamma[b] = [random.random() * 0.1 - 0.05 for k in range(K)]
    
    training_MSE = []
    valid_MSE = []
    for lamb2 in hyperparameter_list:
        print("For hyperparameter")
        theta = scipy.optimize.fmin_l_bfgs_b(
            cost,
            list(theta_model1) + [random.random() * 0.1 - 0.05 for k in range(K * (nUsers + nBooks))],
            derivative,
            args=(train_labels, 0.00001, lamb2, 0.6, train_data), maxfun = 40, maxiter = 40
        )
        unpack(theta[0])
        training_MSE.append(theta[1])
        valid_labels = [float(d[2]) for d in valid_data]
        valid_predictions = [prediction(u, b) for u, b, _ in valid_data]
        valid_MSE.append(MSE(numpy.array(valid_predictions), numpy.array(valid_labels)))
    MSE_table = {
        "Hyperparameter": hyperparameter_list,
        "Training MSE": training_MSE,
        "Validation MSE": valid_MSE,
    }
    return pd.DataFrame(MSE_table)

In [None]:
hyperparameter_list = [1e-5, 1e-6, 0.00000078, 0.0000090, 0.000006]

parameter_tuning(hyperparameter_list)

In [22]:
predictions = open("predictions_Rating.txt", "w")

for l in open("pairs_Rating.txt"):
    if l.startswith("userID"):
        # header
        predictions.write(l)
        continue
    u, b = l.strip().split("-")
    predictions.write(u + "-" + b + "," + str(prediction(u, b)) + "\n")

predictions.close()