Loading libraries

In [None]:
import gzip
from collections import defaultdict
import scipy
import scipy.optimize
import numpy
import random
import pandas as pd
from matplotlib import pyplot as plt

Defining functions to extract zip, and extract data.

In [None]:
def readGz(path):
    for l in gzip.open(path, "rt"):
        yield eval(l)


def readCSV(path):
    f = gzip.open(path, "rt")
    f.readline()
    for l in f:
        yield l.strip().split(",")

Collecting ratings average, user ratings average

Question 9, Defining the data structure to build a model to predict rating based on a latent factor model using the training rating data

In [None]:
data = []

for user, book, rating in readCSV("train_Interactions.csv.gz"):
    data.append([user, book, rating])

train_data = data[:190000]
valid_data = data[190000:]

In [None]:
ratingsPerUser = defaultdict(list)
ratingsPerBook = defaultdict(list)

for user, book, rating in train_data:
    ratingsPerUser[user].append(rating)
    ratingsPerBook[book].append(rating)

In [None]:
N = len(train_data)
nUsers = len(ratingsPerUser)
nBooks = len(ratingsPerBook)
users = list(ratingsPerUser.keys())
books = list(ratingsPerBook.keys())

In [None]:
ratingMean = sum([int(d[2]) for d in train_data]) / N

alpha = ratingMean

In [None]:
userBiases = defaultdict(float)
bookBiases = defaultdict(float)

In [None]:
userGamma = {}
bookGamma = {}

K = 2
for u in ratingsPerUser:
    userGamma[u] = [random.random() * 0.1 - 0.05 for k in range(K)]
for b in ratingsPerBook:
    bookGamma[b] = [random.random() * 0.1 - 0.05 for k in range(K)]

In [None]:
def MSE(predictions, labels):
    differences = [(x - y) ** 2 for x, y in zip(predictions, labels)]
    return sum(differences) / len(differences)

In [None]:
def prediction(user, book):
    if user in userBiases and book in bookBiases:
        return (
            alpha
            + userBiases[user]
            + bookBiases[book]
            + inner(userGamma[user], bookGamma[book])
        )
    elif user in bookBiases and book not in bookBiases:
        return alpha + userBiases[user]
    elif user not in bookBiases and book in bookBiases:
        return alpha + bookBiases[book]
    else:
        return alpha

In [None]:
def unpack(theta):
    global alpha
    global userBiases
    global bookBiases
    global userGamma
    global bookGamma
    index = 0
    alpha = theta[index]
    index += 1
    userBiases = dict(zip(users, theta[index : index + nUsers]))
    index += nUsers
    bookBiases = dict(zip(books, theta[index : index + nBooks]))
    index += nBooks
    for u in users:
        userGamma[u] = theta[index : index + K]
        index += K
    for b in books:
        bookGamma[b] = theta[index : index + K]
        index += K

In [None]:
def inner(x, y):
    return sum([a * b for a, b in zip(x, y)])

In [None]:
def cost(theta, labels, lamb, data):
    unpack(theta)
    predictions = [prediction(user, book) for user, book, _ in data]
    cost = MSE(predictions, labels)
    print("MSE = " + str(cost))
    for u in users:
        cost += lamb * userBiases[u] ** 2
        for k in range(K):
            cost += lamb * userGamma[u][k] ** 2
    for b in books:
        cost += lamb * bookBiases[b] ** 2
        for k in range(K):
            cost += lamb * bookGamma[b][k] ** 2
    return cost

In [None]:
def derivative(theta, labels, lamb, data):
    unpack(theta)
    N = len(data)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dBookBiases = defaultdict(float)
    dUserGamma = {}
    dBookGamma = {}
    for u in ratingsPerUser:
        dUserGamma[u] = [0.0 for k in range(K)]
    for b in ratingsPerBook:
        dBookGamma[b] = [0.0 for k in range(K)]
    for user, book, rating in data:
        pred = prediction(user, book)
        diff = pred - float(rating)
        dalpha += 2 / N * diff
        dUserBiases[user] += 2 / N * diff
        dBookBiases[book] += 2 / N * diff
        for k in range(K):
            dUserGamma[user][k] += 2 / N * bookGamma[book][k] * diff
            dBookGamma[book][k] += 2 / N * userGamma[user][k] * diff
    for u in userBiases:
        dUserBiases[u] += 2 * lamb * userBiases[u]
        for k in range(K):
            dUserGamma[u][k] += 2 * lamb * userGamma[u][k]
    for b in bookBiases:
        dBookBiases[b] += 2 * lamb * bookBiases[b]
        for k in range(K):
            dBookGamma[b][k] += 2 * lamb * bookGamma[b][k]
    dtheta = (
        [dalpha] + [dUserBiases[u] for u in users] + [dBookBiases[b] for b in books]
    )
    for u in users:
        dtheta += dUserGamma[u]
    for b in books:
        dtheta += dBookGamma[b]
    return numpy.array(dtheta)

In [16]:
train_labels = [float(d[2]) for d in data]
train_alwaysPredictMean = [float(alpha) for d in data]

In [17]:
MSE(train_alwaysPredictMean, train_labels)

1.4744156039668792

In [19]:
theta = scipy.optimize.fmin_l_bfgs_b(
    cost,
    [alpha]
    + [0.0] * (nUsers + nBooks)
    + [random.random() * 0.1 - 0.05 for k in range(K * (nUsers + nBooks))],
    derivative,
    args=(train_labels, 1, train_data), maxfun = 75, maxiter = 75
)

MSE = 1.473552588725499
MSE = 1.4735223480936397
MSE = 1.4733899542381623


In [20]:
unpack(theta[0])
valid_labels = [float(d[2]) for d in valid_data]
valid_predictions = [prediction(u, b) for u, b, _ in valid_data]
MSE(valid_predictions, valid_labels)

1.4907801137874852

Using the latent-factor model with a lambda of 1, for the validation set we get a MSE of 1.4907

Question 10, the max-min Bias values|Ids of User and Book for the model we trained using training data.

In [21]:
max_ub = (max(userBiases, key=userBiases.get), max(userBiases.values()))
min_ub = (min(userBiases, key=userBiases.get), min(userBiases.values()))
max_bb = (max(bookBiases, key=bookBiases.get), max(bookBiases.values()))
min_bb = (min(bookBiases, key=bookBiases.get), min(bookBiases.values()))

summary_table = [max_ub, min_ub, max_bb, min_bb]

pd.DataFrame(
    summary_table,
    columns=["ID", "Value"],
    index=["Max.User Bias", "Min.User Bias", "Max.Book Bias", "Min.Book Bias"],
)

Unnamed: 0,ID,Value
Max.User Bias,u92864068,0.000404
Min.User Bias,u11591742,-0.00158
Max.Book Bias,b76915592,0.000829
Min.Book Bias,b57299824,-0.000272


Question 11, Choosing form a range of hyperpataers to tune model to best performance (lowest MSE)

In [22]:
def parameter_tuning(hyperparameter_list):
    global alpha
    global userBiases
    global bookBiases
    global userGamma
    global bookGamma
    
    alpha = ratingMean
    userBiases = defaultdict(float)
    bookBiases = defaultdict(float)
    userGamma = {}
    bookGamma = {}
    
    K = 2
    for u in ratingsPerUser:
        userGamma[u] = [random.random() * 0.1 - 0.05 for k in range(K)]
    for b in ratingsPerBook:
        bookGamma[b] = [random.random() * 0.1 - 0.05 for k in range(K)]
    
    training_MSE = []
    valid_MSE = []
    for lamb in hyperparameter_list:
        print("For hyperparameter")
        theta = scipy.optimize.fmin_l_bfgs_b(
            cost,
            [alpha] + [0.0] * (nUsers + nBooks) + [random.random() * 0.1 - 0.05 for k in range(K * (nUsers + nBooks))],
            derivative,
            args=(train_labels, lamb, train_data), maxfun = 75, maxiter = 75
        )
        unpack(theta[0])
        training_MSE.append(theta[1])
        valid_labels = [float(d[2]) for d in valid_data]
        valid_predictions = [prediction(u, b) for u, b, _ in valid_data]
        valid_MSE.append(MSE(valid_predictions, valid_labels))
    MSE_table = {
        "Hyperparameter": hyperparameter_list,
        "Training MSE": training_MSE,
        "Validation MSE": valid_MSE,
    }
    return pd.DataFrame(MSE_table)

In [23]:
hyperparameter_list = [0.01, 1, 100]

parameter_tuning(hyperparameter_list)

For hyperparameter
MSE = 1.473542867684955
MSE = 1.4707492784354008
MSE = 1.524751077429654
MSE = 1.664430648359679
MSE = 1.4571072357488954
MSE = 1.458424836696519
MSE = 1.4584194341918297
MSE = 1.4584193576772098
MSE = 1.4584210732917318
MSE = 1.4584224633044278
For hyperparameter
MSE = 1.4735559183679785
MSE = 1.4735244758416304
MSE = 1.4733899552153853
For hyperparameter
MSE = 1.473558912755138
MSE = 1.4735545906277125
MSE = 1.4735459256151198
MSE = 1.473545925599727
MSE = 1.4735459255395476


Unnamed: 0,Hyperparameter,Training MSE,Validation MSE
0,0.01,1.465883,1.478535
1,1.0,1.473469,1.49078
2,100.0,1.473547,1.490908


The MSE for the training and validation set is generalized at lambda value 0.00001 from the different training lambda performances summarized above. Using this lambda, model we predict on test set. The solution has been uploaded to kaggle (Username: mouserat)

In [32]:
predictions = open("predictions_Rating.txt", "w")

for l in open("pairs_Rating.txt"):
    if l.startswith("userID"):
        # header
        predictions.write(l)
        continue
    u, b = l.strip().split("-")
    predictions.write(u + "-" + b + "," + str(prediction(u, b)) + "\n")

predictions.close()