## Assignment 1 

#### Rating Prediction

In [1]:
import numpy
import urllib
import scipy.optimize
import random
import gzip
import csv
from collections import defaultdict
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [2]:
# input file using gzip
path = "train_Interactions.csv.gz"
f = gzip.open(path, "rt", encoding="utf8")
reader = csv.reader(f, delimiter=",")

In [3]:
# reading the file to build dataset
dataset = []
first = True
for line in reader:
    if first:
        header = line
        first = False
    else:
        d = dict(zip(header, line))
        # convert strings to integers for some fields
        d["rating"] = int(d["rating"])
        dataset.append(d)

In [4]:
# split the training data
data_train = dataset[:190000]
data_valid = dataset[190000:]

In [5]:
# computer the global mean
globalAverage = sum([d["rating"] for d in dataset]) / len(dataset)
globalAverage

3.896755

In [5]:
data_valid

[{'userID': 'u35176258', 'bookID': 'b30592470', 'rating': 3},
 {'userID': 'u30851063', 'bookID': 'b81941226', 'rating': 3},
 {'userID': 'u31368414', 'bookID': 'b40097012', 'rating': 5},
 {'userID': 'u71352502', 'bookID': 'b25118404', 'rating': 2},
 {'userID': 'u46986025', 'bookID': 'b89866434', 'rating': 3},
 {'userID': 'u77839057', 'bookID': 'b86897717', 'rating': 5},
 {'userID': 'u46053847', 'bookID': 'b10522573', 'rating': 3},
 {'userID': 'u26198785', 'bookID': 'b88505013', 'rating': 3},
 {'userID': 'u21610566', 'bookID': 'b28243795', 'rating': 5},
 {'userID': 'u28535398', 'bookID': 'b28137477', 'rating': 3},
 {'userID': 'u29625574', 'bookID': 'b37174111', 'rating': 3},
 {'userID': 'u17995164', 'bookID': 'b30165591', 'rating': 5},
 {'userID': 'u62901418', 'bookID': 'b23356648', 'rating': 4},
 {'userID': 'u44879734', 'bookID': 'b37939750', 'rating': 5},
 {'userID': 'u07012449', 'bookID': 'b74391258', 'rating': 4},
 {'userID': 'u84958932', 'bookID': 'b70203436', 'rating': 4},
 {'userI

In [6]:
# calculate initial value of alpha,beta_user and beta_book
alpha = globalAverage
userRatings = defaultdict(list)
bookRatings = defaultdict(list)

for l in dataset:
    user, book = l["userID"], l["bookID"]
    userRatings[user].append(l["rating"])
    bookRatings[book].append(l["rating"])

# beta_user,beta_book
userBias = defaultdict(float)
for u in userRatings:
    userBias[u] = globalAverage - (sum(userRatings[u]) / len(userRatings[u]))

bookBias = defaultdict(float)
for b in bookRatings:
    bookBias[b] = globalAverage - (sum(bookRatings[b]) / len(bookRatings[b]))

In [7]:
# define function for optimization
def differenciate(globalAverage, userBias, bookBias, lamb1,lamb2, epsilon):
    end = False
    globalAverage_last = 0
    userBias_last = userBias
    bookBias_last = bookBias
    MSE_last = 0
    cost_last = 0

    while not end:
        # update alpha
        globalAverage = 0
        for i in dataset:
            user, book = i["userID"], i["bookID"]
            globalAverage += i["rating"] - userBias_last[user] - bookBias_last[book]

        globalAverage = globalAverage / len(dataset)

        # update beta_user
        num_book = defaultdict(int)
        for u in userBias:
            userBias[u] = 0
        for i in dataset:
            user, book = i["userID"], i["bookID"]
            num_book[user] += 1
            userBias[user] += i["rating"] - globalAverage - bookBias_last[book]
        for u in userBias:
            userBias[u] = userBias[u] / (lamb1 + num_book[u])

        # update beta_book
        num_user = defaultdict(int)
        for b in bookBias:
            bookBias[b] = 0
        for i in dataset:
            user, book = i["userID"], i["bookID"]
            num_user[book] += 1
            bookBias[book] += i["rating"] - globalAverage - userBias[user]
        for b in bookBias:
            bookBias[b] = bookBias[b] / (lamb2 + num_user[b])

        predictions = []
        
        for d in dataset:
            user = d["userID"]
            book = d["bookID"]
            if user in userBias and book in bookBias:
                result = globalAverage + userBias[user] + bookBias[book]
            else:
                result = globalAverage
            predictions.append(result)
        
        labels = [l["rating"] for l in dataset]
        differences = [(x - y) ** 2 for x, y in zip(predictions, labels)]
        MSE = sum(differences) / len(differences)
        
        cost = MSE
        for u in userBias:
            cost += lamb1 * (userBias[u]**2)
        for i in bookBias:
            cost += lamb2 * (bookBias[i]**2)

        
        if (
            abs(MSE - MSE_last) < epsilon
            and abs(cost - cost_last) < epsilon
        ):
            end = True
        else:
            globalAverage_last = globalAverage
            userBias_last = userBias
            bookBias_last = bookBias
            MSE_last = MSE
            cost_last = cost

    return globalAverage, userBias, bookBias

In [8]:
# prediction result
def prediction(user, book):
    if user in userBias and book in bookBias:
        result = globalAverage_new + userBias_new[user] + bookBias_new[book]
    else:
        result = globalAverage_new
    return result

In [9]:
# define MSE function
def MSE(dataset, lamb):
    predictions = [prediction(d["userID"], d["bookID"]) for d in dataset]
    labels = [l["rating"] for l in dataset]
    differences = [(x - y) ** 2 for x, y in zip(predictions, labels)]
    mse = sum(differences) / len(differences)
    return mse

In [None]:
lamb = [0.5,1,1.3]
mse = []
for r in lamb:
    globalAverage_new, userBias_new, bookBias_new = differenciate(
        globalAverage, userBias, bookBias, 3,5, epsilon=1.66e-05
    )
    m = MSE(data_valid, r)
    mse.append(m)

In [44]:
globalAverage_new, userBias_new, bookBias_new = differenciate(
        globalAverage, userBias, bookBias, 3,10, epsilon=1e-05
    )
mse = MSE(data_valid, 1)

In [41]:
mse

0.923050524777207

In [45]:
mse

0.9300635803446867

In [None]:
# Optimal λ
optimal = min(mse)
optimal_lambda = lamb[mse.index(optimal)]
print("Optimal λ:", optimal_lambda, "; MSE:", optimal)

In [57]:
globalAverage_new, userBias_new, bookBias_new = differenciate(
    globalAverage, userBias, bookBias, 2,13, epsilon=1e-05)

predictions = open("predictions_Rating.txt", "w")
for l in open("pairs_Rating.txt"):
    if l.startswith("userID"):
        # header
        predictions.write(l)
        continue
    u, b = l.strip().split("-")
    p = prediction(u, b)
    predictions.write(u + "-" + b + "," + str(p) + "\n")
predictions.close()

In [None]:
2&13 - 1.12932