In [1]:
# NOTE: ratings are integers from 0 to 5 only.

In [2]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
import string
from sklearn import linear_model
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from scipy.optimize import minimize
from itertools import islice
import csv

In [3]:
def assertFloat(x):
    """Check that ``x`` can be converted to a float.

    Any exception raised by ``float(x)`` (e.g. ``ValueError`` or ``TypeError``)
    propagates to the caller; otherwise the function returns ``None``.
    """
    converted = float(x)
    assert type(converted) == float

def assertFloatList(items, N):
    """Check that ``items`` has exactly ``N`` entries, each convertible to float.

    Raises ``AssertionError`` on a length mismatch; conversion errors raised by
    ``float`` propagate unchanged.
    """
    assert len(items) == N
    converted_types = list(map(lambda value: type(float(value)), items))
    assert converted_types == [float] * N

In [4]:
import warnings
# Install a blanket "ignore" filter so no warnings are shown for the rest of the session.
# NOTE(review): this also hides warnings that may matter (e.g. deprecations) — confirm it is intended.
warnings.filterwarnings("ignore")

In [5]:
def readGz(path):
    """Yield one Python object per line of a gzip-compressed text file.

    Args:
        path: Path to a gzip file in which every line is a Python expression.

    Yields:
        The value obtained by evaluating each line in turn.
    """
    # The context manager closes the file as soon as the generator is
    # exhausted or closed, instead of relying on garbage collection.
    with gzip.open(path, 'rt') as handle:
        for line in handle:
            # SECURITY: eval() executes arbitrary code, so only feed this
            # function trusted files. If every line is a plain literal,
            # ast.literal_eval would be a safer replacement.
            yield eval(line)

In [6]:
def readCSV(path):
    """Yield ``(user, item, date, rating)`` tuples from a CSV file.

    The first row is treated as a header and skipped. Only the first four
    columns of each remaining row are used; the fourth is converted to float.

    Args:
        path: Path to the CSV file.

    Yields:
        Tuples ``(u, i, d, r)`` where ``u``, ``i`` and ``d`` are strings and
        ``r`` is a float.

    Raises:
        ValueError: If a non-empty row has fewer than four columns or its
            fourth column cannot be parsed as a float.
    """
    # newline='' lets the csv module handle line endings itself, which keeps
    # newlines embedded in quoted fields intact.
    with open(path, mode='r', newline='') as file:
        reader = csv.reader(file)
        # Skip the header row
        next(reader, None)
        for row in reader:
            # csv.reader returns [] for blank lines; skip them instead of
            # failing on the unpack below.
            if not row:
                continue
            u, i, d, r = row[:4]
            yield u, i, d, float(r)


In [7]:
# Load the training interactions and build the per-user / per-item lookup
# tables used by the rest of the notebook.
train_dataset = []
ratings_dict = defaultdict(float)
itemsPerUser = defaultdict(set)
usersPerItem = defaultdict(set)
ratingsPerItem = defaultdict(list)
ratingsPerUser = defaultdict(list)
rating_total = 0
for interaction in readCSV("data/interactions_train.csv"):
    user, recipe, date, rating = interaction
    train_dataset.append(interaction)
    rating_total += rating
    itemsPerUser[user].add(recipe)
    usersPerItem[recipe].add(user)
    ratingsPerUser[user].append((recipe, rating))
    ratingsPerItem[recipe].append((user, rating))

# The mean training rating is the global fallback prediction.
global_fallback_rating = rating_total / len(train_dataset)
print(global_fallback_rating)


4.574089892559891


In [43]:
# Replaces the training-set mean computed earlier with a constant 5 as the fallback rating.
# NOTE(review): this cell's execution count (43) is out of sequence with its neighbours —
# confirm the override is intended and survives a fresh Restart & Run All.
global_fallback_rating = 5

In [8]:
def baseline_predictor(user, item, date):
    """Return the global fallback rating, whatever the arguments.

    ``user``, ``item`` and ``date`` are accepted but not used; the prediction
    is the current value of the module-level ``global_fallback_rating``.
    """
    prediction = global_fallback_rating
    return prediction



In [9]:
# Load the validation interactions and keep their true ratings in a parallel list.
validation_dataset = list(readCSV("data/interactions_validation.csv"))
validation_reals = [rating for _user, _recipe, _date, rating in validation_dataset]
validation_preds = []

    

In [10]:
def Jaccard(set1, set2):
    """Return the Jaccard similarity |set1 ∩ set2| / |set1 ∪ set2|.

    Returns 0 when the union is empty.
    """
    shared = set1.intersection(set2)
    combined = set2.union(set1)
    if not combined:
        return 0
    return len(shared) / len(combined)
    
def Jaccard_recipe(r1, r2):
    """Return the Jaccard similarity between the user sets recorded for two recipes.

    Both sets are looked up in the module-level ``usersPerItem`` mapping.
    """
    return Jaccard(usersPerItem[r1], usersPerItem[r2])
    
def Jaccard_user(user1, user2):
    """Return the Jaccard similarity between the item sets recorded for two users.

    Both sets are looked up in the module-level ``itemsPerUser`` mapping.
    """
    return Jaccard(itemsPerUser[user1], itemsPerUser[user2])

In [39]:
def train_model(reg_param, learning_rate, tolerance, max_iterations=200, patience=5):
    """Fit a bias-only rating model ``alpha + user_bias[u] + item_bias[i]``.

    The parameters are refined by alternating closed-form updates over the
    module-level training structures (``train_dataset``, ``ratingsPerUser``,
    ``ratingsPerItem``). After every sweep the model is scored on
    ``validation_dataset`` / ``validation_reals``; training stops early once
    the validation MSE has failed to improve for ``patience`` sweeps in a row.

    Args:
        reg_param: Regularisation strength, added to the rating count in the
            denominator of every user/item bias update.
        learning_rate: Unused by this procedure; kept so existing callers
            keep working.
        tolerance: Validation predictions within this distance of an integer
            are snapped to that integer before scoring.
        max_iterations: Upper bound on the number of sweeps.
        patience: Number of consecutive non-improving sweeps tolerated before
            stopping.

    Returns:
        Tuple ``(alpha, user_bias, item_bias)`` taken from the sweep with the
        lowest validation MSE (not necessarily the last sweep). The bias
        mappings are ``defaultdict(float)`` instances.
    """
    alpha = numpy.mean([r for _, _, _, r in train_dataset])
    user_bias = defaultdict(float)
    item_bias = defaultdict(float)

    def predict(u, b):
        # Unseen users/items contribute a zero bias; .get avoids inserting
        # keys into the defaultdicts as a side effect of scoring.
        temp = alpha + user_bias.get(u, 0.0) + item_bias.get(b, 0.0)
        if abs(temp - round(temp)) <= tolerance:
            temp = round(temp)
        return temp

    best_mse = float('inf')
    # Fallback in case no sweep ever improves on the initial state.
    best_model = (alpha, user_bias.copy(), item_bias.copy())
    stale_sweeps = 0  # consecutive sweeps without a validation improvement

    for iteration in range(max_iterations):
        alpha = sum(r - (user_bias[u] + item_bias[i]) for u, i, _, r in train_dataset) / len(train_dataset)

        for u, userRatings in ratingsPerUser.items():
            numUserRatings = len(userRatings)
            if numUserRatings > 0:
                rsum = sum(r - (alpha + item_bias[i]) for i, r in userRatings)
                user_bias[u] = rsum / (numUserRatings + reg_param)

        for i, itemRatings in ratingsPerItem.items():
            numRatings = len(itemRatings)
            if numRatings > 0:
                item_bias[i] = sum(r - (alpha + user_bias[u]) for u, r in itemRatings) / (reg_param + numRatings)

        # Score the current parameters on the validation set.
        preds = [predict(u, b) for u, b, _, _ in validation_dataset]
        val_mse = mean_squared_error(validation_reals, preds)

        # The label says "sse" but the value is a mean squared error; the text
        # is kept unchanged so logs stay comparable with earlier runs.
        print(f"Iteration {iteration + 1}: validation sse = {val_mse}")

        if val_mse < best_mse:
            best_mse = val_mse
            # Snapshot the parameters: later sweeps keep mutating the dicts.
            best_model = (alpha, user_bias.copy(), item_bias.copy())
            stale_sweeps = 0
        else:
            stale_sweeps += 1
            if stale_sweeps >= patience:
                print("Early stopping triggered.")
                break

    # Return the best snapshot rather than the parameters of the final sweep,
    # which after early stopping are `patience` sweeps past the optimum.
    return best_model

def get_train_model_mse(alpha, user_bias, item_bias, tolerance):
    """Compute, print and return the model's MSE on the validation set.

    Each prediction is ``alpha + user_bias[u] + item_bias[i]``, with a bias of
    zero for users/items missing from the mappings. Predictions within
    ``tolerance`` of an integer are snapped to that integer before scoring.

    Args:
        alpha: Global offset of the model.
        user_bias: Mapping from user to bias; a plain dict or a defaultdict.
        item_bias: Mapping from item to bias; a plain dict or a defaultdict.
        tolerance: Maximum distance to the nearest integer at which a
            prediction is rounded.

    Returns:
        The mean squared error over the module-level ``validation_dataset``.
    """
    predictions = []
    actual_ratings = []
    for u, i, _, r in validation_dataset:
        # .get keeps the caller's mappings unmodified (indexing a defaultdict
        # would insert a key per unseen user/item) and supports plain dicts.
        pred = alpha + user_bias.get(u, 0.0) + item_bias.get(i, 0.0)
        if abs(pred - round(pred)) <= tolerance:
            pred = round(pred)
        predictions.append(pred)
        actual_ratings.append(r)
    mse = mean_squared_error(actual_ratings, predictions)
    print(f"MSE = {mse}")
    return mse

In [44]:
# Notes kept from earlier runs (presumably pasted from previous sweep output):
# MSE = 1.575032439397132, lm: 3.1
# lm: 2.95, learn: 0.3 tolerance: 0.009999999999999998 mse: 1.581717799135119
# mse: 1.5813980268600554  learn: 0.3 tolerance: 0.04 lm 3
# best lm: 3.1, best mse: 1.580479671270761, best learn: 0.3 best tolerance: 0.060000000000000005
# best lm: 3.1, best mse: 1.580479671270761, best learn: 0.3 best tolerance: 0.060000000000000005
# best lm: 3.1, best mse: 1.5804654895942463, best learn: 0.3 best tolerance: 0.065
# best lm: 3.1, best mse: 1.5746003993638258, best learn: 0.3 best tolerance: 0.07100000000000001

# Candidate values for each hyperparameter; every combination is trained and scored.
lambda_values = [3.1]
learn_values = [0.3]
tolerance_values = [0.071]

best_lambda = None
best_mse = float('inf')
# best_alpha is initialised with the other best_* names so it is always
# defined, even if no configuration improves on the initial best_mse.
best_alpha = None
best_user_bias, best_item_bias, best_learn, best_tolerance = None, None, None, None
for lam in lambda_values:
    for learn in learn_values:
        for tolerance in tolerance_values:
            print(f"lm: {lam}, learn: {learn} tolerance: {tolerance}")
            alpha, user_bias, item_bias = train_model(lam, learn, tolerance)
            mse = get_train_model_mse(alpha, user_bias, item_bias, tolerance)
            # Prints this configuration's MSE (the label text is kept as-is).
            print(f"best mse: {mse}")
            if mse < best_mse:
                best_mse = mse
                best_lambda = lam
                best_user_bias = user_bias.copy()
                best_item_bias = item_bias.copy()
                best_tolerance = tolerance
                best_learn = learn
                best_alpha = alpha

lamb = best_lambda
validMSE = best_mse
print(f"best lm: {best_lambda}, best mse: {best_mse}, best learn: {best_learn} best tolerance: {best_tolerance}")

lm: 3.1, learn: 0.3 tolerance: 0.071
Iteration 1: validation sse = 1.6034029923658129
Iteration 2: validation sse = 1.6107269309291223
Iteration 3: validation sse = 1.6071734121748065
Iteration 4: validation sse = 1.603001827500647
Iteration 5: validation sse = 1.5997357770108607
Iteration 6: validation sse = 1.596965918243216
Iteration 7: validation sse = 1.5945348501689616
Iteration 8: validation sse = 1.5927761717804791
Iteration 9: validation sse = 1.5911445243424116
Iteration 10: validation sse = 1.5897322716043152
Iteration 11: validation sse = 1.5882728182935113
Iteration 12: validation sse = 1.5870579699405227
Iteration 13: validation sse = 1.5860689584886851
Iteration 14: validation sse = 1.584865434311437
Iteration 15: validation sse = 1.5839030387332294
Iteration 16: validation sse = 1.583075973649536
Iteration 17: validation sse = 1.582410710341175
Iteration 18: validation sse = 1.5817794263930725
Iteration 19: validation sse = 1.581187756396905
Iteration 20: validation sse

In [45]:
# Score the selected model on the held-out test set.
test_dataset = []
test_reals = []
predictions = []
for user, recipe, date, rating in readCSV("data/interactions_test.csv"):
    test_reals.append(rating)
    estimate = best_alpha + best_user_bias[user] + best_item_bias[recipe]
    nearest = round(estimate)
    # Snap to the nearest integer only when the estimate is within tolerance of it.
    predictions.append(nearest if abs(estimate - nearest) <= best_tolerance else estimate)

print(f"test set MSE: {mean_squared_error(test_reals, predictions)}")


test set MSE: 1.6798063243865557
