# CSE 158 Fall 2023, Assignment 1

### Library Import

In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
from sklearn import linear_model
import numpy as np
import string
import random
import matplotlib.pyplot as plt
import tensorflow as tf

### Utility Functions

In [2]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [3]:
def readJSON(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        d = eval(l)
        u = d['userID']
        g = d['gameID']
        yield u,g,d

In [4]:
allHours = []
for l in readJSON("train.json.gz"):
    allHours.append(l)

In [5]:
hoursTrain = allHours[:165000]
hoursValid = allHours[165000:]

In [6]:
hoursPerUser = defaultdict(list)
hoursPerItem = defaultdict(list)
for u,g,d in hoursTrain:
    r = d['hours_transformed']
    hoursPerUser[u].append((g, r))
    hoursPerItem[g].append((u, r))

In [7]:
itemsPerUser = defaultdict(list)
usersPerItem = defaultdict(list)
for u,g,d in hoursTrain:
    itemsPerUser[u].append(g)
    usersPerItem[g].append(u)

In [8]:
userIDs, itemIDs = {}, {}
interactions = []

for u,g,d in allHours:
    r = d['hours_transformed']
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not g in itemIDs: itemIDs[g] = len(itemIDs)
    interactions.append((u,g,r))

## Part 1: Would-Play Prediction

In [9]:
# Generate a negative set
userSet = set()
gameSet = set()
playedSet = set()

for u,g,d in allHours:
    userSet.add(u)
    gameSet.add(g)
    playedSet.add((u, g))

lUserSet = list(userSet)
lGameSet = list(gameSet)

notPlayed = set()
for u,g,d in hoursValid:
    g = random.choice(lGameSet)
    while (u,g) in playedSet or (u,g) in notPlayed:
        g = random.choice(lGameSet)
    notPlayed.add((u,g))

playedValid = set()
for u,g,r in hoursValid:
    playedValid.add((u,g))

In [12]:
gameCount = defaultdict(int)
totalPlayed = 0

for u,g,_ in hoursTrain:
    gameCount[g] += 1
    totalPlayed += 1

mostPopular = [(gameCount[x], x) for x in gameCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPlayed/1.5: break

### Bayesian Personalized Ranking Model

In [13]:
items = list(gameSet)

In [14]:
class BPRbatch(tf.keras.Model):
    def __init__(self, K, lamb):
        super(BPRbatch, self).__init__()
        # Initialize variables
        self.betaI = tf.Variable(tf.random.normal([len(gameSet)], stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userSet), K], stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(gameSet), K], stddev=0.001))
        # Regularization coefficient
        self.lamb = lamb
    
    # Prediction for a single instance
    def predict(self, u, i):
        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p
    
    # Regularizer
    def reg(self):
        return self.lamb * (tf.nn.l2_loss(self.betaI) +\
                            tf.nn.l2_loss(self.gammaU) +\
                            tf.nn.l2_loss(self.gammaI))
    
    def score(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return x_ui
    
    def call(self, sampleU, sampleI, sampleJ):
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_ui - x_uj)))

In [15]:
# Leverage powerful Adam optimizer with learning rate 0.1
optimizer = tf.keras.optimizers.Adam(0.1)



In [16]:
# Initialize the model
modelBPR = BPRbatch(5, 0.00001)

In [17]:
def trainingStepBPR(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleJ = [],[],[]
        for _ in range(Nsamples):
            u,i,_ = random.choice(interactions) # positive sample
            j = random.choice(items) # negative sample
            while j in itemsPerUser[u]:
                j = random.choice(items)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleJ.append(itemIDs[j])
        
        loss = model(sampleU, sampleI, sampleJ)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for 
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [18]:
# Run 150 batches of gradient descent
for i in range(150):
    obj = trainingStepBPR(modelBPR, interactions)
    if (i % 10 == 9): print("Iteration " + str(i+1) + ", objective = " + str(obj))

Iteration 10, objective = 0.51650816
Iteration 20, objective = 0.48204634
Iteration 30, objective = 0.47219062
Iteration 40, objective = 0.46221066
Iteration 50, objective = 0.45698476
Iteration 60, objective = 0.45202968
Iteration 70, objective = 0.44692212
Iteration 80, objective = 0.4490565
Iteration 90, objective = 0.44585273
Iteration 100, objective = 0.4412405
Iteration 110, objective = 0.44074723
Iteration 120, objective = 0.44516915
Iteration 130, objective = 0.44702855
Iteration 140, objective = 0.4419574
Iteration 150, objective = 0.44199258


### Evaluation of the model

In [19]:
interactionsTrain = interactions[:165000]
interactionsTest = interactions[165000:]

In [20]:
interactionsTestPerUser = defaultdict(set)
itemSet = set()
for u,i,_ in interactionsTest:
    interactionsTestPerUser[u].add(i)
    itemSet.add(i)

In [21]:
def AUCu(u, N): # N samples per user
    win = 0
    if N > len(interactionsTestPerUser[u]):
        N = len(interactionsTestPerUser[u])
    positive = random.sample(interactionsTestPerUser[u],N)
    negative = random.sample(gameSet.difference(interactionsTestPerUser[u]),N)
    for i,j in zip(positive,negative):
        si = modelBPR.predict(userIDs[u], itemIDs[i]).numpy()
        sj = modelBPR.predict(userIDs[u], itemIDs[j]).numpy()
        if si > sj:
            win += 1
    return win/N

def AUC():
    av = []
    for u in interactionsTestPerUser:
        av.append(AUCu(u, 10))
    return sum(av) / len(av)

In [22]:
AUC()

since Python 3.9 and will be removed in a subsequent version.
  positive = random.sample(interactionsTestPerUser[u],N)
since Python 3.9 and will be removed in a subsequent version.
  negative = random.sample(gameSet.difference(interactionsTestPerUser[u]),N)


0.8362142940422045

### Output Predictions

In [47]:
# Get games to predict for each user
pred_user_items = defaultdict(list)
for l in open('pairs_Played.csv'):
    if l.startswith("userID"):
        continue
    u,g = l.strip().split(',')
    if u in userIDs and g in itemIDs:
        score = modelBPR.predict(userIDs[u], itemIDs[g]).numpy()
    # If game is popular, recommend anyway
    elif g in return1:
        score = 100
    # If pair is unseen and game is not popular, would not recommend
    else:
        score = -100
    pred_user_items[u].append((g, score))

In [48]:
# Sort the game scores for each user
for key in pred_user_items:
    pred_user_items[key].sort(key=lambda x:x[1])
    pred_user_items[key].reverse()

In [49]:
# Check the data structure
pred_user_items

defaultdict(list,
            {'u04836696': [('g18529610', 1.4146807),
              ('g41031307', 1.385211),
              ('g39176429', 0.68743676),
              ('g80808634', 0.35747463),
              ('g08102513', -0.8432864),
              ('g92711177', -1.994117)],
             'u32377855': [('g46446145', 3.3840246),
              ('g34193208', 2.8545163),
              ('g81608348', 2.7257898),
              ('g22244120', 2.3636565),
              ('g56621675', 2.2103205),
              ('g85900991', 1.902482),
              ('g57016851', 1.6314924),
              ('g95702441', 1.6171944),
              ('g55811676', 1.5622554),
              ('g22161219', 1.4474436),
              ('g02273341', 1.3950536),
              ('g70852781', 1.3670961),
              ('g78336897', 1.1389309),
              ('g10444367', 1.0962311),
              ('g23131507', 0.9809073),
              ('g02637258', 0.71457875),
              ('g16246767', 0.4027836),
              ('g12792583', 0.146

In [60]:
# Obtain positive and negative labels
positive_pairs = []
negative_pairs = []
for key in pred_user_items:
    for g,s in pred_user_items[key][:len(pred_user_items[key])//2]:
        positive_pairs.append((key,g))
    for g,s in pred_user_items[key][len(pred_user_items[key])//2:]:
        negative_pairs.append((key,g))

In [61]:
# Write to the output file.
predictions = open("predictions_Played.csv", 'w')
for l in open("pairs_Played.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,g = l.strip().split(',')
    if (u,g) in positive_pairs:
        pred = 1
    else:
        pred = 0
    _ = predictions.write(u + ',' + g + ',' + str(pred) + '\n')

predictions.close()

## Part 2: Time-Played Prediction

In [157]:
Hours = [d['hours_transformed'] for u,g,d in allHours]
globalAverage = sum(Hours) * 1.0 / len(Hours)

In [158]:
hoursPerUser, hoursPerItem,Rui = {}, {}, {}
for u,g,d in allHours:
    if u not in hoursPerUser:
        hoursPerUser[u] = [(g, d['hours_transformed'])]
    else:
        hoursPerUser[u].append((g, d['hours_transformed']))
    if g not in hoursPerItem:
        hoursPerItem[g] = [(u, d['hours_transformed'])]
    else:
        hoursPerItem[g].append((u, d['hours_transformed']))
    Rui[(u,g)] = d['hours_transformed']

In [159]:
betaU = {}
betaI = {}
for u in hoursPerUser:
    betaU[u] = 0

for g in hoursPerItem:
    betaI[g] = 0

In [160]:
alpha = globalAverage # Could initialize anywhere, this is a guess

In [161]:
def update_alpha():
    global alpha
    num = sum(Rui[(u,g)]- (betaU[u] + betaI[g]) for u,g,_ in allHours)
    denom = len(allHours)
    alpha = num / denom

In [162]:
def update_betaU(lamb):
    global alpha
    for user in hoursPerUser:
        num = sum(Rui[(user,g)]-(alpha+betaI[g]) for g,t in hoursPerUser[user])
        denom = lamb + len(hoursPerUser[user])
        betaU[user] = num / denom

In [163]:
def update_betaI(lamb):
    global alpha
    for item in hoursPerItem:
        num = sum(Rui[(u,item)]-(alpha+betaU[u]) for u,t in hoursPerItem[item])
        denom = lamb + len(hoursPerItem[item])
        betaI[item] = num / denom

In [164]:
def predict(user, item):
    global alpha
    if user not in hoursPerUser and item not in hoursPerItem:
        return alpha
    if user in hoursPerUser and item not in hoursPerItem:
        return alpha + betaU[user]
    if user not in hoursPerUser and item in hoursPerItem:
        return alpha + betaI[item]
    return alpha + betaU[user] + betaI[item]

In [165]:
def MSE():
    mse = sum((_['hours_transformed']-predict(u,i))**2 for u,i,_ in hoursValid) / len(hoursValid)
    return mse

In [166]:
def iterate(lamb, max_iteration=1000):
    mse = 0
    for iter in range(max_iteration):
        update_alpha()
        update_betaU(lamb)
        update_betaI(lamb)
        curr_MSE = MSE()
        if iter % 10 == 9:
            print(f"Current Iteration is {iter+1} | MSE: {curr_MSE}")
        if mse == 0 or curr_MSE <= mse:
            mse = curr_MSE
        else:
            print(f"End Iteration is {iter+1} | MSE: {curr_MSE}")
            break
    return curr_MSE

In [167]:
validMSE = iterate(lamb=5.05, max_iteration=400) # number of iterations: 350

Current Iteration is 10 | MSE: 2.750075381581338
Current Iteration is 20 | MSE: 2.7491796666696637
Current Iteration is 30 | MSE: 2.7488548824380286
Current Iteration is 40 | MSE: 2.7487263690588892
Current Iteration is 50 | MSE: 2.7486706753020087
Current Iteration is 60 | MSE: 2.7486446425530104
Current Iteration is 70 | MSE: 2.748631805667663
Current Iteration is 80 | MSE: 2.7486252583351
Current Iteration is 90 | MSE: 2.748621852027036
Current Iteration is 100 | MSE: 2.748620059973658
Current Iteration is 110 | MSE: 2.7486191113811174
Current Iteration is 120 | MSE: 2.748618607590196
Current Iteration is 130 | MSE: 2.7486183395522445
Current Iteration is 140 | MSE: 2.748618196808318
Current Iteration is 150 | MSE: 2.748618120751047
Current Iteration is 160 | MSE: 2.7486180802148827
Current Iteration is 170 | MSE: 2.748618058607241
Current Iteration is 180 | MSE: 2.7486180470884864
Current Iteration is 190 | MSE: 2.7486180409477066
Current Iteration is 200 | MSE: 2.7486180376739386


In [114]:
# validMSE = iterate(lamb=5.0, max_iteration=300) # max iterations: 350

TypeError: iterate() missing 1 required positional argument: 'lamb2'

In [132]:
# validMSE = iterate(lamb=5.06)

In [21]:
validMSE

2.9906260313858097

### Code block to tune lambda value

In [34]:
def iterate(lamb, max_iteration=50):
    mse = 0
    for iter in range(max_iteration):
        update_alpha()
        update_betaU(lamb)
        update_betaI(lamb)
        curr_mse = MSE()
        if mse == 0 or curr_mse <= mse:
            mse = curr_mse
        else:
            print(f"Current Lambda is {lamb} | MSE: {curr_mse}")
            break
        if iter == 29:
            print(f"Current Lambda is {lamb} | MSE: {curr_mse}")
    return mse

In [None]:
# Better lambda...
bestValidMSE = None
bestLamb = 0
for lamb in np.arange(4.9, 5.11, 0.01):
    validMSE = iterate(lamb, max_iteration=30)
    if bestValidMSE == None or validMSE < bestValidMSE:
        bestValidMSE = validMSE
        bestLamb = lamb

In [None]:
bestValidMSE, bestLamb # Fine-tune lambda

### Output prediction

In [168]:
predictions = open("predictions_Hours.csv", 'w')
for l in open("pairs_Hours.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,g = l.strip().split(',')
    
    if u in betaU and g in betaI:
        predictions.write(u + ',' + g + ',' + str(alpha + betaU[u] + betaI[g]) + '\n')
    else:
        predictions.write(u + ',' + g + ',' + str(0) + '\n')

predictions.close()