In [1]:
import gzip
from collections import defaultdict
from sklearn import svm
import numpy
import random
from sklearn import linear_model
import statistics

In [2]:
def readGz(path):
    """Yield one evaluated record per line of a gzip-compressed text file.

    Parameters
    ----------
    path : str or path-like
        Location of the gzip file; it is opened in text mode.

    Yields
    ------
    object
        The value obtained by evaluating each line as a Python expression.

    Notes
    -----
    SECURITY: every line is passed to ``eval``, which executes arbitrary
    code. Only use this on trusted files. If every line is a plain literal,
    ``ast.literal_eval`` would be a safer parser.
    """
    # The context manager closes the handle once the generator is exhausted
    # or closed, instead of leaving that to garbage collection.
    with gzip.open(path, 'rt') as f:
        for l in f:
            yield eval(l)

In [3]:
def readJSON(path):
    """Yield ``(userID, gameID, record)`` for every line after the first.

    Parameters
    ----------
    path : str or path-like
        Location of a gzip-compressed text file; opened in text mode.

    Yields
    ------
    tuple
        ``(d['userID'], d['gameID'], d)`` where ``d`` is the evaluated line.

    Raises
    ------
    KeyError
        If an evaluated record lacks ``'userID'`` or ``'gameID'``.

    Notes
    -----
    SECURITY: every line is passed to ``eval``, which executes arbitrary
    code. Only use this on trusted files.
    """
    # The context manager closes the handle once the generator is exhausted
    # or closed, instead of leaving that to garbage collection.
    with gzip.open(path, 'rt') as f:
        # The first line is read and discarded -- presumably a header row;
        # TODO confirm it is not a data record.
        f.readline()
        for l in f:
            d = eval(l)
            yield d['userID'], d['gameID'], d

In [4]:
# Materialise every (userID, gameID, record) triple from the training file.
allHours = list(readJSON("train.json.gz"))

In [5]:
# Shallow-copy so that later in-place operations on hoursTrain (such as the
# random.shuffle in the hours-prediction section) cannot reorder allHours;
# with a plain alias, re-running this cell after a shuffle would produce a
# different hoursValid slice.
hoursTrain = list(allHours)
# NOTE(review): every row of hoursValid is also present in hoursTrain, so any
# metric computed on hoursValid is in-sample rather than held-out.
hoursValid = allHours[165000:]

# Per-user and per-game views of the training rows.
hoursPerUser = defaultdict(list)   # user -> [(game, hours_transformed), ...]
hoursPerItem = defaultdict(list)   # game -> [(user, hours_transformed), ...]
itemsPerUser = defaultdict(list)   # user -> [game, ...]
for u, g, d in hoursTrain:
    r = d['hours_transformed']
    hoursPerUser[u].append((g, r))
    hoursPerItem[g].append((u, r))
    itemsPerUser[u].append(g)

In [6]:
##################################################
# Play prediction                                #
##################################################

In [7]:
# !pip install tensorflow

In [8]:
# Dense integer ids for users and items, assigned in order of first
# appearance in hoursTrain, plus the list of positive interactions.
userIDs = {}
itemIDs = {}
interactions = []

for u, i, _record in hoursTrain:
    # setdefault only inserts on first sight; len() is evaluated before the
    # insertion, so each new key receives the next free index.
    userIDs.setdefault(u, len(userIDs))
    itemIDs.setdefault(i, len(itemIDs))

    # Every training row is stored as a positive example with label 1.
    interactions.append((u, i, 1))

# Item keys in insertion order; used as the pool for negative sampling.
items = list(itemIDs.keys())

In [9]:
import tensorflow as tf
class BPRbatch(tf.keras.Model):
    """Pairwise-ranking (BPR-style) model: item bias plus user/item factors.

    The score of a (user, item) pair is
    ``betaI[i] + dot(gammaU[u], gammaI[i])``. ``call`` returns the mean
    negative log-sigmoid of the score gap between a positive item and a
    negative item for the same user.
    """

    def __init__(self, K, lamb, precomputed_gammaU, precomputed_gammaI):
        """Create the trainable variables.

        Parameters
        ----------
        K :
            Accepted for interface compatibility but not read by this class;
            the factor width is whatever the precomputed arrays carry.
        lamb :
            Coefficient applied by ``reg``.
        precomputed_gammaU, precomputed_gammaI :
            Initial values for the user / item factor variables
            (presumably shaped (n_users, K) and (n_items, K) -- confirm
            against the caller).
        """
        super(BPRbatch, self).__init__()
        # The item-bias length is taken from the module-level itemIDs mapping.
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaU = tf.Variable(precomputed_gammaU)
        self.gammaI = tf.Variable(precomputed_gammaI)
        # Regularization coefficient
        self.lamb = lamb

    def predict(self, u, i):
        """Score a single (user index, item index) pair."""
        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    def reg(self):
        """Return ``lamb`` times the summed ``tf.nn.l2_loss`` of all variables."""
        return self.lamb * (tf.nn.l2_loss(self.betaI) +\
                            tf.nn.l2_loss(self.gammaU) +\
                            tf.nn.l2_loss(self.gammaI))

    def score(self , sampleU , sampleI):
        """Score a batch of (user index, item index) pairs element-wise.

        ``sampleU`` and ``sampleI`` are converted to int32 tensors and used
        to gather rows from the bias and factor variables.
        """
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        # Row-wise dot product: multiply element-wise, then sum over axis 1.
        x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return x_ui

    def call(self, sampleU, sampleI, sampleJ):
        """Return the batch loss for positives ``sampleI`` vs negatives ``sampleJ``."""
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        # log_sigmoid(x) equals log(sigmoid(x)) mathematically but is computed
        # in a numerically stable way: the two-step form yields log(0) = -inf
        # once sigmoid underflows for a large negative score gap.
        return -tf.reduce_mean(tf.math.log_sigmoid(x_ui - x_uj))




In [10]:
def trainingStepBPR(model, interactions, optimizer, Nsamples=50000):
    """Run one optimisation step on a freshly sampled batch of triples.

    Parameters
    ----------
    model :
        Callable as ``model(sampleU, sampleI, sampleJ)`` returning a loss
        tensor, and exposing ``reg()`` and ``trainable_variables``.
    interactions :
        Sequence of ``(user, item, label)`` tuples to draw positives from.
    optimizer :
        Object exposing ``apply_gradients``.
    Nsamples : int, optional
        Number of (user, positive, negative) triples per step. Defaults to
        50000, the value that was previously hard-coded.

    Returns
    -------
    The value of ``loss.numpy()`` for this step (loss plus regulariser).

    Notes
    -----
    Reads the module-level ``items``, ``itemsPerUser``, ``userIDs`` and
    ``itemIDs``. Negative sampling retries until it finds an item absent
    from ``itemsPerUser[u]``; it would not terminate for a user who has
    interacted with every item.
    """
    # Sampling is plain Python, so it is done before opening the tape; only
    # the differentiable forward pass needs to be recorded.
    sampleU, sampleI, sampleJ = [], [], []
    for _ in range(Nsamples):
        u, i, _label = random.choice(interactions) # positive sample
        j = random.choice(items) # negative sample
        while j in itemsPerUser[u]:
            j = random.choice(items)
        sampleU.append(userIDs[u])
        sampleI.append(itemIDs[i])
        sampleJ.append(itemIDs[j])

    with tf.GradientTape() as tape:
        loss = model(sampleU,sampleI,sampleJ)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    # Variables that received no gradient are skipped.
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [11]:
# Train num_models BPR models one after another. Each model starts from the
# latent factors of the one trained before it (the first from small random
# values) and stops early once the step loss has failed to improve
# `patience` times in a row.
num_models = 5
modelBPRList = []
K = 2
precomputed_gammaU = tf.random.normal([len(userIDs),K],stddev=0.001)
precomputed_gammaI = tf.random.normal([len(itemIDs),K],stddev=0.001)
for _ in range(num_models):
    best_loss = float('inf')
    patience = 3  # For early stopping
    count = 0  # Steps without improvement
    optimizer = tf.keras.optimizers.Adam(0.01)
    modelBPR = BPRbatch(K, 0.00001, precomputed_gammaU, precomputed_gammaI)

    for i in range(100):
        obj = trainingStepBPR(modelBPR, interactions, optimizer)
        # Report every tenth step.
        if (i + 1) % 10 == 0:
            print("iteration " + str(i+1) + ", objective = " + str(obj))

        improved = obj < best_loss
        if improved:
            best_loss = obj
        count = 0 if improved else count + 1

        if count == patience:
            print("Early stopping triggered!")
            break

    modelBPRList.append(modelBPR)
    # Hand the learned factors to the next model in the chain.
    precomputed_gammaU = modelBPR.gammaU.numpy()
    precomputed_gammaI = modelBPR.gammaI.numpy()


iteration 10, objective = 0.6613888
iteration 20, objective = 0.6307914
iteration 30, objective = 0.60287046
iteration 40, objective = 0.57360584
iteration 50, objective = 0.54614544
iteration 60, objective = 0.5200356
iteration 70, objective = 0.503294
Early stopping triggered!
iteration 10, objective = 0.53230494
iteration 20, objective = 0.5112059
iteration 30, objective = 0.50396127
iteration 40, objective = 0.4960288
Early stopping triggered!
iteration 10, objective = 0.50527835
iteration 20, objective = 0.49837834
iteration 30, objective = 0.4979308
Early stopping triggered!
Early stopping triggered!
Early stopping triggered!


In [12]:
# Score every (user, game) pair of the test file with each trained BPR model
# and keep the mean score per pair in `values` as (user, game, mean_score).
values = []
# Deliberately left open: only the header is written here; a later cell
# appends the prediction rows to this handle and closes it.
predictions = open("predictions_Played.csv", 'w')
with open("pairs_Played.csv") as pairs_file:
    for l in pairs_file:
        if l.startswith("userID"):
            predictions.write(l)
            continue
        u,g = l.strip().split(',')
        uid = userIDs.get(u)
        gid = itemIDs.get(g)
        if uid is None or gid is None:
            # No embedding exists for a user/game unseen in training. Skip
            # the pair explicitly: passing None on to predict() would not
            # select a row and could fail on mismatched shapes. Skipped
            # pairs are absent from `values`, so game_position() reports -1
            # for them.
            continue
        BPRvalues = [m.predict(uid, gid).numpy() for m in modelBPRList]
        mean_score = statistics.mean(BPRvalues)
        values.append((u, g, mean_score))

In [13]:
# (user, game) -> averaged score
data_dict = {(user, game): score for user, game, score in values}
# Rows ordered by user, then by descending score within each user.
sorted_values = sorted(values, key=lambda row: (row[0], -row[2]))

# user -> games, in the order they occur in sorted_values
user_game_indices = defaultdict(list)

for user, game, _score in sorted_values:
    user_game_indices[user].append(game)
    
def game_position(user, game):
    """Say whether `game` falls in the top half of `user`'s scored games.

    The user's games (from ``user_game_indices``) are ranked by descending
    ``data_dict[(user, game)]``; the first ``len(games) // 2`` positions
    count as the positive half.

    Returns
    -------
    int
        1 if the game ranks inside the top half, 0 if it ranks below it,
        and -1 when the user has no entry or the game is not among the
        user's games.
    """
    owned = user_game_indices.get(user)
    if owned is None:
        return -1

    by_score = sorted(owned, key=lambda x: data_dict[(user, x)], reverse=True)
    try:
        rank = by_score.index(game)
    except ValueError:
        # game is not one of this user's scored games
        return -1

    return int(rank < len(owned) // 2)


In [14]:
# Append one "user,game,label" row per test pair to the predictions handle
# opened earlier (its header row is already written), then close it.
# NOTE(review): game_position() returns -1 for pairs it has no score for, and
# that -1 is written out as the label -- confirm this is acceptable.
try:
    # `with` closes the pairs file; the original never closed it.
    with open("pairs_Played.csv") as pairs_file:
        for l in pairs_file:
            if l.startswith("userID"):
                continue
            u,g = l.strip().split(',')
            pred = game_position(u, g)

            _ = predictions.write(u + ',' + g + ',' + str(pred) + '\n')
finally:
    # Close the output even if a row fails, so buffered rows are flushed.
    predictions.close()


In [15]:
##################################################
# Hours played prediction                        #
##################################################

In [16]:
# Transformed hours of every training row, and their arithmetic mean.
trainHours = [record['hours_transformed'] for _user, _game, record in hoursTrain]
globalAverage = float(sum(trainHours)) / len(trainHours)

In [17]:
# Mean squared error of predicting globalAverage for every hoursValid row.
validMSE = 0
for _user, _game, record in hoursValid:
    validMSE += (record['hours_transformed'] - globalAverage) ** 2
validMSE = validMSE / len(hoursValid)

print("Validation MSE (average only) = " + str(validMSE))

Validation MSE (average only) = 5.315913624424781


In [18]:
import tensorflow as tf
# One bias per training user and per training game, all starting at 0.
betaU = dict.fromkeys(hoursPerUser, 0)
betaI = dict.fromkeys(hoursPerItem, 0)
# Created empty; nothing in this cell populates them.
gammaU = {}
gammaI = {}

In [19]:
# Module-level offset of the bias model, set to the mean training target.
# Note: iterate() assigns to a local variable of the same name, so calling
# iterate() does not change this module-level value.
alpha = globalAverage # Could initialize anywhere, this is a guess

In [20]:
def iterate(lamb, betaU, betaI, hoursTrain, hoursPerUser, hoursPerItem):
    """Run one pass of closed-form updates for ``alpha + betaU[u] + betaI[g]``.

    The offset is recomputed first, then every user bias, then every game
    bias. ``betaU`` and ``betaI`` are updated in place. The offset is kept
    in a local variable and is not returned.

    Parameters
    ----------
    lamb : number
        Added to the interaction count in each bias denominator and used to
        weight the penalty in the returned objective.
    betaU, betaI : dict
        Current user / game biases; mutated.
    hoursTrain : sequence of (user, game, record)
        ``record['hours_transformed']`` is the target.
    hoursPerUser : mapping user -> [(game, target), ...]
    hoursPerItem : mapping game -> [(user, target), ...]

    Returns
    -------
    (mse, objective)
        Training MSE after the updates, and ``mse + lamb * reg_term`` where
        ``reg_term`` is the sum of squared biases plus ``alpha`` times the
        sum of absolute biases.
    """
    n_rows = len(hoursTrain)

    # Offset: mean residual once both biases are subtracted.
    residual_total = sum(d['hours_transformed'] - (betaU[u] + betaI[g]) for u, g, d in hoursTrain)
    alpha = residual_total / n_rows

    # User biases, using the fresh offset and the current game biases.
    for user, played in hoursPerUser.items():
        numerator = sum(r - (alpha + betaI[g]) for g, r in played)
        betaU[user] = numerator / (lamb + len(played))

    # Game biases, using the fresh offset and the fresh user biases.
    for game, players in hoursPerItem.items():
        numerator = sum(r - (alpha + betaU[u]) for u, r in players)
        betaI[game] = numerator / (lamb + len(players))

    # Training error with the updated parameters.
    squared_error = sum((d['hours_transformed'] - (alpha + betaU[u] + betaI[g]))**2 for u, g, d in hoursTrain)
    mse = squared_error / n_rows

    # Penalty: squared biases plus an alpha-weighted sum of absolute biases.
    sq_user = sum(b**2 for b in betaU.values())
    sq_item = sum(b**2 for b in betaI.values())
    abs_user = sum(abs(b) for b in betaU.values())
    abs_item = sum(abs(b) for b in betaI.values())
    reg_term = sq_user + sq_item + alpha * (abs_user + abs_item)
    return mse, mse + lamb * reg_term

In [21]:
# Two initial passes: the first result is the "previous" (mse, objective),
# the second the "new" one that the later convergence loop compares against.
mse, objective = iterate(
    lamb=0.8565, betaU=betaU, betaI=betaI, hoursTrain=hoursTrain,
    hoursPerUser=hoursPerUser, hoursPerItem=hoursPerItem,
)
newMSE, newObjective = iterate(
    lamb=0.8565, betaU=betaU, betaI=betaI, hoursTrain=hoursTrain,
    hoursPerUser=hoursPerUser, hoursPerItem=hoursPerItem,
)
iterations = 2

In [22]:
# Better lambda...
# Repeats the iterate() convergence loop num_bags times and stores a snapshot
# of the shared betaU / betaI dictionaries after each repetition.
num_bags = 5

bag_betaU = [{} for _ in range(num_bags)]
bag_betaI = [{} for _ in range(num_bags)]

# Perform bagging
# NOTE(review): no resampling happens below -- every pass uses the complete
# hoursTrain list, only its order changes.
for bag_index in range(num_bags):
    # Shuffle the training data for each bag
    # (in place: hoursTrain itself is reordered)
    random.shuffle(hoursTrain)
    
    # Initialize betaU and betaI for this bag
    # NOTE(review): these zero-filled dicts are never passed to iterate();
    # they are replaced by the copies made at the end of the loop body.
    for u in hoursPerUser:
        bag_betaU[bag_index][u] = 0
    
    for g in hoursPerItem:
        bag_betaI[bag_index][g] = 0
    iterations = 1
    best_MSE = float('inf')
    patience = 3  # For early stopping
    count = 0  # Epochs without improvement  

    # mse / objective / newMSE / newObjective are not reset per bag: the
    # condition starts from the values left by the previous bag (for the
    # first bag, by the two iterate() calls made before this cell).
    while iterations < 10 and objective - newObjective > 0.0001 and mse > newMSE:
        mse, objective = newMSE, newObjective
        # Updates the shared module-level betaU / betaI, so each bag
        # continues from where the previous one stopped.
        newMSE, newObjective = iterate(0.8565, betaU, betaI, hoursTrain, hoursPerUser, hoursPerItem)
        iterations += 1

        
        print(iterations)
        print("Objective after " + str(iterations) + " iterations = " + str(newObjective))
        print("MSE after " + str(iterations) + " iterations = " + str(newMSE))

        if newMSE < best_MSE:
            best_MSE = newMSE
            count = 0
        else:
            count += 1
        if count == patience:
            print("Early stopping triggered!")
            break
    # Snapshot the shared biases as this bag's result.
    bag_betaU[bag_index] = betaU.copy()
    bag_betaI[bag_index] = betaI.copy()

2
Objective after 2 iterations = 24745.738259723013
MSE after 2 iterations = 2.7618841657875293
3
Objective after 3 iterations = 24729.203711016482
MSE after 3 iterations = 2.761111074599396
4
Objective after 4 iterations = 24674.911210656603
MSE after 4 iterations = 2.7610075516520167
5
Objective after 5 iterations = 24607.91581118694
MSE after 5 iterations = 2.7609847396528155
6
Objective after 6 iterations = 24538.135403522236
MSE after 6 iterations = 2.760976513384714
7
Objective after 7 iterations = 24468.46845502204
MSE after 7 iterations = 2.7609717040275252
8
Objective after 8 iterations = 24399.61161357158
MSE after 8 iterations = 2.76096781256462
9
Objective after 9 iterations = 24331.734400126985
MSE after 9 iterations = 2.7609642091838382
10
Objective after 10 iterations = 24264.948430985438
MSE after 10 iterations = 2.7609607282087105
2
Objective after 2 iterations = 24199.32445794654
MSE after 2 iterations = 2.760957324896792
3
Objective after 3 iterations = 24134.7377077

In [23]:
# Average each bias over the stored bag snapshots.
mean_betaU = {}
mean_betaI = {}

for u in hoursPerUser:
    mean_betaU[u] = sum(bag_betaU[bag_index].get(u, 0) for bag_index in range(num_bags)) / num_bags

for g in hoursPerItem:
    mean_betaI[g] = sum(bag_betaI[bag_index].get(g, 0) for bag_index in range(num_bags)) / num_bags

# iterate() keeps its fitted offset in a local variable, so the module-level
# `alpha` is still the plain global average it was initialised with and does
# not match the biases that were fitted around a different offset. Re-derive
# the offset for the averaged biases with the same closed-form update that
# iterate() applies (mean training residual after removing both biases).
fitted_alpha = sum(d['hours_transformed'] - (mean_betaU[u] + mean_betaI[g])
                   for u, g, d in hoursTrain) / len(hoursTrain)

# Both files are closed by the context manager, including on error.
with open("predictions_Hours.csv", 'w') as predictions, open("pairs_Hours.csv") as pairs_file:
    for l in pairs_file:
        if l.startswith("userID"):
            predictions.write(l)
            continue
        u, g = l.strip().split(',')
        # Users / games unseen in training fall back to a zero bias.
        bu = mean_betaU.get(u, 0)
        bi = mean_betaI.get(g, 0)
        _ = predictions.write(u + ',' + g + ',' + str(fitted_alpha + bu + bi) + '\n')