In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
from sklearn import linear_model
import tensorflow as tf
from surprise.model_selection import train_test_split

In [2]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [3]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [4]:
def readJSON(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        d = eval(l)
        u = d['userID']
        g = d['gameID']
        yield u,g,d

In [5]:
answers = {}

In [6]:
# Some data structures that will be useful

In [7]:
allHours = []
for l in readJSON("train.json.gz"):
    allHours.append(l)

In [8]:
hoursTrain = allHours[:165000]
hoursValid = allHours[165000:]

In [9]:
##################################################
# Play prediction                                #
##################################################

In [10]:
hoursTrain[0]

('u70666506',
 'g49368897',
 {'userID': 'u70666506',
  'early_access': False,
  'hours': 63.5,
  'hours_transformed': 6.011227255423254,
  'found_funny': 1,
  'text': 'If you want to sit in queue for 10-20min and have 140 ping then this game is perfect for you :)',
  'gameID': 'g49368897',
  'user_id': '76561198030408772',
  'date': '2017-05-20'})

In [11]:
# Any other preprocessing...
users = set() # maintain a set of unique users
games = set() # maintain a set of unique games
pos_pairs = []
for l in readGz("train.json.gz"):
    u,g = l['userID'], l['gameID']
    users.add(u)
    games.add(g)
    pos_pairs += [(u, g)]

In [12]:
train_data = pos_pairs[:165000]
valid_data = pos_pairs[165000:] # because pos_pairs has length 175000

In [13]:
len(users), len(games)

(6710, 2437)

In [14]:
# Randomly sample games that weren't played
neg_pairs = set()
len_valid = len(hoursValid)
while True:
    user = random.sample(users, 1)[0]
    game = random.sample(games, 1)[0]
    if (user, game) not in pos_pairs:
        neg_pairs.add((user, game))
    if len(neg_pairs) == len_valid:
        break

since Python 3.9 and will be removed in a subsequent version.
  user = random.sample(users, 1)[0]
since Python 3.9 and will be removed in a subsequent version.
  game = random.sample(games, 1)[0]


In [15]:
len(neg_pairs), len(train_data)

(9999, 165000)

In [16]:
### Question 1

In [17]:
# Evaluate baseline strategy
gameCount = defaultdict(int)
totalPlayed = 0

for user,game in train_data:
    gameCount[game] += 1
    totalPlayed += 1

mostPopular = [(gameCount[x], x) for x in gameCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPlayed/2: break

predictions = []
for user,game in valid_data:
    if game in return1:
        predictions += [1]
    else:
        predictions += [0]
    
for user,game in neg_pairs:
    if game in return1:
        predictions += [1]
    else:
        predictions += [0]

sum_pred = sum([predictions[i]==1 for i in range(9999)]) + \
    sum([predictions[i] == 0 for i in range(9999, len(predictions))])
acc = sum_pred / (2*len(valid_data))
acc

0.677

In [18]:
answers['Q1'] = acc
answers['Q1']

0.677

In [19]:
assertFloat(answers['Q1'])

In [20]:
### Question 2

In [21]:
# Improved strategy

In [22]:
# Evaluate baseline strategy
gameCount = defaultdict(int)
totalPlayed = 0

for user,game in train_data:
    gameCount[game] += 1
    totalPlayed += 1

mostPopular = [(gameCount[x], x) for x in gameCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPlayed/1.5: break # Choose 67% percentile instead

predictions = []
for user,game in valid_data:
    if game in return1:
        predictions += [1]
    else:
        predictions += [0]
    
for user,game in neg_pairs:
    if game in return1:
        predictions += [1]
    else:
        predictions += [0]

sum_pred_q2 = sum([predictions[i]==1 for i in range(9999)]) + \
    sum([predictions[i] == 0 for i in range(9999, len(predictions))])
acc_q2 = sum_pred_q2 / (2*len(valid_data))
acc_q2

0.6969

In [23]:
answers['Q2'] = [1/1.5, acc_q2]
answers['Q2']

[0.6666666666666666, 0.6969]

In [24]:
assertFloatList(answers['Q2'], 2)

In [25]:
### Question 3/4

In [26]:
userPerGame = defaultdict(set) # Maps a game to the users who played it
gamePerUser = defaultdict(set) # Maps a user to the game that they played
hoursDict = {} # To retrieve an hour for a specific user/game pair

for d in hoursTrain:
    user,game = d[0], d[1]
    userPerGame[game].add(user)
    gamePerUser[user].add(game)
    hoursDict[(user, game)] = d[2]['hours']

In [27]:
userAverages = {}
gameAverages = {}

for u in gamePerUser:
    rs = [hoursDict[(u,g)] for g in gamePerUser[u]]
    userAverages[u] = sum(rs) / len(rs)

for g in userPerGame:
    rs = [hoursDict[(u,g)] for u in userPerGame[g]]
    gameAverages[g] = sum(rs) / (len(rs))

In [28]:
reviewsPerUser = defaultdict(list)
reviewsPerGame = defaultdict(list)

for d in hoursTrain:
    user,game = d[0], d[1]
    reviewsPerUser[user].append(d)
    reviewsPerGame[game].append(d)

In [29]:
reviewsPerUser["u05450000"][0]

('u05450000',
 'g52077802',
 {'hours': 16.8,
  'gameID': 'g52077802',
  'hours_transformed': 4.153805336079036,
  'early_access': False,
  'date': '2011-05-28',
  'text': 'A poorly optimized and poorly realized piece of♥♥♥♥♥♥ Stay clear of this and all future Splash Damage products.',
  'userID': 'u05450000'})

In [30]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

def predictLabel(user, game, threshold):
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d[1]
        if i2 == game: continue
        similarities.append(Jaccard(userPerGame[game], userPerGame[i2]))
    return 1 if max(similarities) > threshold else 0

In [31]:
# Evaluate baseline strategy
predictions = []
for user,game in valid_data:
    predictions.append(predictLabel(user, game, 0.03))
# print(len(predictions))
for user,game in neg_pairs:
    predictions.append(predictLabel(user, game, 0.03))
# print(len(predictions))

sum_pred_q3 = sum([predictions[i]==1 for i in range(10000)]) + \
    sum([predictions[i] == 0 for i in range(10000, len(predictions))])
acc_q3 = sum_pred_q3 / (2*len(valid_data))
acc_q3

0.70315

In [32]:
# def threshold_and_popularity(threshold_popularity=1/1.5, threshold_jaccard=0.03):
#     gameCount = defaultdict(int)
#     totalPlayed = 0
    
#     for user,game,_ in readJSON("train.json.gz"):
#         gameCount[game] += 1
#         totalPlayed += 1
        
#     mostPopular = [(gameCount[x], x) for x in gameCount]
#     mostPopular.sort()
#     mostPopular.reverse()
    
#     return1 = set()
#     count = 0
#     for ic, i in mostPopular:
#         count += ic
#         return1.add(i)
#         if count > threshold_popularity: break
    
#     correct = 0
#     for user, game in valid_data:
#         similarities = []
#         for d in reviewsPerUser[user]:
#             i2 = d[1]
#             if i2 == game: continue
#             similarities.append(Jaccard(userPerGame[game], userPerGame[i2]))
            
#         if max(similarities) > threshold_jaccard and game in return1:
#             correct += (game in gamePerUser[user]) # recommend in this case
#         else:
#             correct += (game not in gamePerUser[user]) # not recommend in this case
        
#     return correct/len(valid_data)

In [34]:
answers['Q3'] = acc_q3
answers['Q4'] = sum_pred_q3 / (2*len(valid_data))

In [35]:
assertFloat(answers['Q3'])
assertFloat(answers['Q4'])

In [36]:
threshold_popularity = 1/1.5
threshold_jaccard = 0.003

gameCount = defaultdict(int)
totalPlayed = 0

for user,game,_ in readJSON("train.json.gz"):
    gameCount[game] += 1
    totalPlayed += 1
    
mostPopular = [(gameCount[x], x) for x in gameCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPlayed*threshold_popularity: break

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > threshold_popularity: break

predictions = open("HWpredictions_Played.csv", 'w')
for l in open("pairs_Played.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,g = l.strip().split(',')
    userPlayed = gamePerUser[u]
    similarities = []
    # Logic...
    for d in reviewsPerUser[user]:
        i2 = d[1]
        if i2 == game: continue
        similarities.append(Jaccard(userPerGame[game], userPerGame[i2]))

    if max(similarities) > threshold_jaccard and game in return1:
        predictions.write(u + ',' + g + ',' + "1" + '\n')
    else:
        predictions.write(u + ',' + g + ',' + "0" + '\n')

predictions.close()

In [37]:
answers['Q5'] = "I confirm that I have uploaded an assignment submission to gradescope"

In [38]:
##################################################
# Hours played prediction                        #
##################################################

In [39]:
### Question 6

In [40]:
# I first tried the TF library, but it's not working...

In [41]:
# userIDs = {}
# itemIDs = {}
# interactions = []

# for d in allHours:
#     u = d[0]
#     i = d[1]
#     r = d[2]['hours_transformed']
#     if not u in userIDs: userIDs[u] = len(userIDs)
#     if not i in itemIDs: itemIDs[i] = len(itemIDs)
#     interactions.append((u,i,r))

In [42]:
# random.shuffle(interactions)
# len(interactions)

In [43]:
# nTrain = 165000
# nTest = len(interactions) - nTrain
# interactionsTrain = interactions[:nTrain]
# interactionsTest = interactions[nTrain:]

In [44]:
# itemsPerUser = defaultdict(list)
# usersPerItem = defaultdict(list)
# for u,i,r in interactionsTrain:
#     itemsPerUser[u].append(i)
#     usersPerItem[i].append(u)

In [45]:
# mu = sum([r for _,_,r in interactionsTrain] + [r for _,_,r in interactionsTest])*1.0 / len(allHours)
# mu

In [46]:
# optimizer = tf.keras.optimizers.Adam(0.1)

In [47]:
# class LatentFactorModel(tf.keras.Model):
#     def __init__(self, mu, lamb):
#         super(LatentFactorModel, self).__init__()
#         # Initialize to average
#         self.alpha = tf.Variable(mu)
#         # Initialize to small random values
#         self.betaU = tf.Variable(tf.zeros(len(userIDs)))
#         self.betaI = tf.Variable(tf.zeros(len(itemIDs)))
#         # self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
#         # self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
#         self.lamb = lamb

#     # Prediction for a single instance (useful for evaluation)
#     def predict(self, u, i):
#         p = self.alpha + self.betaU[u] + self.betaI[i] 
#         # + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
#         return p

#     # Regularizer
#     def reg(self):
#         return self.lamb * (tf.reduce_sum(self.betaU**2) +\
#                             tf.reduce_sum(self.betaI**2))
#                         # + tf.reduce_sum(self.gammaU**2) +\
#                         #     tf.reduce_sum(self.gammaI**2))
    
#     # Prediction for a sample of instances
#     def predictSample(self, sampleU, sampleI):
#         u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
#         i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
#         beta_u = tf.nn.embedding_lookup(self.betaU, u)
#         beta_i = tf.nn.embedding_lookup(self.betaI, i)
#         # gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
#         # gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
#         pred = self.alpha + beta_u + beta_i
#             #    tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
#         return pred
    
#     # Loss
#     def call(self, sampleU, sampleI, sampleR):
#         pred = self.predictSample(sampleU, sampleI)
#         r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
#         return tf.nn.l2_loss(pred - r) / len(sampleR)

In [48]:
# modelLFM = LatentFactorModel(mu, 1) # with lambda equal to 1

In [49]:
# modelLFM.trainable_variables[1]

In [50]:
# def trainingStep(model, interactions):
#     Nsamples = 50000
#     with tf.GradientTape() as tape:
#         sampleU, sampleI, sampleR = [], [], []
#         for _ in range(Nsamples):
#             u,i,r = random.choice(interactions)
#             sampleU.append(userIDs[u])
#             sampleI.append(itemIDs[i])
#             sampleR.append(r)

#         loss = model(sampleU,sampleI,sampleR)
#         loss += model.reg()
#     gradients = tape.gradient(loss, model.trainable_variables)
#     optimizer.apply_gradients((grad, var) for
#                               (grad, var) in zip(gradients, model.trainable_variables)
#                               if grad is not None)
#     return loss.numpy()

In [51]:
# for i in range(100):
#     obj = trainingStep(modelLFM, interactionsTrain)
#     if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))

In [52]:
# predictions = []
# for u,g,r in interactionsTest:
#     predict = modelLFM.predict(userIDs[u], itemIDs[g]).numpy()
#     predictions.append(predict)
# len(predictions)

In [53]:
# def MSE(preds, labels):
#     diff = [(x-y)**2 for x,y in zip(preds, labels)]
#     return sum(diff) / len(diff)

In [54]:
# validlabels = [r for _,_,r in interactionsTest]
# validMSE = MSE(predictions,validlabels)
# validMSE

In [55]:
trainHours = [r[2]['hours_transformed'] for r in hoursTrain]
globalAverage = sum(trainHours) * 1.0 / len(trainHours)

In [56]:
hoursPerUser, hoursPerItem,Rui = {}, {}, {}
for u,g,d in hoursTrain:
    if u not in hoursPerUser:
        hoursPerUser[u] = [(g, d['hours_transformed'])]
    else:
        hoursPerUser[u].append((g, d['hours_transformed']))
    if g not in hoursPerItem:
        hoursPerItem[g] = [(u, d['hours_transformed'])]
    else:
        hoursPerItem[g].append((u, d['hours_transformed']))
    Rui[(u,g)] = d['hours_transformed']

In [57]:
len(hoursPerItem)

2437

In [58]:
betaU = {}
betaI = {}
for u in hoursPerUser:
    betaU[u] = 0

for g in hoursPerItem:
    betaI[g] = 0

In [59]:
alpha = globalAverage # Could initialize anywhere, this is a guess

In [60]:
def update_alpha():
    global alpha
    num = sum(Rui[(u,g)]- (betaU[u] + betaI[g]) for u,g,_ in hoursTrain)
    denom = len(hoursTrain)
    alpha = num / denom

In [61]:
def update_betaU(lamb):
    global alpha
    for user in hoursPerUser:
        num = sum(Rui[(user,g)]-(alpha+betaI[g]) for g,t in hoursPerUser[user])
        denom = lamb + len(hoursPerUser[user])
        betaU[user] = num / denom

In [62]:
def update_betaI(lamb):
    global alpha
    for item in hoursPerItem:
        num = sum(Rui[(u,item)]-(alpha+betaU[u]) for u,t in hoursPerItem[item])
        denom = lamb + len(hoursPerItem[item])
        betaI[item] = num / denom

In [63]:
def predict(user, item):
    global alpha
    if user not in hoursPerUser and item not in hoursPerItem:
        return alpha
    if user in hoursPerUser and item not in hoursPerItem:
        return alpha + betaU[user]
    if user not in hoursPerUser and item in hoursPerItem:
        return alpha + betaI[item]
    return alpha + betaU[user] + betaI[item]

In [64]:
def MSE():
    mse = sum((_['hours_transformed']-predict(u,i))**2 for u,i,_ in hoursValid) / len(hoursValid)
    return mse

In [65]:
def iterate(lamb, max_iteration=50):
    for iter in range(max_iteration):
        update_alpha()
        update_betaU(lamb)
        update_betaI(lamb)
        if iter % 10 == 9:
            validMSE = MSE()
            print(f"Current Iteration is {iter+1} | MSE: {validMSE}")
    return validMSE

In [66]:
validMSE = iterate(lamb=1)

Current Iteration is 10 | MSE: 3.0071926265486812
Current Iteration is 20 | MSE: 3.0071983181788915
Current Iteration is 30 | MSE: 3.007207678843451
Current Iteration is 40 | MSE: 3.0072192105803386
Current Iteration is 50 | MSE: 3.0072318058212737


In [67]:
answers['Q6'] = validMSE
answers['Q6']

3.0072318058212737

In [68]:
assertFloat(answers['Q6'])

In [69]:
### Question 7

In [70]:
betaU

{'u70666506': 0.561717760182747,
 'u18612571': -0.30620644592043056,
 'u34283088': -0.8059551326713806,
 'u16220374': 0.1666061561338239,
 'u01499286': 0.06467718598950013,
 'u73063505': 0.35527637319587607,
 'u29223775': 0.2723441619223891,
 'u44401308': 0.39708496785852593,
 'u45027672': -0.30556202591176285,
 'u33908704': 0.5882204459425329,
 'u27998358': -0.45511111708994056,
 'u36214177': -0.6908601294519715,
 'u73747744': 0.4710871837790503,
 'u97936673': -0.43666348979130337,
 'u25365202': 0.4614244047419814,
 'u08631099': -0.049603600861158184,
 'u52584928': -0.009397514896310464,
 'u09520763': 0.3953237645309204,
 'u04893836': -0.20442912761159954,
 'u58381940': -0.1989497475284934,
 'u85007552': -0.5915287721886622,
 'u48369340': 0.05977793091709518,
 'u79530461': -0.8602895295848747,
 'u44157494': 0.563557200611956,
 'u70118164': -0.9722289779922593,
 'u21352780': -0.11118795414021432,
 'u74354158': -0.22420719862417135,
 'u67890036': -0.8747151082618353,
 'u90421779': -0.28

In [71]:
max_beta_u_id, max_beta_u = max(betaU, key=betaU.get), max(betaU.values())
min_beta_u_id, min_beta_u = min(betaU, key=betaU.get), min(betaU.values())
max_beta_I_id, max_beta_I = max(betaI, key=betaI.get), max(betaI.values())
min_beta_I_id, min_beta_I = min(betaI, key=betaI.get), min(betaI.values())


In [72]:
print("Maximum betaU = " + str(max_beta_u_id) + ' (' + str(max_beta_u) + ')')
print("Maximum betaI = " + str(max_beta_I_id) + ' (' + str(max_beta_I) + ')')
print("Minimum betaU = " + str(min_beta_u_id) + ' (' + str(min_beta_u) + ')')
print("Minimum betaI = " + str(min_beta_I_id) + ' (' + str(min_beta_I) + ')')

Maximum betaU = u60898505 (5.823938907805638)
Maximum betaI = g17604638 (5.2006848795015035)
Minimum betaU = u13037838 (-3.0081104660224804)
Minimum betaI = g84397720 (-3.102569586766713)


In [73]:
answers['Q7'] = [max_beta_u, min_beta_u, max_beta_I, min_beta_I]
answers['Q7']

[5.823938907805638,
 -3.0081104660224804,
 5.2006848795015035,
 -3.102569586766713]

In [74]:
assertFloatList(answers['Q7'], 4)

In [75]:
### Question 8

In [76]:
def iterate(lamb, max_iteration=50):
    for iter in range(max_iteration):
        update_alpha()
        update_betaU(lamb)
        update_betaI(lamb)
        if iter == 29:
            validMSE = MSE()
            print(f"Current Lambda is {lamb} | MSE: {validMSE}")
    return validMSE

In [77]:
# Better lambda...
bestValidMSE = None
bestLamb = 0
for lamb in np.arange(0, 10, 0.5):
    validMSE = iterate(lamb, max_iteration=30)
    if bestValidMSE == None or validMSE < bestValidMSE:
        bestValidMSE = validMSE
        bestLamb = lamb

Current Lambda is 0.0 | MSE: 3.020383624199873
Current Lambda is 0.5 | MSE: 3.0130985022679604
Current Lambda is 1.0 | MSE: 3.007286603471909
Current Lambda is 1.5 | MSE: 3.002647196268911
Current Lambda is 2.0 | MSE: 2.998964985993777
Current Lambda is 2.5 | MSE: 2.9960919506464885
Current Lambda is 3.0 | MSE: 2.99391949247759
Current Lambda is 3.5 | MSE: 2.992358904257144
Current Lambda is 4.0 | MSE: 2.9913330440900068
Current Lambda is 4.5 | MSE: 2.9907748213201613
Current Lambda is 5.0 | MSE: 2.9906266342099626
Current Lambda is 5.5 | MSE: 2.9908391713193843
Current Lambda is 6.0 | MSE: 2.9913700500165152
Current Lambda is 6.5 | MSE: 2.9921826478604774
Current Lambda is 7.0 | MSE: 2.9932451656321772
Current Lambda is 7.5 | MSE: 2.9945298734106465
Current Lambda is 8.0 | MSE: 2.996012495647076
Current Lambda is 8.5 | MSE: 2.9976717046107955
Current Lambda is 9.0 | MSE: 2.9994887000978383
Current Lambda is 9.5 | MSE: 3.0014468586000094


In [78]:
answers['Q8'] = (bestLamb, bestValidMSE)
answers['Q8']

(5.0, 2.9906266342099626)

In [79]:
assertFloatList(answers['Q8'], 2)

In [80]:
predictions = open("HWpredictions_Hours.csv", 'w')
for l in open("pairs_Hours.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,g = l.strip().split(',')
    
    if u in betaU and g in betaI:
        predictions.write(u + ',' + g + ',' + str(alpha + betaU[u] + betaI[g]) + '\n')
    else:
        predictions.write(u + ',' + g + ',' + str(0) + '\n')

predictions.close()

In [81]:
f = open("answers_hw3.txt", 'w')
f.write(str(answers) + '\n')
f.close()