In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
from sklearn import linear_model
import tensorflow as tf
from surprise.model_selection import train_test_split

In [348]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [349]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [350]:
def readJSON(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        d = eval(l)
        u = d['userID']
        g = d['gameID']
        yield u,g,d

In [351]:
answers = {}

In [352]:
# Some data structures that will be useful

In [353]:
allHours = []
for l in readJSON("train.json.gz"):
    allHours.append(l)

In [354]:
hoursTrain = allHours[:165000]
hoursValid = allHours[165000:]

In [11]:
##################################################
# Play prediction                                #
##################################################

In [12]:
hoursTrain[0]

('u70666506',
 'g49368897',
 {'userID': 'u70666506',
  'early_access': False,
  'hours': 63.5,
  'hours_transformed': 6.011227255423254,
  'found_funny': 1,
  'text': 'If you want to sit in queue for 10-20min and have 140 ping then this game is perfect for you :)',
  'gameID': 'g49368897',
  'user_id': '76561198030408772',
  'date': '2017-05-20'})

In [13]:
# Any other preprocessing...
users = set() # maintain a set of unique users
games = set() # maintain a set of unique games
pos_pairs = []
for l in readGz("train.json.gz"):
    u,g = l['userID'], l['gameID']
    users.add(u)
    games.add(g)
    pos_pairs += [(u, g)]

In [14]:
train_data = pos_pairs[:165000]
valid_data = pos_pairs[165000:] # because pos_pairs has length 175000

In [15]:
len(users), len(games)

(6710, 2437)

In [16]:
# Randomly sample games that weren't played
neg_pairs = set()
len_valid = len(hoursValid)
while True:
    user = random.sample(users, 1)[0]
    game = random.sample(games, 1)[0]
    if (user, game) not in pos_pairs:
        neg_pairs.add((user, game))
    if len(neg_pairs) == len_valid:
        break

since Python 3.9 and will be removed in a subsequent version.
  user = random.sample(users, 1)[0]
since Python 3.9 and will be removed in a subsequent version.
  game = random.sample(games, 1)[0]


In [17]:
len(neg_pairs), len(train_data)

(9999, 165000)

In [18]:
### Question 1

In [19]:
# Evaluate baseline strategy
gameCount = defaultdict(int)
totalPlayed = 0

for user,game in train_data:
    gameCount[game] += 1
    totalPlayed += 1

mostPopular = [(gameCount[x], x) for x in gameCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPlayed/2: break

predictions = []
for user,game in valid_data:
    if game in return1:
        predictions += [1]
    else:
        predictions += [0]
    
for user,game in neg_pairs:
    if game in return1:
        predictions += [1]
    else:
        predictions += [0]

sum_pred = sum([predictions[i]==1 for i in range(9999)]) + \
    sum([predictions[i] == 0 for i in range(9999, len(predictions))])
acc = sum_pred / (2*len(valid_data))
acc

0.67905

In [20]:
answers['Q1'] = acc
answers['Q1']

0.67905

In [21]:
assertFloat(answers['Q1'])

In [20]:
### Question 2

In [21]:
# Improved strategy

In [22]:
# Evaluate baseline strategy
gameCount = defaultdict(int)
totalPlayed = 0

for user,game in train_data:
    gameCount[game] += 1
    totalPlayed += 1

mostPopular = [(gameCount[x], x) for x in gameCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPlayed/1.5: break # Choose 67% percentile instead

predictions = []
for user,game in valid_data:
    if game in return1:
        predictions += [1]
    else:
        predictions += [0]
    
for user,game in neg_pairs:
    if game in return1:
        predictions += [1]
    else:
        predictions += [0]

sum_pred_q2 = sum([predictions[i]==1 for i in range(9999)]) + \
    sum([predictions[i] == 0 for i in range(9999, len(predictions))])
acc_q2 = sum_pred_q2 / (2*len(valid_data))
acc_q2

0.70395

In [19]:
answers['Q2'] = [1/1.5, acc_q2]
answers['Q2']

[0.6666666666666666, 0.7046]

In [20]:
assertFloatList(answers['Q2'], 2)

In [21]:
### Question 3/4

In [22]:
userPerGame = defaultdict(set) # Maps a game to the users who played it
gamePerUser = defaultdict(set) # Maps a user to the game that they played
hoursDict = {} # To retrieve an hour for a specific user/game pair

for d in hoursTrain:
    user,game = d[0], d[1]
    userPerGame[game].add(user)
    gamePerUser[user].add(game)
    hoursDict[(user, game)] = d[2]['hours']

In [23]:
userAverages = {}
gameAverages = {}

for u in gamePerUser:
    rs = [hoursDict[(u,g)] for g in gamePerUser[u]]
    userAverages[u] = sum(rs) / len(rs)

for g in userPerGame:
    rs = [hoursDict[(u,g)] for u in userPerGame[g]]
    gameAverages[g] = sum(rs) / (len(rs))

In [24]:
reviewsPerUser = defaultdict(list)
reviewsPerGame = defaultdict(list)

for d in hoursTrain:
    user,game = d[0], d[1]
    reviewsPerUser[user].append(d)
    reviewsPerGame[game].append(d)

In [25]:
reviewsPerUser["u05450000"][0]

('u05450000',
 'g52077802',
 {'hours': 16.8,
  'gameID': 'g52077802',
  'hours_transformed': 4.153805336079036,
  'early_access': False,
  'date': '2011-05-28',
  'text': 'A poorly optimized and poorly realized piece of♥♥♥♥♥♥ Stay clear of this and all future Splash Damage products.',
  'userID': 'u05450000'})

In [26]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

def predictLabel(user, game, threshold):
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d[1]
        if i2 == game: continue
        similarities.append(Jaccard(userPerGame[game], userPerGame[i2]))
    return 1 if max(similarities) > threshold else 0

In [27]:
# Evaluate baseline strategy
predictions = []
for user,game in valid_data:
    predictions.append(predictLabel(user, game, 0.03))
# print(len(predictions))
for user,game in neg_pairs:
    predictions.append(predictLabel(user, game, 0.03))
# print(len(predictions))

sum_pred_q3 = sum([predictions[i]==1 for i in range(10000)]) + \
    sum([predictions[i] == 0 for i in range(10000, len(predictions))])
acc_q3 = sum_pred_q3 / (2*len(valid_data))
acc_q3

0.70635

In [28]:
def threshold_and_popularity(threshold_popularity=1/1.5, threshold_jaccard=0.03):
    gameCount = defaultdict(int)
    totalPlayed = 0
    
    for user,game,_ in readJSON("train.json.gz"):
        gameCount[game] += 1
        totalPlayed += 1
        
    mostPopular = [(gameCount[x], x) for x in gameCount]
    mostPopular.sort()
    mostPopular.reverse()
    
    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > threshold_popularity: break
    
    correct = 0
    for user, game in valid_data:
        similarities = []
        for d in reviewsPerUser[user]:
            i2 = d[1]
            if i2 == game: continue
            similarities.append(Jaccard(userPerGame[game], userPerGame[i2]))
            
        if max(similarities) > threshold_jaccard and game in return1:
            correct += (game in gamePerUser[user]) # recommend in this case
        else:
            correct += (game not in gamePerUser[user]) # not recommend in this case
        
    return correct/len(valid_data)

In [29]:
acc_q4 = threshold_and_popularity()
acc_q4

0.9945

In [34]:
answers['Q3'] = acc_q3
answers['Q4'] = acc_q4

In [35]:
assertFloat(answers['Q3'])
assertFloat(answers['Q4'])

In [30]:
threshold_popularity = 1/1.5
threshold_jaccard = 0.003

gameCount = defaultdict(int)
totalPlayed = 0

for user,game,_ in readJSON("train.json.gz"):
    gameCount[game] += 1
    totalPlayed += 1
    
mostPopular = [(gameCount[x], x) for x in gameCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPlayed*threshold_popularity: break

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > threshold_popularity: break

predictions = open("HWpredictions_Played.csv", 'w')
for l in open("pairs_Played.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,g = l.strip().split(',')
    userPlayed = gamePerUser[u]
    similarities = []
    # Logic...
    for d in reviewsPerUser[user]:
        i2 = d[1]
        if i2 == game: continue
        similarities.append(Jaccard(userPerGame[game], userPerGame[i2]))

    if max(similarities) > threshold_jaccard and game in return1:
        predictions.write(u + ',' + g + ',' + "1" + '\n')
    else:
        predictions.write(u + ',' + g + ',' + "0" + '\n')

predictions.close()

In [38]:
##################################################
# Hours played prediction                        #
##################################################

In [508]:
hoursTrain = allHours[:165000]
hoursValid = allHours[165000:]

In [538]:
hours = [r[2]['hours_transformed'] for r in allHours]
globalAverage = sum(hours) * 1.0 / len(hours)
hoursPerUser, hoursPerItem,Rui = {}, {}, {}
for u,g,d in allHours:
    if u not in hoursPerUser:
        hoursPerUser[u] = [(g, d['hours_transformed'])]
    else:
        hoursPerUser[u].append((g, d['hours_transformed']))
    if g not in hoursPerItem:
        hoursPerItem[g] = [(u, d['hours_transformed'])]
    else:
        hoursPerItem[g].append((u, d['hours_transformed']))
    Rui[(u,g)] = d['hours_transformed']
betaU = {}
betaI = {}
for u in hoursPerUser:
    betaU[u] = 0

for g in hoursPerItem:
    betaI[g] = 0

In [539]:
alpha = globalAverage # Could initialize anywhere, this is a guess

In [540]:
def update_alpha():
    global alpha
    num = sum(Rui[(u,g)]- (betaU[u] + betaI[g]) for u,g,_ in allHours)
    denom = len(allHours)
    alpha = num / denom

In [541]:
def update_betaU(lamb):
    global alpha
    for user in hoursPerUser:
        num = sum(Rui[(user,g)]-(alpha+betaI[g]) for g,t in hoursPerUser[user])
        denom = lamb + len(hoursPerUser[user])
        betaU[user] = num / denom

In [542]:
def update_betaI(lamb):
    global alpha
    for item in hoursPerItem:
        num = sum(Rui[(u,item)]-(alpha+betaU[u]) for u,t in hoursPerItem[item])
        denom = lamb + len(hoursPerItem[item])
        betaI[item] = num / denom

In [543]:
def predict(user, item):
    global alpha
    if user not in hoursPerUser and item not in hoursPerItem:
        return alpha
    if user in hoursPerUser and item not in hoursPerItem:
        return alpha + betaU[user]
    if user not in hoursPerUser and item in hoursPerItem:
        return alpha + betaI[item]
    return alpha + betaU[user] + betaI[item]

In [544]:
def MSE():
    mse = sum((_['hours_transformed']-predict(u,i))**2 for u,i,_ in hoursValid) / len(hoursValid)
    return mse

In [545]:
def iterate(lamb, max_iteration=1000):
    mse = 0
    counter = 0
    for iter in range(max_iteration):
        update_alpha()
        update_betaU(lamb)
        update_betaI(lamb)
        curr_MSE = MSE()
        if iter % 10 == 9:
            print(f"Current Iteration is {iter+1} | MSE: {curr_MSE}")
        if mse == 0 or curr_MSE <= mse:
            mse = curr_MSE
        else:
            counter+=1
        if counter ==1:
            print(f"End Iteration is {iter+1} | MSE: {curr_MSE}")
            break
    return curr_MSE

In [546]:
validMSE = iterate(lamb=5)

Current Iteration is 10 | MSE: 2.749665686371287
Current Iteration is 20 | MSE: 2.748783972606049
Current Iteration is 30 | MSE: 2.748462339139309
Current Iteration is 40 | MSE: 2.748334559412922
Current Iteration is 50 | MSE: 2.7482790253596647
Current Iteration is 60 | MSE: 2.7482529964938864
Current Iteration is 70 | MSE: 2.7482401198914124
Current Iteration is 80 | MSE: 2.748233526497276
Current Iteration is 90 | MSE: 2.7482300807254734
Current Iteration is 100 | MSE: 2.748228258949598
Current Iteration is 110 | MSE: 2.7482272895884785
Current Iteration is 120 | MSE: 2.748226771989656
Current Iteration is 130 | MSE: 2.7482264950904147
Current Iteration is 140 | MSE: 2.748226346807017
Current Iteration is 150 | MSE: 2.7482262673557276
Current Iteration is 160 | MSE: 2.7482262247726283
Current Iteration is 170 | MSE: 2.748226201945986
Current Iteration is 180 | MSE: 2.748226189708754
Current Iteration is 190 | MSE: 2.748226183148152
Current Iteration is 200 | MSE: 2.7482261796307976


In [547]:
validMSE

2.7482261755655193

In [373]:
### Question 8

In [500]:
def iterate(lamb, max_iteration=50):
    mse = 0
    betaU = {}
    betaI = {}
    for u in hoursPerUser:
        betaU[u] = 0

    for g in hoursPerItem:
        betaI[g] = 0
    for iter in range(max_iteration):
        update_alpha()
        update_betaU(lamb)
        update_betaI(lamb)
        curr_mse = MSE()
        if mse == 0 or curr_mse <= mse:
            mse = curr_mse
        else:
            print(f"Current Lambda is {lamb} | MSE: {curr_mse}")
            break
        if iter == 29:
            print(f"Current Lambda is {lamb} | MSE: {curr_mse}")
    return mse

In [526]:
# Better lambda...
bestValidMSE = None
bestLamb = 0
for lamb in np.arange(0.001, 4.71, 0.01):
    hours = [r[2]['hours_transformed'] for r in allHours]
    globalAverage = sum(hours) * 1.0 / len(hours)
    hoursPerUser, hoursPerItem,Rui = {}, {}, {}
    for u,g,d in allHours:
        if u not in hoursPerUser:
            hoursPerUser[u] = [(g, d['hours_transformed'])]
        else:
            hoursPerUser[u].append((g, d['hours_transformed']))
        if g not in hoursPerItem:
            hoursPerItem[g] = [(u, d['hours_transformed'])]
        else:
            hoursPerItem[g].append((u, d['hours_transformed']))
        Rui[(u,g)] = d['hours_transformed']
    betaU = {}
    betaI = {}
    for u in hoursPerUser:
        betaU[u] = 0

    for g in hoursPerItem:
        betaI[g] = 0
    validMSE = iterate(lamb, max_iteration=100)
    if bestValidMSE == None or validMSE < bestValidMSE:
        bestValidMSE = validMSE
        bestLamb = lamb

Current Lambda is 0.001 | MSE: 2.7217719627934
Current Lambda is 0.011 | MSE: 2.721777121993741
Current Lambda is 0.020999999999999998 | MSE: 2.721782656903129
Current Lambda is 0.030999999999999996 | MSE: 2.7217885657249172
Current Lambda is 0.040999999999999995 | MSE: 2.721794846681891
Current Lambda is 0.05099999999999999 | MSE: 2.7218014980154677


KeyboardInterrupt: 

In [507]:
bestValidMSE, bestLamb # Fine-tune lambda

(2.746672815636831, 4.8)

(2.990624596408338, 4.949999999999997)

(3.057123229940671, 5.089999999999994)

In [548]:
predictions = open("predictions_Hours.csv", 'w')
for l in open("pairs_Hours.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,g = l.strip().split(',')
    
    if (u,g) in Rui:
        predictions.write(u + ',' + g + ',' + str(Rui[(u,g)]) + '\n')
    if u in betaU and g in betaI:
        predictions.write(u + ',' + g + ',' + str(alpha + betaU[u] + betaI[g]) + '\n')
    else:
        predictions.write(u + ',' + g + ',' + str(0) + '\n')

predictions.close()