In [2]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
import string
from sklearn import linear_model

In [3]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [4]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [5]:
def readJSON(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        d = eval(l)
        u = d['userID']
        g = d['gameID']
        yield u,g,d

In [6]:
answers = {}

In [7]:
# Some data structures that will be useful

In [8]:
allHours = []
for l in readJSON("train.json.gz"):
    allHours.append(l)

In [9]:
hoursTrain = allHours[:165000]
hoursValid = allHours[165000:]
hoursPerUser = defaultdict(list)
hoursPerItem = defaultdict(list)
for u,g,d in hoursTrain:
    r = d['hours_transformed']
    hoursPerUser[u].append((g,r))
    hoursPerItem[g].append((u,r))

In [20]:
##################################################
# Play prediction                                #
##################################################

In [10]:
# From baseline code
gameCount = defaultdict(int)
totalPlayed = 0

for u,g,_ in hoursTrain:
    gameCount[g] += 1
    totalPlayed += 1

mostPopular = [(gameCount[x], x) for x in gameCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPlayed/2: break

In [11]:
# Generate a negative set

userSet = set()
gameSet = set()
playedSet = set()

for u,g,d in allHours:
    userSet.add(u)
    gameSet.add(g)
    playedSet.add((u,g))

lUserSet = list(userSet)
lGameSet = list(gameSet)

notPlayed = set()
for u,g,d in hoursValid:
    #u = random.choice(lUserSet)
    g = random.choice(lGameSet)
    while (u,g) in playedSet or (u,g) in notPlayed:
        g = random.choice(lGameSet)
    notPlayed.add((u,g))

playedValid = set()
for u,g,r in hoursValid:
    playedValid.add((u,g))

In [12]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom > 0:
        return numer/denom
    return 0

In [24]:
### Question 1

In [25]:
# Evaluate baseline strategy

correct = 0
p0, p1 = 0,0
for (label,sample) in [(1, playedValid), (0, notPlayed)]:
    for (u,b) in sample:
        pred = 0
        if b in return1:
            pred = 1
        if pred == label:
            correct += 1

In [26]:
correct / (len(playedValid) + len(notPlayed))

0.6796679667966796

In [27]:
answers['Q1'] = correct / (len(playedValid) + len(notPlayed))

In [28]:
assertFloat(answers['Q1'])

In [29]:
### Question 2

In [13]:
# Improved strategy

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > 1.5 * totalPlayed/2: break

In [31]:
# Evaluate baseline strategy

correct = 0
p0, p1 = 0,0
for (label,sample) in [(1, playedValid), (0, notPlayed)]:
    for (u,b) in sample:
        pred = 0
        if b in return1:
            pred = 1
        if pred == label:
            correct += 1

In [32]:
correct / (len(playedValid) + len(notPlayed))

0.6949194919491949

In [33]:
answers['Q2'] = [1.5 * totalPlayed/2, correct / (len(playedValid) + len(notPlayed))]

In [34]:
assertFloatList(answers['Q2'], 2)

In [35]:
### Question 3/4

In [36]:
# Slow implementation, could easily be improved following the code from Lecture 7

correct = 0
p0, p1 = 0,0
for (label,sample) in [(1, playedValid), (0, notPlayed)]:
    for (u,g) in sample:
        maxSim = 0
        users = set(hoursPerItem[g])
        for g2,_ in hoursPerUser[u]:
            sim = Jaccard(users,set(hoursPerItem[g2]))
            if sim > maxSim:
                maxSim = sim
        pred = 0
        if maxSim > 0.025 or len(hoursPerItem[g]) > 60:
            pred = 1
            p1 += 1
        else:
            p0 += 1
        if pred == label:
            correct += 1

In [37]:
correct / (len(playedValid) + len(notPlayed))

0.6992199219921992

In [38]:
answers['Q3'] = correct / (len(playedValid) + len(notPlayed))
answers['Q4'] = correct / (len(playedValid) + len(notPlayed))

In [28]:
assertFloat(answers['Q3'])
assertFloat(answers['Q4'])

In [15]:
# pred_similarities = defaultdict(list)

In [14]:
gameCount = defaultdict(int)
totalPlayed = 0

for u,g,_ in hoursTrain:
    gameCount[g] += 1
    totalPlayed += 1

mostPopular = [(gameCount[x], x) for x in gameCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPlayed/1.5: break

In [16]:
pred_similarities = []

In [17]:
for l in open("pairs_Played.csv"):
    if l.startswith("userID"):
        continue
    u,g = l.strip().split(',')
    maxSim = 0
    users = set(hoursPerItem[g])
    for g2,_ in hoursPerUser[u]:
        sim = Jaccard(users,set(hoursPerItem[g2]))
        if sim > maxSim:
            maxSim = sim
    pred_similarities.append((u,g,maxSim))

In [19]:
for key in pred_similarities:
    pred_similarities.sort(key = lambda x: x[2])
    pred_similarities.reverse()

In [20]:
pred_similarities

[('u04893836', 'g83216730', 0.058823529411764705),
 ('u39462514', 'g35557126', 0.05128205128205128),
 ('u00874815', 'g32560983', 0.04),
 ('u42908769', 'g82181072', 0.038461538461538464),
 ('u42908769', 'g75142381', 0.03773584905660377),
 ('u32377855', 'g34813934', 0.037037037037037035),
 ('u72952508', 'g99400209', 0.03571428571428571),
 ('u23800330', 'g17749950', 0.03333333333333333),
 ('u56316623', 'g12079363', 0.03225806451612903),
 ('u97764793', 'g45880815', 0.03225806451612903),
 ('u28227898', 'g55933232', 0.03225806451612903),
 ('u51581636', 'g40124000', 0.029411764705882353),
 ('u93124412', 'g59639854', 0.029411764705882353),
 ('u82904846', 'g44790145', 0.029411764705882353),
 ('u05202684', 'g31156952', 0.02857142857142857),
 ('u80947360', 'g82729324', 0.02857142857142857),
 ('u46383519', 'g64029720', 0.027777777777777776),
 ('u64671579', 'g65538651', 0.027777777777777776),
 ('u97774562', 'g30998929', 0.027777777777777776),
 ('u54177102', 'g97014047', 0.027777777777777776),
 ('u5

In [21]:
predictions = open("predictions_Played.csv", 'w')
for l in open("pairs_Played.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,g = l.strip().split(',')
    pred = 0
    if g in [pair[1] for pair in pred_similarities[:len(pred_similarities)//2]]:
        pred = 1
    # elif g in return1:
    #     pred = 1
    # elif len(hoursPerItem[g]) > 60:
    #     pred = 1
    _ = predictions.write(u + ',' + g + ',' + str(pred) + '\n')

predictions.close()

In [30]:
answers['Q5'] = "I confirm that I have uploaded an assignment submission to gradescope"

In [31]:
##################################################
# Hours played prediction                        #
##################################################

In [32]:
trainHours = [r[2]['hours_transformed'] for r in hoursTrain]
globalAverage = sum(trainHours) * 1.0 / len(trainHours)

In [33]:
validMSE = 0
for u,g,d in hoursValid:
    r = d['hours_transformed']
    se = (r - globalAverage)**2
    validMSE += se

validMSE /= len(hoursValid)

print("Validation MSE (average only) = " + str(validMSE))

Validation MSE (average only) = 5.316020858088501


In [34]:
### Question 6

In [35]:
betaU = {}
betaI = {}
for u in hoursPerUser:
    betaU[u] = 0

for g in hoursPerItem:
    betaI[g] = 0

In [36]:
alpha = globalAverage # Could initialize anywhere, this is a guess

In [37]:
def iterate(lamb):
    newAlpha = 0
    for u,g,d in hoursTrain:
        r = d['hours_transformed']
        newAlpha += r - (betaU[u] + betaI[g])
    alpha = newAlpha / len(hoursTrain)
    for u in hoursPerUser:
        newBetaU = 0
        for g,r in hoursPerUser[u]:
            newBetaU += r - (alpha + betaI[g])
        betaU[u] = newBetaU / (lamb + len(hoursPerUser[u]))
    for g in hoursPerItem:
        newBetaI = 0
        for u,r in hoursPerItem[g]:
            newBetaI += r - (alpha + betaU[u])
        betaI[g] = newBetaI / (lamb + len(hoursPerItem[g]))
    mse = 0
    for u,g,d in hoursTrain:
        r = d['hours_transformed']
        prediction = alpha + betaU[u] + betaI[g]
        mse += (r - prediction)**2
    regularizer = 0
    for u in betaU:
        regularizer += betaU[u]**2
    for g in betaI:
        regularizer += betaI[g]**2
    mse /= len(hoursTrain)
    return mse, mse + lamb*regularizer

In [38]:
mse,objective = iterate(1)
newMSE,newObjective = iterate(1)
iterations = 2

In [39]:
while iterations < 10 or objective - newObjective > 0.01:
    mse, objective = newMSE, newObjective
    newMSE, newObjective = iterate(1)
    iterations += 1
    print("Objective after "
        + str(iterations) + " iterations = " + str(newObjective))
    print("MSE after "
        + str(iterations) + " iterations = " + str(newMSE))

Objective after 3 iterations = 6916.291258826528
MSE after 3 iterations = 2.756414053005335
Objective after 4 iterations = 6935.23715550776
MSE after 4 iterations = 2.755604333875777
Objective after 5 iterations = 6924.6062017768245
MSE after 5 iterations = 2.755486661616215
Objective after 6 iterations = 6905.833993834695
MSE after 6 iterations = 2.755457716152593
Objective after 7 iterations = 6885.30918519728
MSE after 7 iterations = 2.7554456693587497
Objective after 8 iterations = 6864.742698779786
MSE after 8 iterations = 2.7554377123048535
Objective after 9 iterations = 6844.576219662253
MSE after 9 iterations = 2.755430906995692
Objective after 10 iterations = 6824.91582215507
MSE after 10 iterations = 2.7554245073468486
Objective after 11 iterations = 6805.779255606049
MSE after 11 iterations = 2.7554183159504917
Objective after 12 iterations = 6787.161156344411
MSE after 12 iterations = 2.755412278370264
Objective after 13 iterations = 6769.050270855048
MSE after 13 iteration

Objective after 90 iterations = 6209.249427881521
MSE after 90 iterations = 2.755187092074258
Objective after 91 iterations = 6207.318844162245
MSE after 91 iterations = 2.7551859380875623
Objective after 92 iterations = 6205.444365248543
MSE after 92 iterations = 2.755184806169715
Objective after 93 iterations = 6203.6243950805465
MSE after 93 iterations = 2.7551836958510476
Objective after 94 iterations = 6201.857382185177
MSE after 94 iterations = 2.755182606672543
Objective after 95 iterations = 6200.141818460075
MSE after 95 iterations = 2.7551815381859104
Objective after 96 iterations = 6198.4762379894
MSE after 96 iterations = 2.755180489953302
Objective after 97 iterations = 6196.859215890833
MSE after 97 iterations = 2.7551794615468492
Objective after 98 iterations = 6195.289367192376
MSE after 98 iterations = 2.7551784525487086
Objective after 99 iterations = 6193.765345739354
MSE after 99 iterations = 2.7551774625507677
Objective after 100 iterations = 6192.28584312967
MSE a

Objective after 176 iterations = 6148.8409431136315
MSE after 176 iterations = 2.7551367224421464
Objective after 177 iterations = 6148.702537764905
MSE after 177 iterations = 2.755136470534834
Objective after 178 iterations = 6148.568557839423
MSE after 178 iterations = 2.7551362227094485
Objective after 179 iterations = 6148.438870841919
MSE after 179 iterations = 2.755135978893863
Objective after 180 iterations = 6148.3133481268615
MSE after 180 iterations = 2.755135739017546
Objective after 181 iterations = 6148.191864788228
MSE after 181 iterations = 2.7551355030114055
Objective after 182 iterations = 6148.074299552734
MSE after 182 iterations = 2.755135270807567
Objective after 183 iterations = 6147.960534675639
MSE after 183 iterations = 2.7551350423393868
Objective after 184 iterations = 6147.850455839888
MSE after 184 iterations = 2.755134817541883
Objective after 185 iterations = 6147.743952057975
MSE after 185 iterations = 2.755134596351097
Objective after 186 iterations = 6

In [40]:
validMSE = 0
for u,g,d in hoursValid:
    r = d['hours_transformed']
    bu = 0
    bi = 0
    if u in betaU:
        bu = betaU[u]
    if g in betaI:
        bi = betaI[g]
    prediction = alpha + bu + bi
    validMSE += (r - prediction)**2

validMSE /= len(hoursValid)
print("Validation MSE = " + str(validMSE))

Validation MSE = 3.3620657269506733


In [41]:
answers['Q6'] = validMSE

In [42]:
assertFloat(answers['Q6'])

In [43]:
### Question 7

In [44]:
betaUs = [(betaU[u], u) for u in betaU]
betaIs = [(betaI[i], i) for i in betaI]
betaUs.sort()
betaIs.sort()

print("Maximum betaU = " + str(betaUs[-1][1]) + ' (' + str(betaUs[-1][0]) + ')')
print("Maximum betaI = " + str(betaIs[-1][1]) + ' (' + str(betaIs[-1][0]) + ')')
print("Minimum betaU = " + str(betaUs[0][1]) + ' (' + str(betaUs[0][0]) + ')')
print("Minimum betaI = " + str(betaIs[0][1]) + ' (' + str(betaIs[0][0]) + ')')

Maximum betaU = u60898505 (5.828316739259239)
Maximum betaI = g17604638 (5.495973739724736)
Minimum betaU = u13037838 (-3.0057870148761894)
Minimum betaI = g84397720 (-2.809328679823356)


In [45]:
answers['Q7'] = [betaUs[-1][0], betaUs[0][0], betaIs[-1][0], betaIs[0][0]]

In [46]:
answers['Q7']

[5.828316739259239, -3.0057870148761894, 5.495973739724736, -2.809328679823356]

In [47]:
assertFloatList(answers['Q7'], 4)

In [48]:
### Question 8

In [49]:
# Better lambda...

iterations = 1
while iterations < 10 or objective - newObjective > 0.01:
    mse, objective = newMSE, newObjective
    newMSE, newObjective = iterate(5)
    iterations += 1
    print("Objective after " + str(iterations) + " iterations = " + str(newObjective))
    print("MSE after " + str(iterations) + " iterations = " + str(newMSE))

Objective after 2 iterations = 23723.40581939076
MSE after 2 iterations = 2.7788624145856393
Objective after 3 iterations = 23510.585432916247
MSE after 3 iterations = 2.77950918417825
Objective after 4 iterations = 23487.108448891875
MSE after 4 iterations = 2.779632986564118
Objective after 5 iterations = 23482.603926859705
MSE after 5 iterations = 2.7796579991997605
Objective after 6 iterations = 23481.074962137227
MSE after 6 iterations = 2.7796657598050394
Objective after 7 iterations = 23480.130273496496
MSE after 7 iterations = 2.7796701526871908
Objective after 8 iterations = 23479.338493177987
MSE after 8 iterations = 2.7796737556337834
Objective after 9 iterations = 23478.61426569989
MSE after 9 iterations = 2.7796770737497773
Objective after 10 iterations = 23477.938819973006
MSE after 10 iterations = 2.7796802099553215
Objective after 11 iterations = 23477.30667015269
MSE after 11 iterations = 2.7796831871553858
Objective after 12 iterations = 23476.715002575234
MSE after 1

In [50]:
alpha_ = alpha

In [51]:
validMSE = 0
for u,g,d in hoursValid:
    r = d['hours_transformed']
    bu = 0
    bi = 0
    if u in betaU:
        bu = betaU[u]
    if g in betaI:
        bi = betaI[g]
    prediction = alpha + bu + bi
    validMSE += (r - prediction)**2

validMSE /= len(hoursValid)
print("Validation MSE = " + str(validMSE))

Validation MSE = 3.3246506094357864


In [52]:
answers['Q8'] = (5.0, validMSE)

In [53]:
assertFloatList(answers['Q8'], 2)

In [54]:
predictions = open("HWpredictions_Hours.csv", 'w')
for l in open("/home/julian/Downloads/assignment1/pairs_Hours.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,g = l.strip().split(',')
    bu = 0
    bi = 0
    if u in betaU:
        bu = betaU[u]
    if g in betaI:
        bi = betaI[g]
    _ = predictions.write(u + ',' + g + ',' + str(alpha + bu + bi) + '\n')

predictions.close()

In [55]:
f = open("answers_hw3.txt", 'w')
f.write(str(answers) + '\n')
f.close()