In [2]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
from sklearn import linear_model
import tensorflow as tf
from surprise.model_selection import train_test_split

In [3]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [4]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [5]:
def readJSON(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        d = eval(l)
        u = d['userID']
        g = d['gameID']
        yield u,g,d

In [7]:
# Some data structures that will be useful

In [8]:
allHours = []
for l in readJSON("train.json.gz"):
    allHours.append(l)

In [13]:
hoursTrain = allHours[:165000]
hoursValid = allHours[165000:]

In [8]:
##################################################
# Play prediction                                #
##################################################

In [9]:
# Any other preprocessing...
users = set() # maintain a set of unique users
games = set() # maintain a set of unique games
pos_pairs = []
for l in readGz("train.json.gz"):
    u,g = l['userID'], l['gameID']
    users.add(u)
    games.add(g)
    pos_pairs += [(u, g)]

In [10]:
train_data = pos_pairs[:165000]
valid_data = pos_pairs[165000:] # because pos_pairs has length 175000

In [11]:
len(users), len(games)

(6710, 2437)

In [14]:
# Randomly sample games that weren't played
neg_pairs = set()
len_valid = len(hoursValid)
while True:
    user = random.sample(users, 1)[0]
    game = random.sample(games, 1)[0]
    if (user, game) not in pos_pairs:
        neg_pairs.add((user, game))
    if len(neg_pairs) == len_valid:
        break

since Python 3.9 and will be removed in a subsequent version.
  user = random.sample(users, 1)[0]
since Python 3.9 and will be removed in a subsequent version.
  game = random.sample(games, 1)[0]


In [15]:
len(neg_pairs), len(train_data)

(9999, 165000)

In [16]:
userPerGame = defaultdict(set) # Maps a game to the users who played it
gamePerUser = defaultdict(set) # Maps a user to the game that they played
hoursDict = {} # To retrieve an hour for a specific user/game pair

for d in hoursTrain:
    user,game = d[0], d[1]
    userPerGame[game].add(user)
    gamePerUser[user].add(game)
    hoursDict[(user, game)] = d[2]['hours']

In [17]:
userAverages = {}
gameAverages = {}

for u in gamePerUser:
    rs = [hoursDict[(u,g)] for g in gamePerUser[u]]
    userAverages[u] = sum(rs) / len(rs)

for g in userPerGame:
    rs = [hoursDict[(u,g)] for u in userPerGame[g]]
    gameAverages[g] = sum(rs) / (len(rs))

In [28]:
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

for d in hoursTrain:
    user,game = d[0], d[1]
    reviewsPerUser[user].append(d)
    reviewsPerItem[game].append(d)

In [19]:
reviewsPerUser["u05450000"][0]

('u05450000',
 'g52077802',
 {'hours': 16.8,
  'gameID': 'g52077802',
  'hours_transformed': 4.153805336079036,
  'early_access': False,
  'date': '2011-05-28',
  'text': 'A poorly optimized and poorly realized piece of♥♥♥♥♥♥ Stay clear of this and all future Splash Damage products.',
  'userID': 'u05450000'})

In [38]:
##################################################
# Hours played prediction                        #
##################################################

In [22]:
trainHours = [r[2]['hours_transformed'] for r in hoursTrain]
globalAverage = sum(trainHours) * 1.0 / len(trainHours)

In [23]:
globalAverage

3.716088074007024

## Complete Latent Factor Model (one latent factor)

In [35]:
N = len(allHours)
nUsers = len(reviewsPerUser)
nItems = len(reviewsPerItem)
users = list(reviewsPerUser.keys())
items = list(reviewsPerItem.keys())

In [24]:
alpha = globalAverage

In [29]:
userBiases = defaultdict(float)
itemBiases = defaultdict(float)
userGamma = {}
itemGamma = {}

In [30]:
K = 2

In [32]:
for u in reviewsPerUser:
    userGamma[u] = [random.random() * 0.1 - 0.05 for k in range(K)]

In [33]:
for i in reviewsPerItem:
    itemGamma[i] = [random.random() * 0.1 - 0.05 for k in range(K)]

In [100]:
def unpack(theta):
    global alpha
    global userBiases
    global itemBiases
    global userGamma
    global itemGamma
    index = 0
    alpha = theta[index]
    index += 1
    userBiases = dict(zip(users, theta[index:index+nUsers]))
    index += nUsers
    itemBiases = dict(zip(items, theta[index:index+nItems]))
    index += nItems
    # for u in users:
    #     userGamma[u] = theta[index:index+K]
    #     index += K
    # for i in items:
    #     itemGamma[i] = theta[index:index+K]
    #     index += K

In [101]:
def inner(x, y):
    return sum([a*b for a,b in zip(x,y)])

In [102]:
def prediction(user, item):
    return alpha + userBiases[user] + itemBiases[item] 
# + inner(userGamma[user], itemGamma[item])

In [103]:
def cost(theta, labels, lamb1,lamb2):
    unpack(theta)
    predictions = [prediction(u, g) for u,g,d in hoursTrain]
    cost = MSE(predictions, labels)
    print("MSE = " + str(cost))
    for u in users:
        cost += lamb1*userBiases[u]**2
        # for k in range(K):
        #     cost += lamb2*userGamma[u][k]**2
    for i in items:
        cost += lamb1*itemBiases[i]**2
        # for k in range(K):
        #     cost += lamb2*itemGamma[i][k]**2
    return cost

In [104]:
def derivative(theta, labels, lamb1, lamb2):
    unpack(theta)
    N = len(allHours)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    dUserGamma = {}
    dItemGamma = {}
    for u in reviewsPerUser:
        dUserGamma[u] = [0.0 for k in range(K)]
    for i in reviewsPerItem:
        dItemGamma[i] = [0.0 for k in range(K)]
    for u,i,d in allHours:
        pred = prediction(u, i)
        diff = pred - d['hours_transformed']
        dalpha += 2/N*diff
        dUserBiases[u] += 2/N*diff
        dItemBiases[i] += 2/N*diff
        # for k in range(K):
        #     dUserGamma[u][k] += 2/N*itemGamma[i][k]*diff
        #     dItemGamma[i][k] += 2/N*userGamma[u][k]*diff
    for u in userBiases:
        dUserBiases[u] += 2*lamb1*userBiases[u]
        # for k in range(K):
        #     dUserGamma[u][k] += 2*lamb2*userGamma[u][k]
    for i in itemBiases:
        dItemBiases[i] += 2*lamb1*itemBiases[i]
        # for k in range(K):
        #     dItemGamma[i][k] += 2*lamb2*itemGamma[i][k]
    dtheta = [dalpha] + [dUserBiases[u] for u in users] + [dItemBiases[i] for i in items]
    # for u in users:
    #     dtheta += dUserGamma[u]
    # for i in items:
    #     dtheta += dItemGamma[i]
    return np.array(dtheta)

In [105]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [106]:
labels = [d['hours_transformed'] for u,g,d in hoursTrain]

In [107]:
alwaysPredictMean = [globalAverage for u,g,d in hoursTrain]

In [108]:
MSE(alwaysPredictMean, labels)

5.278030914752221

In [113]:
scipy.optimize.fmin_l_bfgs_b(cost, [alpha] + # Initialize alpha
                                   [0.0]*(nUsers+nItems),  # Initialize beta
                              #      [random.random() * 0.1 - 0.05 for k in range(K*(nUsers+nItems))], # Gamma
                             derivative, args = (labels, 4.959999999999999, 1))

MSE = 5.673572374847924
MSE = 5.399254102091392
MSE = 5.302565861430607
MSE = 5.276367042721776
MSE = 5.276531494018957
MSE = 5.276531728918435
MSE = 5.276531669544021


(array([ 3.71767716e+00,  1.98642409e-05, -1.69180626e-05, ...,
        -1.20800926e-05, -2.98644891e-05, -3.97332784e-05]),
 5.277284486387923,
 {'grad': array([-1.93580857e-06,  3.54593482e-09, -2.03281132e-09, ...,
         -1.46858070e-09, -7.80169586e-09, -1.29338515e-08]),
  'task': 'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL',
  'funcalls': 7,
  'nit': 6,
  'warnflag': 0})

In [96]:
predictions = [prediction(u,g) for u,g,d in hoursValid]

In [97]:
labels_valid = [d['hours_transformed'] for u,g,d in hoursValid]

In [98]:
MSE(predictions,labels_valid)

2.727273307384981

In [99]:
hoursPerUser, hoursPerItem,Rui = {}, {}, {}
for u,g,d in hoursTrain:
    if u not in hoursPerUser:
        hoursPerUser[u] = [(g, d['hours_transformed'])]
    else:
        hoursPerUser[u].append((g, d['hours_transformed']))
    if g not in hoursPerItem:
        hoursPerItem[g] = [(u, d['hours_transformed'])]
    else:
        hoursPerItem[g].append((u, d['hours_transformed']))
    Rui[(u,g)] = d['hours_transformed']

In [72]:
len(hoursPerItem)

2437

In [73]:
betaU = {}
betaI = {}
for u in hoursPerUser:
    betaU[u] = 0

for g in hoursPerItem:
    betaI[g] = 0

In [74]:
alpha = globalAverage # Could initialize anywhere, this is a guess

In [75]:
def update_alpha():
    global alpha
    num = sum(Rui[(u,g)]- (betaU[u] + betaI[g]) for u,g,_ in hoursTrain)
    denom = len(hoursTrain)
    alpha = num / denom

In [76]:
def update_betaU(lamb):
    global alpha
    for user in hoursPerUser:
        num = sum(Rui[(user,g)]-(alpha+betaI[g]) for g,t in hoursPerUser[user])
        denom = lamb + len(hoursPerUser[user])
        betaU[user] = num / denom

In [77]:
def update_betaI(lamb):
    global alpha
    for item in hoursPerItem:
        num = sum(Rui[(u,item)]-(alpha+betaU[u]) for u,t in hoursPerItem[item])
        denom = lamb + len(hoursPerItem[item])
        betaI[item] = num / denom

In [78]:
def predict(user, item):
    global alpha
    if user not in hoursPerUser and item not in hoursPerItem:
        return alpha
    if user in hoursPerUser and item not in hoursPerItem:
        return alpha + betaU[user]
    if user not in hoursPerUser and item in hoursPerItem:
        return alpha + betaI[item]
    return alpha + betaU[user] + betaI[item]

In [79]:
def MSE():
    mse = sum((_['hours_transformed']-predict(u,i))**2 for u,i,_ in hoursValid) / len(hoursValid)
    return mse

In [80]:
def iterate(lamb, max_iteration=1000):
    mse = 0
    for iter in range(max_iteration):
        update_alpha()
        update_betaU(lamb)
        update_betaI(lamb)
        curr_MSE = MSE()
        if iter % 10 == 9:
            print(f"Current Iteration is {iter+1} | MSE: {curr_MSE}")
        if mse == 0 or curr_MSE <= mse:
            mse = curr_MSE
        else:
            print(f"End Iteration is {iter+1} | MSE: {curr_MSE}")
            break
    return curr_MSE

In [81]:
validMSE = iterate(lamb=4.959999999999999)

Current Iteration is 10 | MSE: 2.992272655157726
Current Iteration is 20 | MSE: 2.9912393881609276
Current Iteration is 30 | MSE: 2.990877755823751
Current Iteration is 40 | MSE: 2.9907386098254003
Current Iteration is 50 | MSE: 2.990679621491325
Current Iteration is 60 | MSE: 2.990652571558784
Current Iteration is 70 | MSE: 2.9906394816398763
Current Iteration is 80 | MSE: 2.9906329353147316
Current Iteration is 90 | MSE: 2.9906295995303536
Current Iteration is 100 | MSE: 2.990627882220814
Current Iteration is 110 | MSE: 2.990626993270169
Current Iteration is 120 | MSE: 2.990626531779997
Current Iteration is 130 | MSE: 2.9906262918381854
Current Iteration is 140 | MSE: 2.9906261669866394
Current Iteration is 150 | MSE: 2.990626101994436
Current Iteration is 160 | MSE: 2.9906260681550467
Current Iteration is 170 | MSE: 2.990626050533969
Current Iteration is 180 | MSE: 2.9906260413576597
Current Iteration is 190 | MSE: 2.9906260365788797
Current Iteration is 200 | MSE: 2.990626034090178

In [46]:
validMSE

3.0203673766083035

In [55]:
### Question 8

In [58]:
def iterate(lamb, max_iteration=50):
    mse = 0
    for iter in range(max_iteration):
        update_alpha()
        update_betaU(lamb)
        update_betaI(lamb)
        curr_mse = MSE()
        if mse == 0 or curr_mse <= mse:
            mse = curr_mse
        else:
            print(f"Current Lambda is {lamb} | MSE: {curr_mse}")
            break
        if iter == 29:
            print(f"Current Lambda is {lamb} | MSE: {curr_mse}")
    return mse

In [64]:
# Better lambda...
bestValidMSE = None
bestLamb = 0
for lamb in np.arange(4.9, 5.11, 0.01):
    validMSE = iterate(lamb, max_iteration=30)
    if bestValidMSE == None or validMSE < bestValidMSE:
        bestValidMSE = validMSE
        bestLamb = lamb

Current Lambda is 4.9 | MSE: 2.9906283917334524
Current Lambda is 4.91 | MSE: 2.990627471979278
Current Lambda is 4.92 | MSE: 2.9906268050468863
Current Lambda is 4.93 | MSE: 2.9906263109404017
Current Lambda is 4.9399999999999995 | MSE: 2.9906259852163006
Current Lambda is 4.949999999999999 | MSE: 2.990625825237173
Current Lambda is 4.959999999999999 | MSE: 2.990625828587001
Current Lambda is 4.969999999999999 | MSE: 2.990625993031822
Current Lambda is 4.979999999999999 | MSE: 2.9906263165131315
Current Lambda is 4.989999999999998 | MSE: 2.9906267971386855
Current Lambda is 4.999999999999998 | MSE: 2.9906274331714715
Current Lambda is 5.009999999999998 | MSE: 2.990628223017745
Current Lambda is 5.019999999999998 | MSE: 2.9906291652147052
Current Lambda is 5.029999999999998 | MSE: 2.9906302584188142
Current Lambda is 5.039999999999997 | MSE: 2.9906315013943465
Current Lambda is 5.049999999999997 | MSE: 2.990632893002656
Current Lambda is 5.059999999999997 | MSE: 2.9906344321925604
Curr

In [85]:
bestValidMSE, bestLamb # Fine-tune lambda

(2.9906253154177134, 4.959999999999999)

In [80]:
predictions = open("predictions_Hours.csv", 'w')
for l in open("pairs_Hours.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,g = l.strip().split(',')
    pred=prediction(u,g)
    
    if u in userBiases and g in itemBiases:
        predictions.write(u + ',' + g + ',' + str(pred) + '\n')
    else:
        predictions.write(u + ',' + g + ',' + str(0) + '\n')

predictions.close()