In [27]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
import string
from sklearn import linear_model
import sys
from tqdm.notebook import tqdm

In [2]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [3]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [7]:
def readJSON(path):
    f = gzip.open(path, 'rt', encoding='utf-8')
    f.readline()
    for l in f:
        d = eval(l)
        u = d['userID']
        g = d['gameID']
        yield u,g,d

In [9]:
# Some data structures that will be useful

In [10]:
allHours = []
for l in readJSON("./../data/train.json.gz"):
    allHours.append(l)

In [11]:
#print(len(allHours))
print(allHours[0])

('u70666506', 'g49368897', {'userID': 'u70666506', 'early_access': False, 'hours': 63.5, 'hours_transformed': 6.011227255423254, 'found_funny': 1, 'text': 'If you want to sit in queue for 10-20min and have 140 ping then this game is perfect for you :)', 'gameID': 'g49368897', 'user_id': '76561198030408772', 'date': '2017-05-20'})


In [12]:
hoursTrain = allHours[:165000]
hoursValid = allHours[165000:]

In [13]:
##################################################
# Play prediction                                #
##################################################

In [14]:
# Any other preprocessing...
gamesPerUser = defaultdict(set)
usersPerGame = defaultdict(set)

train_games = set()
playersPerGame = defaultdict(int)
negative_set = []
totalPlayed = 0

for user,game, data in hoursTrain:
    gamesPerUser[user].add(game)
    usersPerGame[game].add(user)
    train_games.add(game)
    playersPerGame[game] += 1
    totalPlayed+= 1

train_games = list(train_games)

for d in hoursValid:
    user = d[0]
    not_match = [d for d in train_games if d not in gamesPerUser[user]]
    rand = random.randint(1, len(not_match)) - 1
    game = not_match[rand]
    negative_set.append([user,game, None])

mostPopular = [(playersPerGame[x], x) for x in playersPerGame]
mostPopular.sort()
mostPopular.reverse()

In [15]:
totalPlayed

165000

In [16]:
# Any other preprocessing...
X_train = [len(d[2]['text']) for d in hoursTrain]

In [17]:
# Evaluate baseline strategy



In [18]:
# Get games that are the top percentage of games played
def pop_val(percentile):
    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > totalPlayed*(percentile/100): break
    return return1

In [19]:
#len(pop_val(99))/len(mostPopular)

In [20]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer/denom

In [21]:
def mostSimilar(i, threshold, popular):
    # Iterate through all items j (other than i); compute Jaccard similarity; return most similar
    similarities = []
    game_pop = 0
    users = usersPerGame[i[1]]
    if (i[1] not in popular):
        return 0

    for j in gamesPerUser[i[0]]:  # For all games that the user played
        if i[0] == j:          # Dont compare i to i
            continue
        sim = Jaccard(users, usersPerGame[j])
        similarities.append((sim,j))    # Keep track of sim and the item that generated it
    similarities.sort(reverse=True)
    if (len(similarities) > 0 and ((similarities[0][0] + game_pop) >= threshold )):
        return 1
    else:
        return 0

In [22]:
#percentile = 70
#threshold = 0.03
def sim_comp(user, game, percentile=65, threshold=0.019):
    #print(str(percentile), str(threshold))
    popular = pop_val(percentile)
    return mostSimilar([user,game], threshold, popular)

In [23]:
# Testing

In [24]:
valid_set = [1 if (sim_comp(d[0], d[1],65,0.02) == 1) else 0 for d in hoursValid]
neg_set = [1 if (sim_comp(d[0], d[1],65,0.02) == 0) else 0 for d in negative_set]
precision_4 = (sum(valid_set) + sum(neg_set)) / (len(hoursValid) * 2)
print(precision_4)

0.7017201720172017


In [28]:
tester_file = open("test.txt", 'w')
for i in tqdm(range(41)):
    threshold = 0.00 + i * 0.001
    #for j in range(21):
    percentile = 65
    valid_set = [1 if (sim_comp(d[0], d[1], percentile, threshold) == 1) else 0 for d in hoursValid]
    neg_set = [1 if (sim_comp(d[0], d[1], percentile, threshold) == 0) else 0 for d in negative_set]
    precision_4 = (sum(valid_set) + sum(neg_set)) / (len(hoursValid) * 2)
    tester_file.write("Threshold: " + str(threshold) +" Percentile: " + str(percentile) +" Precision: "+ str(precision_4) + " \n")

tester_file.close()

  0%|          | 0/41 [00:00<?, ?it/s]

In [None]:
# Reporting results

In [40]:
predictions = open("predictions_Played.csv", 'w')
for l in open("./../data/pairs_Played.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,g = l.strip().split(',')
    
    # Logic...
    pred = sim_comp(u,g)
    _ = predictions.write(u + ',' + g + ',' + str(pred) + '\n')

predictions.close()

In [41]:
##################################################
# Hours played prediction                        #
##################################################

In [42]:
hoursPerUser = defaultdict(list)
hoursPerItem = defaultdict(list)
hours_pergameuser = defaultdict(int)

for d in hoursTrain:
    hoursPerItem[d[1]].append(d[2]['hours_transformed'])
    hoursPerUser[d[0]].append(d[2]['hours_transformed'])
    hours_pergameuser[d[0], d[1]] = (d[2]['hours_transformed'])

In [43]:
trainHours = [r[2]['hours_transformed'] for r in hoursTrain]
globalAverage = sum(trainHours) * 1.0 / len(trainHours)
print(trainHours[0])
alpha = globalAverage # Could initialize anywhere, this is a guess

6.011227255423254


In [44]:
alpha = globalAverage
betaU = {}
betaI= {}
lowest = sys.float_info.max
for u in hoursPerUser:
    betaU[u] = 0

for g in hoursPerItem:
    betaI[g] = 0

In [45]:
def iterate(alpha, betaU, betaI, lamb):
    # Run alpha
    alpha = sum([hours_pergameuser[user,game] - (betaU[user] + betaI[game]) for  user,game,data in hoursTrain])
    alpha = alpha / len(hoursTrain)

    # Run betaU
    for user in gamesPerUser:
        betaU[user] = sum([hours_pergameuser[user, game] - (alpha + betaI[game]) for game in gamesPerUser[user]])
        betaU[user] = betaU[user] / (len(gamesPerUser[user]) + lamb)

    # Run betaI
    for game in usersPerGame:
        betaI[game] = sum(hours_pergameuser[user, game] - (alpha + betaU[user]) for user in usersPerGame[game])
        betaI[game] = betaI[game] / (len(usersPerGame[game]) + lamb)

    #mse = 0
    #for u,g,d in hoursTrain:
    #    r = d['hours_transformed']
    #    prediction = alpha + betaU[u] + betaI[g]
    #    mse += (r - prediction)**2
    #regularizer = 0
    #for u in betaU:
    #    regularizer += betaU[u]**2
    #for g in betaI:
    #    regularizer += betaI[g]**2
    #mse /= len(hoursTrain)
    #return mse, mse + lamb*regularizer
    # Get error
    ret = 0
    for d in hoursTrain:
        ret += alpha + betaU[d[0]] + betaI[d[1]] - hours_pergameuser[d[0], d[1]]
    for u in betaU:
        ret += betaU[u]**2
    for i in betaI:
        ret += betaI[i]**2
    return ret

In [46]:
mse,objective = iterate(alpha,betaU, betaI, 5)
newMSE,newObjective = iterate(alpha,betaU, betaI, 5)
iterations = 2

TypeError: cannot unpack non-iterable float object

In [None]:
while iterations < 10 or objective - newObjective > 0.01:
    mse, objective = newMSE, newObjective
    newMSE, newObjective = iterate(alpha,betaU, betaI, 5)
    iterations += 1
    print("Objective after "
        + str(iterations) + " iterations = " + str(newObjective))
    print("MSE after "
        + str(iterations) + " iterations = " + str(newMSE))

In [39]:
best_betaU = betaU.copy()
best_betaI = betaI.copy()
best_alpha = alpha
lamda_q8 = 4.0
for i in tqdm(range(500)):
    train_val = iterate(alpha, betaU, betaI, lamda_q8)
    #print(train_val)
    if (train_val < lowest):
        best_alpha = alpha
        best_betaU  = betaU.copy()
        best_betaI = betaI.copy()
        lowest = train_val 

  0%|          | 0/500 [00:00<?, ?it/s]

In [47]:
betaI = best_betaI
betaU = best_betaU

In [48]:
def get_ui(user, game):
    return [betaU[user], betaI[game]]

In [49]:
# Testing

In [50]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [53]:
pred = []
label = []
for d in hoursValid:
    label.append(d[2]['hours_transformed'])
    pred.append(alpha + betaU[d[0]] + betaI[d[1]])

validMSE = MSE(pred,label)
validMSE

3.334494693630515

In [54]:
predictions = open("predictions_Hours.csv", 'w')
for l in open("./../data/pairs_Hours.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,g = l.strip().split(',')
    
    # Logic...
    bu, bi = get_ui(u,g)
    _ = predictions.write(u + ',' + g + ',' + str(alpha + bu + bi) + '\n')

predictions.close()