# Assignment 1  

### Import data

In [1]:
import gzip
from collections import defaultdict
import numpy as np
import random
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

In [2]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [3]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [4]:
def readJSON(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        d = eval(l)
        u = d['userID']
        g = d['gameID']
        yield u,g,d

In [5]:
# Some data structures that will be useful

In [6]:
allHours = []
for l in readJSON("./Data/train.json.gz"):
    allHours.append(l)

### Partition data 

In [7]:
hoursTrain = allHours[:165000]
hoursValid = allHours[165000:]
hoursValid[0]

('u00914251',
 'g61913894',
 {'hours': 17.1,
  'text': 'This is the perfect gift for my friend!!!',
  'gameID': 'g61913894',
  'hours_transformed': 4.177917792195843,
  'early_access': False,
  'user_id': '76561198116282623',
  'date': '2017-12-24',
  'userID': 'u00914251'})

### Part 1

In [8]:
# Any other preprocessing...
gamesPerUser = {}
UserPerGame = {}
for u,g,d in hoursTrain:
    if u not in gamesPerUser:
        gamesPerUser[u] = [g]
    else:
        gamesPerUser[u].append(g)
    if g not in UserPerGame:
        UserPerGame[g] = [u]
    else:
        UserPerGame[g].append(u)

### Adding negative pairs

In [9]:
# Define a function to randomly select a game that a user hasn't played
def ranGame(user):
    '''random game user hasnt played'''
    ran = random.choice(list(UserPerGame.keys()))
    
    # Check if the user hasn't played any games
    if user not in gamesPerUser:
        return ran

    # Check if the user has played all the games
    if len(gamesPerUser[user]) == len(UserPerGame):
        print('all games played')
        return ran
    
    # Keep selecting a random game until it is one that the user hasn't played
    while ran in gamesPerUser[user]:
        ran = random.choice(list(UserPerGame.keys()))
    return ran
    
# Create a new list to store the modified validation hours
newHoursValid = []
for u,g,d in hoursValid:
    newHoursValid.append((u,g,1))
    newHoursValid.append((u,ranGame(u),0))

# Create a new list to store the modified training hours
newHoursTrain = []
for u,g,d in hoursTrain:
    newHoursTrain.append((u,g,1))
    newHoursTrain.append((u,ranGame(u),0))


### Compute most popular

In [10]:
# Initialize a defaultdict to count the number of times each game is played
gameCount = defaultdict(int)
# Initialize a variable to keep track of the total number of games played
totalPlayed = 0

# Iterate over the training hours
for user, game, _ in hoursTrain:
  # Increment the count for the current game
  gameCount[game] += 1
  # Increment the total number of games played
  totalPlayed += 1

# Create a list of tuples containing the game count and game ID
mostPopular = [(gameCount[x], x) for x in gameCount]
# Sort the list in descending order based on the game count
mostPopular.sort(reverse=True)

# Create a set to store the most popular games
return1 = set()
# Initialize a variable to keep track of the cumulative game count
count = 0

# Iterate over the sorted list of most popular games
for ic, i in mostPopular:
  # Add the game ID to the set of most popular games
  return1.add(i)
  # Increment the cumulative game count
  count += ic
  # Check if the cumulative game count exceeds the threshold (68.6868% of total games played)
  if count > totalPlayed * 0.686868686868687:
    break


### similarity and training functions

In [25]:
# Function to calculate Jaccard similarity between two sets
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))  # Calculate the intersection of the two sets
    denom = len(s1.union(s2))  # Calculate the union of the two sets
    if denom == 0:
        return 0
    return numer / denom  # Return the Jaccard similarity coefficient

# Function to calculate the similarity between a user and a game
def similar(u, g, s=1):
    siml = []
    for i in gamesPerUser[u]:
        if i == g:
            continue
        if g in return1:
            siml.append(Jaccard(set(UserPerGame[g]), set(UserPerGame[i])) * s)
        else:
            siml.append(Jaccard(set(UserPerGame[g]), set(UserPerGame[i])))
    if len(siml) == 0:
        return 0

    siml.sort(reverse=True)

    if len(siml) // 2 > 1:
        siml = siml[:len(siml) // 2]
    else:
        return siml[0]

    simAvg = sum(siml) / len(siml)
    return simAvg

# Function to calculate the similarity between a user and all games in the training set
def getTrainSim(s=1):
    sim3 = []

    for u, g, _ in newHoursTrain:
        if u not in gamesPerUser:
            sim3.append(0)
        else:
            sim3.append(similar(u, g, s))
    return sim3

# Function to calculate the similarity between a user and all games in the validation set
def getValidSim(s=1):
    sim3 = []

    for u, g, _ in newHoursValid:
        if u not in gamesPerUser:
            sim3.append(0)
        else:
            sim3.append(similar(u, g, s))
    return sim3

# Function to train the model by finding the best threshold value
def trainModel(thres_start=0.009, thres_end=0.04, N_Tstep=500, s_start=1, s_end=1.1, N_Sstep=3):
    sim1 = getTrainSim()
    print('training model: Thresholds')

    best_threshold = 0
    best_acc = 0
    y_actual = [d for _, _, d in newHoursTrain]
    for thres in np.linspace(thres_start, thres_end, N_Tstep):
        y_pred = np.array(sim1) > thres
        acc = sum(y_pred == y_actual) / len(y_actual)
        if acc > best_acc:
            best_acc = acc
            best_threshold = thres

    print('best threshold: ', best_threshold, 'Acc: ', best_acc)

    return best_threshold

# Function to validate the model using a given threshold value
def validateModel(thres):
    sim1 = getValidSim()
    y_actual = [d for _, _, d in newHoursValid]
    y_pred = np.array(sim1) > thres
    acc = sum(y_pred == y_actual) / len(y_actual)
    print('Accuracy: ', acc)
    return acc

# Function to predict whether a user will play a game based on similarity and threshold values
def predict(u, g, s=1, thres=0.001):
    if u not in gamesPerUser:
        return 0
    sim = similar(u, g, s)
    if sim > thres:
        return 1
    return 0

### train threshold 

In [12]:
best_threshold =trainModel()
print( 'best_threshold: ', best_threshold)
print(validateModel(best_threshold))


training model: Thresholds
best threshold:  0.01744889779559118 Acc:  0.8678060606060606
best_threshold:  0.01744889779559118
Accuracy:  0.7075707570757076
0.7075707570757076


### Logistic regression

In [13]:
# Define a function to extract features for logistic regression
def feature(u, g):
    feat = [1, predict(u, g, 1, 0.0173)]  # Add bias term and similarity prediction
    feat.append(gameCount[g] / totalPlayed if g in return1 else 0)  # Add popularity feature
    return feat

# Create feature matrix X and target vector y for logistic regression
X = [feature(u, g) for u, g, _ in newHoursTrain]
y = [d for _, _, d in newHoursTrain]

# Initialize a logistic regression classifier with balanced class weights
clf = linear_model.LogisticRegression(class_weight='balanced')


In [26]:
print('Logistic Regression Model:')
clf.fit(X, y)
print('Logistic Regression Accuracy: ', clf.score(X,y))

Logistic Regression Model:
Logistic Regression Accuracy:  0.8676515151515152


In [28]:
# Create feature matrix X_valid and target vector Y_valid_actual for validation set
X_valid = [feature(u,g) for u,g,_ in newHoursValid]
Y_valid_actual = [d for _,_,d in newHoursValid]

# Print the accuracy of the logistic regression model on the validation set
print('Logistic Regression Accuracy on Validation Set: ', clf.score(X_valid,Y_valid_actual))

Logistic Regression Accuracy on Validation Set:  0.7086708670867087


### Solution test

In [16]:
predictions = open("predictions_Played.csv", 'w')
pred = []
for l in open("./Data/pairs_Played.csv"):
    if l.startswith("userID"):
        
        predictions.write(l)
        continue
    u,g = l.strip().split(',')
    pred = clf.predict([feature(u,g)])[0]
    _ = predictions.write(u + ',' + g + ',' + str(pred) + '\n')

predictions.close()

# Part 2: hours played prediction 
### Preprocessing 


In [17]:
# Create dictionaries to map user and item IDs to numerical indices
userIDs = {}
itemIDs = {}

# Create a list to store the interactions between users, items, and hours played
interactions = []

# Iterate over all the hours data
for u, g, d in allHours:
    # Map user IDs to numerical indices
    if u not in userIDs:
        userIDs[u] = len(userIDs)
    
    # Map item IDs to numerical indices
    if g not in itemIDs:
        itemIDs[g] = len(itemIDs)
    
    # Append the interaction to the list, including the transformed hours played
    interactions.append((u, g, d['hours_transformed']))


In [18]:
# Split the interactions into training and test sets
nTrain = int(len(interactions) * 0.75)
interactionsTrain = interactions[:nTrain]
interactionsTest = interactions[nTrain:]

# Extract the hours played from the training set
trainHours = [r for u,i,r in interactionsTrain]

# Calculate the global average of hours played
globalAverage = sum(trainHours) * 1.0 / len(trainHours)

In [19]:
# Create dictionaries to store the items per user and users per item
itemsPerUser = defaultdict(list)
usersPerItem = defaultdict(list)

# Create dictionaries to store the hours per user and hours per item
hoursPerUser = defaultdict(set)
hoursPerItem = defaultdict(set)

# Iterate over the training interactions
for u, i, r in interactionsTrain:
    # Append the item to the list of items for the current user
    itemsPerUser[u].append(i)
    
    # Append the user to the list of users for the current item
    usersPerItem[i].append(u)

# Iterate over the training interactions
for u, i, d in interactionsTrain:
    # Extract the hours from the interaction
    hours = d
    
    # Add the user and hours to the set of hours for the current item
    hoursPerItem[i].add((u, hours))
    
    # Add the item and hours to the set of hours for the current user
    hoursPerUser[u].add((i, hours))

### Functions

In [20]:
# Function to predict the rating for a user-item pair
def predict(u, g):
    global alpha
    global betaU
    global betaI
    
    bu = 0
    bi = 0
    if u in betaU:
        bu = betaU[u]
    if g in betaI:
        bi = betaI[g]
    return alpha + bu + bi

# Function to calculate the mean squared error (MSE) on the validation set
def mseValidate():
    mse = 0
    for u, g, r in interactionsTest:
        prediction = predict(u, g)
        mse += (r - prediction) ** 2
    mse /= len(interactionsTest)
    return mse

# Function to perform one iteration of the training process with separate regularization terms for betaU and betaI
def iterate(lami, lamu, lr):
    global alpha
    global betaU
    global betaI
    lalpha = alpha
    lbetaU = betaU.copy()
    lbetaI = betaI.copy()

    # Update alpha
    a = sum(r - (betaU[u] + betaI[i]) for u, i, r in interactionsTrain)
    b = a / len(interactionsTrain)
    alpha = lalpha + lr * (b - lalpha)

    # Update betaU
    for u in hoursPerUser:
        sm = sum(i[1] - (alpha + betaI[i[0]]) for i in hoursPerUser[u])
        sm = sm / (lamu + len(hoursPerUser[u]))
        betaU[u] = lbetaU[u] + lr * (sm - lbetaU[u])

    # Update betaI
    for u in hoursPerItem:
        sm = sum(i[1] - (alpha + betaU[i[0]]) for i in hoursPerItem[u])
        sm = sm / (lami + len(hoursPerItem[u]))
        betaI[u] = lbetaI[u] + lr * (sm - lbetaI[u])
        
    # Calculate MSE on the training set
    mse = 0
    for u, g, r in interactionsTrain:
        prediction = alpha + betaU[u] + betaI[g]
        mse += (r - prediction) ** 2
        
    # Calculate the regularization terms
    regularizeru = 0
    regularizeri = 0
    for u in betaU:
        regularizeru += betaU[u] ** 2
    for g in betaI:
        regularizeri += betaI[g] ** 2

    mse /= len(interactionsTrain)
    return mse, mse + lamu * regularizeru + lami * regularizeri


### Find lambda combinations 

In [None]:
pairs = []

# Iterate over different lambda values for regularization
for i in range(10):
    for j in range(10):
        print('lamda: ', i, 'lamu: ', j)
        
        # Initialize betaU and betaI dictionaries
        betaU = {}
        betaI = {}
        
        # Set initial values for betaU and betaI
        for u in hoursPerUser:
            betaU[u] = 0
        for g in hoursPerItem:
            betaI[g] = 0
        
        # Set initial value for alpha as the global average
        alpha = globalAverage 
        
        mse, objective = (100, 100)
        newMSE, newObjective = iterate(i, j, 0.78)
        itera = 0
        
        # Perform iterations until convergence or maximum number of iterations reached
        while itera < 10 or objective - newObjective > 0.01:
            mse, objective = newMSE, newObjective
            newMSE, newObjective = iterate(i, j, 0.78)
            itera += 1
            print("MSE after " + str(itera) + " iterations = " + str(newMSE))
            
            # Break the loop if maximum number of iterations reached
            if itera == 100:
                break
        
        # Calculate MSE on the test data
        msev = mseValidate()
        pairs.append((msev, i, j))
        print("MSE on test data = " + str(msev), 'MSE on train data: ', newMSE)


In [None]:
# sort pairs by mse
pairs.sort()

print(pairs[:10])

[(3.033369360703887, 1, 9), (3.0336400431759603, 2, 9), (3.033987729055787, 1, 8), (3.0342888951331837, 2, 8), (3.035230490333241, 0, 9), (3.0352734667980723, 1, 7), (3.035317365711952, 3, 9), (3.03560744068551, 2, 7), (3.035821592820544, 0, 8), (3.036000730854622, 3, 8)]


### Train

In [22]:
# Initialize betaU and betaI dictionaries
betaU = {}
betaI = {}

# Set initial values for betaU and betaI
for u in hoursPerUser:
    betaU[u] = 0

for g in hoursPerItem:
    betaI[g] = 0

alpha = globalAverage 

mse, objective = (100, 100)
newMSE, newObjective = iterate(1, 9, 0.78)
itera = 0

# Perform iterations until convergence or maximum number of iterations reached
while itera < 10 or objective - newObjective > 0.01:
    mse, objective = newMSE, newObjective
    newMSE, newObjective = iterate(1, 9, 0.78)
    itera += 1
    print("MSE after " + str(itera) + " iterations = " + str(mse))
    
    # Calculate MSE on the test data
    mse = mseValidate()
    print("MSE on test data = " + str(mse), 'MSE on train data: ', newMSE)
    
    # Break the loop if maximum number of iterations reached
    if itera == 100:
        break


MSE after 1 iterations = 2.964324216680804
MSE on test data = 3.0603090612938795 MSE on train data:  2.8010752933263463
MSE after 2 iterations = 2.8010752933263463
MSE on test data = 3.038502818055278 MSE on train data:  2.7751090158180425
MSE after 3 iterations = 2.7751090158180425
MSE on test data = 3.0346492696801435 MSE on train data:  2.770222810597672
MSE after 4 iterations = 2.770222810597672
MSE on test data = 3.033820087676113 MSE on train data:  2.7691980152341418
MSE after 5 iterations = 2.7691980152341418
MSE on test data = 3.0335852709066202 MSE on train data:  2.7689501988237657
MSE after 6 iterations = 2.7689501988237657
MSE on test data = 3.033498915319792 MSE on train data:  2.768876939362853
MSE after 7 iterations = 2.768876939362853
MSE on test data = 3.033461033195304 MSE on train data:  2.768848545271122
MSE after 8 iterations = 2.768848545271122
MSE on test data = 3.0334424431723503 MSE on train data:  2.768833632473361
MSE after 9 iterations = 2.768833632473361
M

### Data test output 

In [23]:
predictions = open("predictions_Hours.csv", 'w')
for l in open("./Data/pairs_Hours.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,g = l.strip().split(',')
    if u in betaU and g in betaI:
        pred = predict(u,g)
    elif u in betaU and g not in betaI:
        pred = alpha + betaU[u] +np.mean([b for b in betaI.values()])
    elif u not in betaU and g in betaI:
        pred = alpha + betaI[g] + np.mean([b for b in betaU.values()])
    else:
        pred = globalAverage
   
    _ = predictions.write(u + ',' + g + ',' + str(pred) + '\n')

predictions.close()