In [1]:
import json
import gzip
import math
import pandas as pd
from collections import defaultdict
import numpy as np
from sklearn import linear_model
import random
import statistics
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, precision_score
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import warnings

warnings.simplefilter('ignore')

In [3]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [4]:
answers = {}

In [5]:
z = gzip.open("./../data/train.json.gz")

In [6]:
dataset = []
for l in z:
    d = eval(l)
    dataset.append(d)

In [7]:
z.close()

In [8]:
### Question 1

In [9]:
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

for d in dataset:
    u,i = d['userID'],d['gameID']
    reviewsPerUser[u].append(d)
    reviewsPerItem[i].append(d)
    
for u in reviewsPerUser:
    reviewsPerUser[u].sort(key=lambda x: x['date'])
    
for i in reviewsPerItem:
    reviewsPerItem[i].sort(key=lambda x: x['date'])

In [10]:
df = pd.DataFrame(dataset)

In [11]:
df['text length'] = df['text'].str.len()
df['ones'] = 1

In [12]:
X = df[['ones', 'hours']]
y = df[['text length']]

In [13]:
mod = linear_model.LinearRegression()
mod.fit(X,y)
predictions = mod.predict(X) 

In [14]:
theta_1 = mod.coef_[0][1]

In [15]:
mse_q1 = mean_squared_error(y, predictions)

In [16]:
answers['Q1'] = [theta_1, mse_q1]

In [17]:
assertFloatList(answers['Q1'], 2)

In [18]:
### Question 2

In [19]:
median_hours = df['hours'].median()

In [20]:
X['log hours'] = df['hours'].transform(lambda x: math.log2(x + 1))
X['sq root hours'] = df['hours'].transform(lambda x: math.sqrt(x))
X['above median'] = df['hours'].transform(lambda x: x > median_hours)
X

Unnamed: 0,ones,hours,log hours,sq root hours,above median
0,1,0.3,0.378512,0.547723,False
1,1,63.5,6.011227,7.968689,True
2,1,0.2,0.263034,0.447214,False
3,1,11.9,3.689299,3.449638,True
4,1,1.4,1.263034,1.183216,False
...,...,...,...,...,...
174995,1,98.6,6.638074,9.929753,True
174996,1,9.9,3.446256,3.146427,False
174997,1,5.4,2.678072,2.323790,False
174998,1,4.4,2.432959,2.097618,False


In [21]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y)
predictions = mod.predict(X)

In [22]:
mse_q2 = mean_squared_error(y, predictions)

In [23]:
answers['Q2'] = mse_q2

In [24]:
assertFloat(answers['Q2'])

In [25]:
### Question 3

In [26]:
X = pd.DataFrame()
X['ones'] = df['ones']
X['lambda 1'] = df['hours'].transform(lambda x: x > 1)
X['lambda 2'] = df['hours'].transform(lambda x: x > 5)
X['lambda 3'] = df['hours'].transform(lambda x: x > 10)
X['lambda 4'] = df['hours'].transform(lambda x: x > 100)
X['lambda 5'] = df['hours'].transform(lambda x: x > 1000)
X

Unnamed: 0,ones,lambda 1,lambda 2,lambda 3,lambda 4,lambda 5
0,1,False,False,False,False,False
1,1,True,True,True,False,False
2,1,False,False,False,False,False
3,1,True,True,True,False,False
4,1,True,False,False,False,False
...,...,...,...,...,...,...
174995,1,True,True,True,False,False
174996,1,True,True,False,False,False
174997,1,True,True,False,False,False
174998,1,True,False,False,False,False


In [27]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y)
predictions = mod.predict(X)

In [28]:
mse_q3 = mean_squared_error(y, predictions)

In [29]:
answers['Q3'] = mse_q3

In [30]:
assertFloat(answers['Q3'])

In [31]:
### Question 4

In [32]:
X = df[['ones', 'text length']]
X

Unnamed: 0,ones,text length
0,1,2
1,1,95
2,1,211
3,1,197
4,1,141
...,...,...
174995,1,760
174996,1,87
174997,1,29
174998,1,63


In [33]:
y = df[['hours']]

In [34]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y)
predictions = mod.predict(X)

In [35]:
mse = mean_squared_error(y, predictions)
mae = mean_absolute_error(y, predictions)

In [36]:
mse, mae

(75735.70018272949, 90.35613031985211)

In [37]:
answers['Q4'] = [mse, mae, "MAE is better in this case because we are just trying to find the generalized result"]

In [38]:
assertFloatList(answers['Q4'][:2], 2)

In [39]:
### Question 5

In [40]:
y_trans = df['hours'].transform(lambda x: math.log2(x + 1))

In [41]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y_trans)
predictions_trans = mod.predict(X)

In [42]:
mse_trans = mean_squared_error(y_trans, predictions_trans)

In [43]:
predictions_untrans = list(map(lambda x: 2**x - 1, predictions_trans))

In [44]:
mse_untrans = mean_squared_error(y, predictions_untrans)

In [45]:
answers['Q5'] = [mse_trans, mse_untrans]

In [46]:
assertFloatList(answers['Q5'], 2)

In [47]:
### Question 6

In [48]:
def feat6(d):
    val = [0] * 100
    hours = int(d['hours'])
    if (hours > 99):
        val[99] = 1
    else:
        val[hours] = 1
    return val

In [49]:
X = [feat6(d) for d in dataset]
y = [len(d['text']) for d in dataset]

In [50]:
Xtrain, Xvalid, Xtest = X[:len(X)//2], X[len(X)//2:(3*len(X))//4], X[(3*len(X))//4:]
ytrain, yvalid, ytest = y[:len(X)//2], y[len(X)//2:(3*len(X))//4], y[(3*len(X))//4:]

In [51]:
alphas = [1, 10, 100, 1000, 10000]
best_alpha = None
best_mse_val = float('inf')
best_mse_test = None

for alpha in alphas:
    # Fit Ridge regression model
    model = linear_model.Ridge(alpha=alpha)
    model.fit(Xtrain, ytrain)

    # Predict on the validation set
    y_pred_val = model.predict(Xvalid)

    # Calculate MSE on validation set
    mse_val = mean_squared_error(yvalid, y_pred_val)

    if mse_val < best_mse_val:
        best_mse_val = mse_val
        best_alpha = alpha
        # Predict on the test set with the best model
        y_pred_test = model.predict(Xtest)
        best_mse_test = mean_squared_error(ytest, y_pred_test)

In [52]:
best_alpha, best_mse_val, best_mse_test

(1000, 581432.8208480754, 560786.7645482325)

In [53]:
answers['Q6'] = [best_alpha, best_mse_val, best_mse_test]

In [54]:
assertFloatList(answers['Q6'], 3)

In [55]:
### Question 7

In [56]:
times = [d['hours_transformed'] for d in dataset]
median = statistics.median(times)

In [57]:
nNotPlayed = df['hours'].transform(lambda x: x < 1).sum()

In [58]:
answers['Q7'] = [median, nNotPlayed]

In [59]:
assertFloatList(answers['Q7'], 2)

In [60]:
### Question 8

In [61]:
X = df[['ones', 'text length']]
y = [d['hours_transformed'] > median for d in dataset]

In [62]:
mod = linear_model.LogisticRegression(class_weight='balanced')
mod.fit(X,y)
predictions = mod.predict(X) # Binary vector of predictions

In [63]:
def rates(predictions, y):
    TP = [a and b for (a,b) in zip(predictions,y)]
    TN = [not a and not b for (a,b) in zip(predictions,y)]
    FP = [a and not b for (a,b) in zip(predictions,y)]
    FN = [not a and b for (a,b) in zip(predictions,y)]

    TP = sum(TP)
    TN = sum(TN)
    FP = sum(FP)
    FN = sum(FN)
    
    return TP, TN, FP, FN

In [64]:
TP, TN, FP, FN = rates(predictions, y)

In [65]:
BER = 0.5 * (FP / (TN + FP) + FN / (FN + TP))

In [66]:
BER

0.472506390561468

In [67]:
answers['Q8'] = [TP, TN, FP, FN, BER]

In [68]:
assertFloatList(answers['Q8'], 5)

In [69]:
### Question 9

In [70]:

y_pred_prob = mod.predict_proba(X)[:, 1]  # Probabilities of being in the positive class
k_values = [5, 10, 100, 1000]
precisions_at_k = []

for k in k_values:
    # Sort predictions by predicted probability in descending order
    sorted_indices = np.argsort(-y_pred_prob)
    
    # Determine the threshold for the k-th ranked prediction
    threshold = y_pred_prob[sorted_indices[k - 1]]
    
    # Convert y and predictions to NumPy arrays
    y_np = np.array(y)
    predictions_np = np.array(predictions)
    
    # Find the indices of all predictions with probability >= threshold
    selected_indices = np.where(y_pred_prob >= threshold)[0]
    
    # Compute precision for these selected predictions
    precision = precision_score(y_np[selected_indices], predictions_np[selected_indices])
    
    precisions_at_k.append(precision)

# Print precision@k values
for i, k in enumerate(k_values):
    print(f'Precision@{k} = {precisions_at_k[i]:.4f}')


Precision@5 = 0.5455
Precision@10 = 0.5455
Precision@100 = 0.6700
Precision@1000 = 0.6850


In [71]:
answers['Q9'] = precisions_at_k

In [72]:
assertFloatList(answers['Q9'], 4)

In [73]:
### Question 10

In [74]:
y_trans = [d['hours_transformed'] for d in dataset]

In [75]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y_trans)
predictions_trans = mod.predict(X)

In [76]:
your_threshold = 3.7
predictions_thresh = list(map(lambda x: x > your_threshold, predictions_trans))
TP, TN, FP, FN = rates(predictions_thresh, y_trans)
BER = 0.5 * (FP / (TN + FP) + FN / (FN + TP))
BER

0.4260170326351089

In [77]:
answers['Q10'] = [your_threshold, BER]

In [78]:
assertFloatList(answers['Q10'], 2)

In [79]:
### Question 11

In [80]:
dataTrain = dataset[:int(len(dataset)*0.9)]
dataTest = dataset[int(len(dataset)*0.9):]

In [81]:
df_train = pd.DataFrame(dataTrain)
df_test = pd.DataFrame(dataTest)

In [82]:
userMedian = defaultdict(list)
itemMedian = defaultdict(list)

reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

ratingPerUser = defaultdict(list)
ratingPerItem = defaultdict(list)


for d in dataset:
    u,i,h_t = d['userID'],d['gameID'], d['hours_transformed']
    ratingPerUser[u].append(h_t)
    ratingPerItem[i].append(h_t)

for user in ratingPerUser:
    userMedian[user] = statistics.median(ratingPerUser[user])

for item in ratingPerItem:
    itemMedian[item] = statistics.median(ratingPerItem[item])

for user in ratingPerUser:
    userMedian[user] = statistics.median(ratingPerUser[user])

for item in ratingPerItem:
    itemMedian[item] = statistics.median(ratingPerItem[item])

In [83]:
answers['Q11'] = [itemMedian['g35322304'], userMedian['u55351001']]

In [84]:
assertFloatList(answers['Q11'], 2)

In [85]:
### Question 12

In [86]:
global_median = df_train['hours_transformed'].median()
def f12(u,i):
    if u in ratingPerUser:
        if i in ratingPerItem:
            if itemMedian[i] > global_median:
                return 1
        elif userMedian[u] > global_median:
                return 1
    return 0

In [87]:
preds = [f12(d['userID'], d['gameID']) for d in dataTest]

In [88]:
global_median

3.4724877714627436

In [89]:
y = df_test['hours_transformed'] > global_median

In [90]:
accuracy = accuracy_score(y, preds)

In [91]:
accuracy

0.7468

In [92]:
answers['Q12'] = accuracy

In [93]:
assertFloat(answers['Q12'])

In [94]:
### Question 13

In [95]:
usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
itemNames = {}

for d in dataset:
    user,item = d['userID'], d['gameID']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)

In [96]:
def Jaccard(s1, s2):
    numerator = len(s1. intersection (s2))
    denominator = len(s1.union(s2))
    return numerator / denominator

In [97]:
def mostSimilar (i, K): 
    similarities = []
    users = usersPerItem [i] # Users who have purchased i
    for j in usersPerItem : # Compute similarity against each
        if j == i: continue
        sim = Jaccard(users , usersPerItem [j])
        similarities .append ((sim ,j))
    similarities .sort(reverse=True) # Sort to find the most
    return similarities [:K]

In [98]:
ms = mostSimilar(dataset[0]['gameID'], 10)

In [99]:
answers['Q13'] = [ms[0][0], ms[-1][0]]

In [100]:
assertFloatList(answers['Q13'], 2)

In [101]:
### Question 14

In [102]:
def Cosine(i1, i2, ratingDict):
    # Between two items
    inter = usersPerItem[i1].intersection(usersPerItem[i2])
    numer = 0
    denom1 = 0
    denom2 = 0
    for u in inter:
        numer += ratingDict[(u, i1)] * ratingDict[(u, i2)]
    for u in usersPerItem[i1]:
        denom1 += ratingDict[(u, i1)]**2
    for u in usersPerItem[i2]:
        denom2 += ratingDict[(u, i2)]**2
    denom = math.sqrt(denom1) * math.sqrt(denom2)
    if denom == 0:
        return 0
    return numer / denom

def mostSimilar(i, K, ratingDict):
    similarities = []
    users = usersPerItem[i]  # Users who have purchased item i
    for j in usersPerItem:  # Compute similarity against each item
        if j == i:
            continue
        sim = Cosine(i, j, ratingDict)
        similarities.append((sim, j))
    similarities.sort(reverse=True, key=lambda x: x[0])  # Sort to find the most similar
    return similarities[:K]

# Initialize ratingDict based on your dataset
ratingDict = {}
for d in dataset:
    u, i = d['userID'], d['gameID']
    lab = 1 if d['hours_transformed'] > global_median else -1
    ratingDict[(u, i)] = lab

# Calculate the most similar items to the first gameID in the dataset
ms = mostSimilar(dataset[0]['gameID'], 10, ratingDict)

In [103]:
answers['Q14'] = [ms[0][0], ms[-1][0]]

In [104]:
assertFloatList(answers['Q14'], 2)

In [105]:
### Question 15

In [106]:
ratingDict = {}

for d in dataset:
    u,i = d['userID'], d['gameID']
    lab = d['hours_transformed']
    ratingDict[(u,i)] = lab

In [107]:
ms = mostSimilar(dataset[0]['gameID'], 10, ratingDict)

In [108]:
answers['Q15'] = [ms[0][0], ms[-1][0]]

In [109]:
assertFloatList(answers['Q15'], 2)

In [110]:
f = open("answers_midterm.txt", 'w')
f.write(str(answers) + '\n')
f.close()

In [111]:
answers

{'Q1': [0.007857269704335982, 570936.2842458971],
 'Q2': 565419.5340402178,
 'Q3': 565405.439588582,
 'Q4': [75735.70018272949,
  90.35613031985211,
  'MAE is better in this case because we are just trying to find the generalized result'],
 'Q5': [5.255254235328314, 78668.5650295673],
 'Q6': [1000, 581432.8208480754, 560786.7645482325],
 'Q7': [3.4724877714627436, 19913],
 'Q8': [24656, 67811, 20007, 62526, 0.472506390561468],
 'Q9': [0.5454545454545454, 0.5454545454545454, 0.67, 0.685],
 'Q10': [3.7, 0.4260170326351089],
 'Q11': [0.5849625007211562, 2.350497247084133],
 'Q12': 0.7468,
 'Q13': [0.07988165680473373, 0.04390243902439024],
 'Q14': [0.10251693271055495, 0.061667331307041336],
 'Q15': [0.3301567230633555, 0.12290154232706599]}