In [150]:
import json
import gzip
import math
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn import linear_model
import random
import statistics

In [151]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [152]:
answers = {}

In [153]:
# From https://cseweb.ucsd.edu/classes/fa24/cse258-b/files/steam.json.gz

from google.colab import drive
drive.mount('/content/drive')

z = gzip.open("/content/drive/My Drive/DSC 256R/steam.json.gz")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [154]:
dataset = []
for l in z:
    d = eval(l)
    dataset.append(d)

In [155]:
z.close()

In [156]:
### Question 1

In [157]:
def MSE(y, ypred):
    diffs = [(a-b)**2 for (a,b) in zip(y,ypred)]
    return sum(diffs) / len(diffs)

In [158]:
def MAE(y, ypred):
    diffs = [math.fabs(a-b) for (a,b) in zip(y,ypred)]
    return sum(diffs) / len(diffs)

In [159]:
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

for d in dataset:
    u,i = d['userID'],d['gameID']
    reviewsPerUser[u].append(d)
    reviewsPerItem[i].append(d)

for u in reviewsPerUser:
    reviewsPerUser[u].sort(key=lambda x: x['date'])

for i in reviewsPerItem:
    reviewsPerItem[i].sort(key=lambda x: x['date'])

In [160]:
def feat1(d):
  return [1, len(d['text']) ]

In [161]:
# Prepare the data: feature X (review length) and target y (hours played)
X = [feat1(d) for d in dataset]
y = [d['hours'] for d in dataset]

In [162]:
# Make predictions and calculate MSE
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y)
predictions = mod.predict(X)

In [163]:
mse1 = MSE(predictions, y)

In [164]:
mse1

75735.70018273

In [165]:
mod.coef_

array([6.60006985e+01, 1.04228062e-03])

In [167]:
answers['Q1'] = [float(mod.coef_[1]), float(mse1)] # Remember to cast things to float rather than (e.g.) np.float64

In [168]:
assertFloatList(answers['Q1'], 2)

In [169]:
### Question 2

In [170]:
dataTrain = dataset[:int(len(dataset)*0.8)]
dataTest = dataset[int(len(dataset)*0.8):]

In [171]:
# Prepare the training and testing data
Xtrain = [feat1(d) for d in dataTrain]
ytrain = [d['hours'] for d in dataTrain]
Xtest = [feat1(d) for d in dataTest]
ytest = [d['hours'] for d in dataTest]

In [172]:
# Train the model using the training set
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(Xtrain, ytrain)
ypred = mod.predict(Xtest)

In [173]:
mse2 = MSE(ytest, ypred)

In [174]:
under = 0
over = 0

# Loop through each pair of true and predicted values in the test set
for actual, predicted in zip(ytest, ypred):
    if predicted < actual:
        under += 1  # Increment under if prediction is less than actual
    elif predicted > actual:
        over += 1   # Increment over if prediction is more than actual

In [175]:
mse2, under, over

(76047.19578054524, 5249, 29751)

In [176]:
answers['Q2'] = [mse2, under, over]

In [177]:
assertFloatList(answers['Q2'], 3)

In [178]:
### Question 3

In [179]:
# (a) Removing Outliers from the Training Set

y2 = y[:]
y2.sort()
perc90 = y2[int(len(y2)*0.9)] # 90th percentile

# Filter to keep only instances with target (hours) values below the 90th percentile
X3a = [X for X, y in zip(Xtrain, ytrain) if y <= perc90]
y3a = [y for y in ytrain if y <= perc90]


# Train the model without the top 10% outliers
mod3a = linear_model.LinearRegression(fit_intercept=False)
mod3a.fit(X3a,y3a)
pred3a = mod3a.predict(Xtest)

In [180]:
# Count under and over predictions for model 3a
under3a = 0
over3a = 0

for actual, predicted in zip(ytest, pred3a):
    if predicted < actual:
        under3a += 1  # Increment under if prediction is less than actual
    elif predicted > actual:
        over3a += 1   # Increment over if prediction is more than actual

In [181]:
# (b) Log Transformation on Target Variable

ytrain = np.array(ytrain)

# Transform ytrain using log2(hours + 1)
y3b = np.log2(ytrain + 1)

# Train the model on the log-transformed target
mod3b = linear_model.LinearRegression(fit_intercept=False)
mod3b.fit(Xtrain, y3b)

# Make predictions and transform back to original scale
pred3b_log = mod3b.predict(Xtest)
pred3b = [2 ** p - 1 for p in pred3b_log]  # Inverse transform: log2(hours + 1) back to hours

In [182]:
# Count under and over predictions for model 3b
under3b = 0
over3b = 0

for actual, predicted in zip(ytest, pred3b):
    if predicted < actual:
        under3b += 1  # Increment under if prediction is less than actual
    elif predicted > actual:
        over3b += 1   # Increment over if prediction is more than actual

In [183]:
# (c) Adjust Parameters to Match Median Values

# Using theta0 from Q2 and solving for theta1 manually
theta0 = mod.intercept_

length = [len(d['text']) for d in dataTrain]
length.sort()
median_length = length[len(length)//2]

hours = [d['hours'] for d in dataTrain]
hours.sort()
median_hours = hours[len(hours)//2]

# Calculate theta1 such that the line passes through (0, theta0) and (median_length, median_hours)
theta1 = (median_hours - theta0) / median_length

# Predict on test set using the manually calculated parameters
pred3c = [theta0 + theta1 * len(d['text']) for d in dataTest]

In [184]:
# Count under and over predictions for model 3c
under3c = 0
over3c = 0

for actual, predicted in zip(ytest, pred3c):
    if predicted < actual:
        under3c += 1  # Increment under if prediction is less than actual
    elif predicted > actual:
        over3c += 1   # Increment over if prediction is more than actual

In [185]:
[under3a, over3a, under3b, over3b, under3c, over3c]

[13084, 21916, 15941, 19059, 17112, 17888]

In [186]:
answers['Q3'] = [under3a, over3a, under3b, over3b, under3c, over3c]

In [187]:
assertFloatList(answers['Q3'], 6)

In [39]:
### Question 4

In [188]:
def feat4(d):
    return [1, len(d['text'])]

In [189]:
Xtrain = [feat4(d) for d in dataTrain]
Xtest = [feat4(d) for d in dataTest]

In [190]:
ytrain = [d['hours'] > median_hours for d in dataTrain]
ytest = [d['hours'] > median_hours for d in dataTest]

In [191]:
mod = linear_model.LogisticRegression(C=1, fit_intercept=True)
mod.fit(Xtrain,ytrain)
predictions = mod.predict(Xtest) # Binary vector of predictions

In [192]:
def rates(predictions, y):

    TP = [a and b for (a,b) in zip(predictions,y)]
    TN = [not a and not b for (a,b) in zip(predictions,y)]
    FP = [a and not b for (a,b) in zip(predictions,y)]
    FN = [not a and b for (a,b) in zip(predictions,y)]

    TP = sum(TP)
    TN = sum(TN)
    FP = sum(FP)
    FN = sum(FN)

    return TP, TN, FP, FN

In [193]:
TP, TN, FP, FN = rates(predictions, ytest)

In [194]:
BER = 0.5 * (FP / (TN + FP) + FN / (FN + TP))

In [195]:
[TP, TN, FP, FN, BER]

[4525, 13909, 3641, 12925, 0.474076033273741]

In [196]:
answers['Q4'] = [TP, TN, FP, FN, BER]

In [197]:
assertFloatList(answers['Q4'], 5)

In [50]:
### Question 5

In [198]:
overpredictions = FP
underpredictions = FN

[overpredictions, underpredictions]

[3641, 12925]

In [199]:
answers['Q5'] = [overpredictions, underpredictions]

In [200]:
assertFloatList(answers['Q5'], 2)

In [54]:
### Question 6

In [201]:
# Filter the dataset by year
data_2014_or_earlier = [d for d in dataset if int(d['date'][:4]) <= 2014]
data_2015_or_later = [d for d in dataset if int(d['date'][:4]) >= 2015]

In [202]:
# Prepare training and testing sets based on conditions
X2014 = [[len(d['text'])] for d in data_2014_or_earlier]
y2014 = [1 if d['hours'] > median_hours else 0 for d in data_2014_or_earlier]

X2015 = [[len(d['text'])] for d in data_2015_or_later]
y2015 = [1 if d['hours'] > median_hours else 0 for d in data_2015_or_later]

# Split X2014 and y2014 into training and test sets
X2014train = X2014[:int(0.8 * len(X2014))]
y2014train = y2014[:int(0.8 * len(y2014))]
X2014test = X2014[int(0.8 * len(X2014)):]
y2014test = y2014[int(0.8 * len(y2014)):]

# Split X2015 and y2015 into training and test sets
X2015train = X2015[:int(0.8 * len(X2015))]
y2015train = y2015[:int(0.8 * len(y2015))]
X2015test = X2015[int(0.8 * len(X2015)):]
y2015test = y2015[int(0.8 * len(y2015)):]

In [203]:
# (a) Model trained and tested on reviews from 2014 or earlier
model_a = linear_model.LogisticRegression(C=1)
model_a.fit(X2014train, y2014train)
pred_a = model_a.predict(X2014test)
TP_A, TN_A, FP_A, FN_A = rates(pred_a, y2014test)
BER_A = 0.5 * (FP_A / (TN_A + FP_A) + FN_A / (FN_A + TP_A))

# (b) Model trained and tested on reviews from 2015 or later
model_b = linear_model.LogisticRegression(C=1)
model_b.fit(X2015train, y2015train)
pred_b = model_b.predict(X2015test)
TP_B, TN_B, FP_B, FN_B = rates(pred_b, y2015test)
BER_B = 0.5 * (FP_B / (TN_B + FP_B) + FN_B / (FN_B + TP_B))

# (c) Model trained on 2014 or earlier, tested on 2015 or later
model_c = linear_model.LogisticRegression(C=1)
model_c.fit(X2014train, y2014train)
pred_c = model_c.predict(X2015test)
TP_C, TN_C, FP_C, FN_C = rates(pred_c, y2015test)
BER_C = 0.5 * (FP_C / (TN_C + FP_C) + FN_C / (FN_C + TP_C))

# (d) Model trained on 2015 or later, tested on 2014 or earlier
model_d = linear_model.LogisticRegression(C=1)
model_d.fit(X2015train, y2015train)
pred_d = model_d.predict(X2014test)
TP_D, TN_D, FP_D, FN_D = rates(pred_d, y2014test)
BER_D = 0.5 * (FP_D / (TN_D + FP_D) + FN_D / (FN_D + TP_D))

In [204]:
[BER_A, BER_B, BER_C, BER_D]

[0.4799701888132907,
 0.4738306049759501,
 0.4822228416597866,
 0.47245350711806977]

In [205]:
answers['Q6'] = [BER_A, BER_B, BER_C, BER_D]

In [206]:
assertFloatList(answers['Q6'], 4)

In [61]:
### Question 7

In [208]:
usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

# Create a mapping of each user to the set of items (games) they rated
for d in dataTrain:
    user,item = d['userID'], d['gameID']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)

In [209]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

In [210]:
def mostSimilar(i, func, N):
    similarities = []
    items = itemsPerUser[i]
    for i2 in itemsPerUser:
        if i2 == i: continue
        sim = func(items, itemsPerUser[i2])
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return similarities[:N]

In [211]:
ms = mostSimilar(dataset[0]['userID'], Jaccard, 10)

In [212]:
ms

[(0.10909090909090909, 'u36549785'),
 (0.09836065573770492, 'u56680455'),
 (0.09722222222222222, 'u47732229'),
 (0.09375, 'u18564530'),
 (0.09230769230769231, 'u29932810'),
 (0.08860759493670886, 'u87833081'),
 (0.08791208791208792, 'u58835174'),
 (0.08695652173913043, 'u02115872'),
 (0.08333333333333333, 'u73580608'),
 (0.08235294117647059, 'u71367785')]

In [213]:
first = ms[0][0]
tenth = ms[9][0]

first, tenth

(0.10909090909090909, 0.08235294117647059)

In [214]:
answers['Q7'] = [first, tenth]

In [215]:
assertFloatList(answers['Q7'], 2)

In [None]:
### Question 8

In [216]:
# Calculate global average of transformed hours in training set
transformed_hours = [np.log2(d['hours'] + 1) for d in dataTrain]
global_average = np.mean(transformed_hours)

In [217]:
# Create dictionaries mapping users to items and items to users with ratings
user_items = defaultdict(set)
item_users = defaultdict(set)
user_item_ratings = defaultdict(lambda: defaultdict(float))

for d in dataTrain:
    user = d['userID']
    item = d['gameID']
    rating = np.log2(d['hours'] + 1)  # Transformed hours
    user_items[user].add(item)
    item_users[item].add(user)
    user_item_ratings[user][item] = rating


In [234]:
# User-to-User Predictor
def user_to_user_predict(user, item):
    if user not in user_items or item not in item_users:
        return global_average  # Handle unseen user/item

    total_similarity = 0
    weighted_sum = 0

    for similar_user in item_users[item]:
        if similar_user != user:
            similarity = Jaccard(user_items[user], user_items[similar_user])
            weighted_sum += similarity * user_item_ratings[similar_user][item]
            total_similarity += similarity

    return weighted_sum / total_similarity if total_similarity > 0 else global_average


In [219]:
# Item-to-Item Predictor
def item_to_item_predict(user, item):
    if user not in user_items or item not in item_users:
        return global_average  # Handle unseen user/item

    total_similarity = 0
    weighted_sum = 0

    for similar_item in user_items[user]:
        if similar_item != item:
            similarity = Jaccard(item_users[item], item_users[similar_item])
            weighted_sum += similarity * user_item_ratings[user][similar_item]
            total_similarity += similarity

    return weighted_sum / total_similarity if total_similarity > 0 else global_average


In [220]:
# Evaluate both predictors on the test set
y_true = []
y_pred_user = []
y_pred_item = []

for d in dataTest:
    user = d['userID']
    item = d['gameID']
    true_rating = np.log2(d['hours'] + 1)
    y_true.append(true_rating)
    y_pred_user.append(user_to_user_predict(user, item))
    y_pred_item.append(item_to_item_predict(user, item))

# Calculate MSE for both predictors
MSEU = MSE(y_true, y_pred_user)
MSEI = MSE(y_true, y_pred_item)

In [221]:
y_pred_user

[4.703561320793294,
 2.777465898549462,
 5.820349483010912,
 3.9236443986931446,
 2.161973295965135,
 1.579348274275264,
 6.699365724861088,
 2.062660856153714,
 4.961332600396564,
 1.3668070334603386,
 2.16675729451033,
 2.9474601385394217,
 3.882139223146527,
 3.3346875449969176,
 2.225815813626644,
 2.242646388357799,
 5.375740936469531,
 2.2211386977078362,
 1.3975325444586142,
 3.1925320452382677,
 4.384632594063056,
 3.203449156796085,
 2.6156145082804496,
 5.720124254064023,
 3.530988393902114,
 2.326441849743425,
 3.718409244521891,
 5.425977903505087,
 7.488428236465365,
 4.362207447749668,
 3.083334639915962,
 2.3448389655904522,
 5.312190131592381,
 4.386918447109663,
 2.0891690253148107,
 7.695835387899068,
 2.3846170014178516,
 3.4998596515459974,
 4.758185292732859,
 1.6954816570004523,
 5.723662666470355,
 3.6417667978031663,
 4.481648503677088,
 3.8748466998926805,
 2.4336053316261936,
 2.91754529996511,
 7.658544699564143,
 1.5076743768381633,
 3.5110608482501466,
 2.9

In [222]:
MSEU, MSEI

(3.28107684594118, 4.915274596519449)

In [223]:
answers['Q8'] = [MSEU, MSEI]

In [224]:
assertFloatList(answers['Q8'], 2)

In [None]:
### Question 9

In [262]:
# Create dictionaries mapping users to items, items to users, and store ratings and review years
user_items = defaultdict(set)
item_users = defaultdict(set)
user_item_ratings = defaultdict(lambda: defaultdict(float))
user_item_years = defaultdict(lambda: defaultdict(int))

In [263]:
for d in dataTrain:
    user = d['userID']
    item = d['gameID']
    rating = np.log2(d['hours'] + 1)  # Transformed hours
    year = int(d['date'][:4])  # Extract the year from the date

    user_items[user].add(item)
    item_users[item].add(user)
    user_item_ratings[user][item] = rating
    user_item_years[user][item] = year

In [267]:
# Time-Weighted User-to-User Predictor
def user_to_user_predict(user, item):
    if user not in user_items or item not in item_users:
        return global_average  # Handle unseen user/item

    total_similarity = 0
    weighted_sum = 0
    user_review_year = user_item_years[user].get(item, 0)

    for similar_user in item_users[item]:
        if similar_user != user:
            similarity = Jaccard(user_items[user], user_items[similar_user])
            similar_user_review_year = user_item_years[similar_user].get(item, 0)
            time_weight = math.exp(-0.05 * abs(user_review_year - similar_user_review_year))    # Use 0.05 as extra weighting factor to reduce the impact of the time difference and prevent near zero values, forcing a return of global_average all the time
            weighted_similarity = similarity * time_weight
            weighted_sum += weighted_similarity * user_item_ratings[similar_user][item]
            total_similarity += weighted_similarity

    return weighted_sum / total_similarity if total_similarity > 0 else global_average

In [268]:
# Evaluate the test set
y_true = []
y_pred_user = []

for d in dataTest:
    user = d['userID']
    item = d['gameID']
    true_rating = np.log2(d['hours'] + 1)
    y_true.append(true_rating)
    y_pred_user.append(user_to_user_predict(user, item))

In [269]:
y_pred_user

[4.7655562693925875,
 2.778296232734877,
 5.800805868775264,
 3.917234297478253,
 2.1496957864128023,
 1.5568076110449003,
 6.7043065322652495,
 2.0658243307660173,
 4.941987079063724,
 1.3838173266499045,
 2.1458997245846647,
 2.9890261453363105,
 3.8821392231465275,
 3.3115895706330782,
 2.225815813626644,
 2.245216211061626,
 5.380387722637519,
 2.23329896643083,
 1.3926299294652567,
 3.2179486754249376,
 4.383379720008016,
 3.2096111700349255,
 2.6316322883964025,
 5.710053217649854,
 3.5219919976414493,
 2.334768467337799,
 3.718409244521891,
 5.4019777925044155,
 7.506739763590249,
 4.575660841978031,
 3.087690624116537,
 2.33967595007248,
 5.364838783732795,
 4.418905631371956,
 2.090408480550711,
 7.697384801181451,
 2.39013017982341,
 3.519385766514928,
 4.771769777854073,
 1.701391720831745,
 5.712806519749437,
 3.5814243537680337,
 4.481813979615029,
 3.8829401916096447,
 2.4243411432127333,
 2.91450820851891,
 7.653850261876107,
 1.4840093865383701,
 3.5581516051072586,
 2.

In [270]:
# Calculate MSE
MSE9 = MSE(y_true, y_pred_user)

MSE9

3.281122981912087

In [271]:
answers['Q9'] = MSE9

In [272]:
assertFloat(answers['Q9'])

In [273]:
if "float" in str(answers) or "int" in str(answers):
    print("it seems that some of your answers are not native python ints/floats;")
    print("the autograder will not be able to read your solution unless you convert them to ints/floats")

In [275]:
f = open("answers_midterm.txt", 'w')
f.write(str(answers) + '\n')
f.close()