In [1]:
import json
import gzip
import math
from collections import defaultdict
import numpy
from sklearn import linear_model

In [2]:
# This will suppress any warnings, comment out if you'd like to preserve them
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Check formatting of submissions
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [4]:
answers = {}

In [5]:
f = open("spoilers.json.gz", 'r')

In [6]:
dataset = []
for l in f:
    d = eval(l)
    dataset.append(d)

In [7]:
f.close()

In [8]:
dataset[0]

{'user_id': 'b0d7e561ca59e313b728dc30a5b1862e',
 'timestamp': '2013-05-06',
 'review_sentences': [[0,
   'The author did an excellent job of making a very readable novel about the emotional lives of Hadley and Ernest Hemingway.'],
  [0,
   'The many other creative people interacting with them in Paris in the 1920s were very colorful and interesting too.'],
  [0,
   'She captured a wonderful snapshot of the 1920s in Europe--the men returning from war, the writing, the music, the art, the fashions, the eating and drinking.'],
  [0,
   'Mostly told from the viewpoint of Hadley, the book shows she was very attracted to Ernest who was extroverted, interesting, ambitious, and a gifted writer.'],
  [0,
   'Hadley had a sweetness, and more traditional values than some of their friends in Paris.'],
  [0,
   'They both came from families with domineering mothers and suicidal fathers.'],
  [0,
   'Ernest was still suffering the traumatic effects of his time in World War I. He was also very self a

In [9]:
# A few utility data structures
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

for d in dataset:
    u,i = d['user_id'],d['book_id']
    reviewsPerUser[u].append(d)
    reviewsPerItem[i].append(d)

# Sort reviews per user by timestamp
for u in reviewsPerUser:
    reviewsPerUser[u].sort(key=lambda x: x['timestamp'])
    
# Same for reviews per item
for i in reviewsPerItem:
    reviewsPerItem[i].sort(key=lambda x: x['timestamp'])

In [10]:
reviewsPerUser["b0d7e561ca59e313b728dc30a5b1862e"]

[{'user_id': 'b0d7e561ca59e313b728dc30a5b1862e',
  'timestamp': '2012-03-13',
  'review_sentences': [[0,
    'The Grapes of Wrath is set during the Great Depression when times were terribly hard for the farmers in the Dust Bowl.'],
   [0,
    'Drought, inability to pay back loans, and the movement of large agricultural companies to take over the small farms all led to a bad economic situation.'],
   [0,
    'The Joads can no longer farm in Oklahoma, and they have piled their possessions on top of an old truck and headed down Route 66 to California.'],
   [0,
    'They are hoping for high pay picking crops, but there are so many workers heading west that the owners of the large farms are only giving them a pittance.'],
   [0,
    'People are starving and dying while the corporate farmers are in collusion with the police to arrest anyone who objects or tries to unionize.'],
   [0,
    'A bright spot is their stay at a federal camp operated by a New Deal agency that helps the migrant work

In [11]:
# E.g. reviews for this user are sorted from earliest to most recent
[d['timestamp'] for d in reviewsPerUser['b0d7e561ca59e313b728dc30a5b1862e']]

['2012-03-13',
 '2013-05-06',
 '2013-09-03',
 '2015-04-05',
 '2016-02-10',
 '2016-05-29']

In [12]:
### 1a

In [13]:
userAverages = {}
itemAverages = {}
userAverages1 = {} # Skips the last entry
itemAverages1 = {}

for u in reviewsPerUser:
    rs = [d['rating'] for d in reviewsPerUser[u]]
    userAverages[u] = sum(rs) / len(rs)
    rs1 = rs[:-1]
    if len(rs1):
        userAverages1[u] = sum(rs1) / len(rs1)

for i in reviewsPerItem:
    rs = [d['rating'] for d in reviewsPerItem[i]]
    itemAverages[i] = sum(rs) / len(rs)
    rs1 = rs[:-1]
    if len(rs1):
        itemAverages1[i] = sum(rs1) / len(rs1)

In [14]:
userAverages1

{'b0d7e561ca59e313b728dc30a5b1862e': 4.2,
 '025f6535878e5bb486ac3869e3c6e35a': 3.5,
 '3037e45de740da82703e55a19f94cfbe': 3.75,
 '9c0d9df5cf36d370f6e3945ebe95fd57': 5.0,
 '34e83c43b312a8613706367c32df26e3': 4.166666666666667,
 'ba35836a9e86f764f28b61c8dbc8fe01': 4.25,
 '5d6e554bc8961d4464d64c0184c88936': 4.0,
 '510b63e8bbdf635b13e8932fd67be15a': 4.75,
 '6079cf8b963518fdf7539a289e72e5b6': 3.3846153846153846,
 '50e049989a18928611fd83d1209bfed4': 3.9,
 'f115dc148ad8536b6fbc38abf48da61e': 3.8333333333333335,
 '1fc58f53eb67bcf6c78af62087697449': 4.3,
 '2334fbd037bdac2509f55ac2c4ed5def': 3.75,
 'aca892063124d76e701c739cc6348f00': 4.833333333333333,
 'c3fad585250f725cd9cb4271164d4469': 4.333333333333333,
 '0ec6d7cbf7fd22a6bc82c7d3a5215e4e': 4.4,
 'dfee8ffe6c324e786f6648f4e8dec6fe': 3.0,
 'b4f502353837efb1a799e5499a713682': 4.0,
 '4e123830b00d61a272259cd581340f2f': 4.066666666666666,
 '9e72362bebbdf3fe75790a8c4a44aba7': 4.0,
 '1b2bad2d45d3cd4298d368664c40150a': 4.0,
 '042ec3a109146c8e288a9ef0b1

In [15]:
y = []
ypred = []
for u in reviewsPerUser:
    for r in reviewsPerUser[u]:
        if u in userAverages1:
            y.append(r['rating'])
            ypred.append(userAverages1[u])

In [16]:
def MSE(y, ypred):
    ds = [(a-b)**2 for (a,b) in zip(y, ypred)]
    return sum(ds) / len(ds)

In [17]:
answers['Q1a'] = MSE(y,ypred)

In [18]:
assertFloat(answers['Q1a'])

In [19]:
### 1b

In [20]:
y = []
ypred = []
for i in reviewsPerItem:
    for r in reviewsPerItem[i]:
        if i in itemAverages1:
            y.append(r['rating'])
            ypred.append(itemAverages1[i])

In [21]:
answers['Q1b'] = MSE(y,ypred)

In [22]:
assertFloat(answers['Q1b'])

In [23]:
answers

{'Q1a': 1.1412831672957289, 'Q1b': 1.2232617119559104}

In [24]:
### 2

In [25]:
userAveragesN = [{}, {}, {}]
itemAveragesN = [{}, {}, {}] # Last 1, 2, 3

for u in reviewsPerUser:
    for i in range(3):
        rs = [d['rating'] for d in reviewsPerUser[u]]
        if len(rs) < i + 2:
            continue
        rs = rs[-(i+2):-1]
        userAveragesN[i][u] = sum(rs) / len(rs)

In [26]:
userAveragesN[0]["b0d7e561ca59e313b728dc30a5b1862e"]

4.0

In [27]:
answers['Q2'] = []

for N in [1,2,3]:
    y = []
    ypred = []
    
    for u in reviewsPerUser:
        for r in reviewsPerUser[u]:
            if u in userAveragesN[N-1]:
                y.append(r['rating'])
                ypred.append(userAveragesN[N-1][u])
    answers['Q2'].append(MSE(y,ypred))

In [28]:
answers

{'Q1a': 1.1412831672957289,
 'Q1b': 1.2232617119559104,
 'Q2': [2.072069253233208, 1.4872838998000706, 1.3331403382817468]}

In [29]:
assertFloatList(answers['Q2'], 3)

In [30]:
### 3a

In [31]:
def feature3(N, u): # For a user u and a window size of N
    x = [1]
    rs = [d['rating'] for d in reviewsPerUser[u]]
    for n in range(N):
        x.append(rs[-(n+2)])
    return x

In [32]:
answers['Q3a'] = [feature3(2,dataset[0]['user_id']), feature3(3,dataset[0]['user_id'])]

In [33]:
assert len(answers['Q3a']) == 2
assert len(answers['Q3a'][0]) == 3
assert len(answers['Q3a'][1]) == 4

In [34]:
answers['Q3a']

[[1, 4, 4], [1, 4, 4, 4]]

In [35]:
### 3b

In [36]:
answers['Q3b'] = []

for N in [1,2,3]:
    X = [feature3(N, u) for u in reviewsPerUser if len(reviewsPerUser[u]) > N+1]
    y = [reviewsPerUser[u][-1]['rating'] for u in reviewsPerUser if len(reviewsPerUser[u]) > N+1]
    mod = linear_model.LinearRegression()
    mod.fit(X, y)
    ypred = mod.predict(X)
    mse = MSE(y, ypred)
    answers['Q3b'].append(mse)

In [37]:
assertFloatList(answers['Q3b'], 3)

In [38]:
answers

{'Q1a': 1.1412831672957289,
 'Q1b': 1.2232617119559104,
 'Q2': [2.072069253233208, 1.4872838998000706, 1.3331403382817468],
 'Q3a': [[1, 4, 4], [1, 4, 4, 4]],
 'Q3b': [1.5723325467509277, 1.5610886662971608, 1.5236978010116853]}

In [39]:
### 4a

In [40]:
globalAverage = [d['rating'] for d in dataset]
globalAverage = sum(globalAverage) / len(globalAverage)

In [41]:
def featureMeanValue(N, u): # For a user u and a window size of N
    f = [1]
    for n in range(N):
        try:
            previous_val = reviewsPerUser[u][-1 - (n+1)]['rating']
            f.append(previous_val)
        except Exception as e:
            try:
                f.append(userAverages1[u])
            except Exception as e2:
                f.append(globalAverage)
    return f

In [42]:
def featureMissingValue(N, u):
    f = [1]
    for n in range(N):
        try:
            previous_val = reviewsPerUser[u][-1 - (n+1)]['rating']
            f += [0, previous_val]
        except Exception as e:
            f += [1, 0]
    return f

In [43]:
answers['Q4a'] = [featureMeanValue(10, dataset[0]['user_id']), featureMissingValue(10, dataset[0]['user_id'])]

In [44]:
assert len(answers['Q4a']) == 2
assert len(answers['Q4a'][0]) == 11
assert len(answers['Q4a'][1]) == 21

In [45]:
answers

{'Q1a': 1.1412831672957289,
 'Q1b': 1.2232617119559104,
 'Q2': [2.072069253233208, 1.4872838998000706, 1.3331403382817468],
 'Q3a': [[1, 4, 4], [1, 4, 4, 4]],
 'Q3b': [1.5723325467509277, 1.5610886662971608, 1.5236978010116853],
 'Q4a': [[1, 4, 4, 4, 4, 5, 4.2, 4.2, 4.2, 4.2, 4.2],
  [1, 0, 4, 0, 4, 0, 4, 0, 4, 0, 5, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]]}

In [46]:
### 4b

In [47]:
answers['Q4b'] = []

for featFunc in [featureMeanValue, featureMissingValue]:
    X = [featFunc(N, u) for u in reviewsPerUser]
    y = [reviewsPerUser[u][-1]['rating'] for u in reviewsPerUser]
    mod = linear_model.LinearRegression()
    mod.fit(X, y)
    pred = mod.predict(X)
    mse = MSE(pred, y)
    answers['Q4b'].append(mse)

In [48]:
assertFloatList(answers["Q4b"], 2)

In [49]:
answers["Q4b"]

[1.5505122537180556, 1.5432722347239864]

In [50]:
### 5

In [51]:
def feature5(sentence):
    f = [1]
    f.append(len(sentence))
    f.append(sentence.count('!'))
    f.append(sum(c.isupper() for c in sentence))
    return f

In [52]:
y = []
X = []

for d in dataset:
    for spoiler,sentence in d['review_sentences']:
        X.append(feature5(sentence))
        y.append(spoiler)

In [53]:
X[:10]

[[1, 121, 0, 4],
 [1, 114, 0, 2],
 [1, 157, 0, 2],
 [1, 155, 0, 3],
 [1, 88, 0, 2],
 [1, 75, 0, 1],
 [1, 137, 0, 6],
 [1, 58, 0, 1],
 [1, 108, 0, 1],
 [1, 25, 0, 2]]

In [54]:
mod = linear_model.LogisticRegression(C=1.0, class_weight='balanced')

In [55]:
mod.fit(X, y)

In [56]:
y_pred = mod.predict(X)

In [57]:
answers['Q5a'] = X[0]

In [58]:
TP = sum([(p and l) for (p,l) in zip(y_pred, y)])
FP = sum([(p and not l) for (p,l) in zip(y_pred, y)])
TN = sum([(not p and not l) for (p,l) in zip(y_pred, y)])
FN = sum([(not p and l) for (p,l) in zip(y_pred, y)])

In [59]:
TPR = TP / (TP + FN)
TNR = TN / (TN + FP)

BER = 1 - 1/2 * (TPR + TNR)
BER

0.4702652880062319

In [60]:
answers['Q5b'] = [TP, TN, FP, FN, BER]

In [61]:
assert len(answers['Q5a']) == 4
assertFloatList(answers['Q5b'], 5)

In [62]:
answers['Q5b']

[2384, 168945, 86232, 3615, 0.4702652880062319]

In [63]:
### 6

In [64]:
def feature6(review):
    f = []
    f += [x[0] for x in review['review_sentences'][:5]]
    f += feature5(review['review_sentences'][5][1])
    return f

In [65]:
y = []
X = []

for d in dataset:
    sentences = d['review_sentences']
    if len(sentences) < 6: continue
    X.append(feature6(d))
    y.append(sentences[5][0])

mod = linear_model.LogisticRegression(C=1.0, class_weight='balanced')
mod.fit(X, y)
y_pred = mod.predict(X)

In [66]:
def rates(y_pred, y):
    TP = sum([(p and l) for (p,l) in zip(y_pred, y)])
    FP = sum([(p and not l) for (p,l) in zip(y_pred, y)])
    TN = sum([(not p and not l) for (p,l) in zip(y_pred, y)])
    FN = sum([(not p and l) for (p,l) in zip(y_pred, y)])
    
    TPR = TP / (TP + FN)
    TNR = TN / (TN + FP)

    BER = 1 - 1/2 * (TPR + TNR)
    BER
    return (TP, FP, TN, FN, BER)

In [67]:
_, _, _, _, BER = rates(y_pred, y)
BER

0.170849126718692

In [68]:
answers['Q6a'] = X[0]

In [69]:
answers['Q6b'] = BER

In [70]:
assert len(answers['Q6a']) == 9
assertFloat(answers['Q6b'])

In [71]:
### 7

In [72]:
# 50/25/25% train/valid/test split
Xtrain, Xvalid, Xtest = X[:len(X)//2], X[len(X)//2:(3*len(X))//4], X[(3*len(X))//4:]
ytrain, yvalid, ytest = y[:len(X)//2], y[len(X)//2:(3*len(X))//4], y[(3*len(X))//4:]

In [73]:
bers = []
bestC = None
bestValidBER = None
bestBER = None

for c in [0.01, 0.1, 1, 10, 100]:
    mod = linear_model.LogisticRegression(C=1.0, class_weight='balanced')
    mod.fit(Xtrain, ytrain)
    y_pred_valid = mod.predict(Xvalid)
    y_pred_test = mod.predict(Xtest)
    _,_,_,_,ber_valid = rates(y_pred_valid, yvalid)
    _,_,_,_,ber_test = rates(y_pred_test, ytest)
    if bestC == None or ber_valid < bestValidBER:
        bestC = c
        bestValidBER = ber_valid
        bestBER = ber_test
    bers.append(ber_valid)

In [74]:
answers['Q7'] = bers + [bestC] + [bestBER]

In [75]:
answers['Q7']

[0.14353872972563286,
 0.14353872972563286,
 0.14353872972563286,
 0.14353872972563286,
 0.14353872972563286,
 0.01,
 0.21402101510460647]

In [76]:
assertFloatList(answers['Q7'], 7)

In [77]:
### 8

In [78]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

In [79]:
# 75/25% train/test split
dataTrain = dataset[:15000]
dataTest = dataset[15000:]

In [80]:
# A few utilities

itemAverages = defaultdict(list)
ratingMean = []

for d in dataTrain:
    itemAverages[d['book_id']].append(d['rating'])
    ratingMean.append(d['rating'])

for i in itemAverages:
    itemAverages[i] = sum(itemAverages[i]) / len(itemAverages[i])

ratingMean = sum(ratingMean) / len(ratingMean)

In [81]:
reviewsPerUser = defaultdict(list)
usersPerItem = defaultdict(set)

for d in dataTrain:
    u,i = d['user_id'], d['book_id']
    reviewsPerUser[u].append(d)
    usersPerItem[i].add(u)

In [82]:
# From my HW2 solution, welcome to reuse
def predictRating(user,item):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['book_id']
        if i2 == item: continue
        ratings.append(d['rating'] - itemAverages[i2])
        similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return itemAverages[item] + sum(weightedRatings) / sum(similarities)
    else:
        # User hasn't rated any similar items
        if item in itemAverages:
            return itemAverages[item]
        else:
            return ratingMean

In [83]:
predictions = [predictRating(d['user_id'], d['book_id']) for d in dataTest]

In [84]:
labels = [d['rating'] for d in dataTest]

In [85]:
answers["Q8"] = MSE(predictions, labels)

In [86]:
assertFloat(answers["Q8"])

In [87]:
answers["Q8"]

1.8164934412791371

In [88]:
### 9

In [89]:
seA = []
seB = []
seC = []

for d in dataTest:
    u,i = d['user_id'], d['book_id']
    rat = d['rating']
    pred = predictRating(u, i)
    se = (pred - rat)**2
    
    if not (i in itemAverages):
        seA.append(se)
    
    if len(usersPerItem[i]) >= 1 and len(usersPerItem[i]) <= 5:
        seB.append(se)
    
    if len(usersPerItem[i]) > 5:
        seC.append(se)

In [90]:
seA = sum(seA) / len(seA)
seB = sum(seB) / len(seB)
seC = sum(seC) / len(seC)

In [91]:
answers["Q9"] = [seA, seB, seC]

In [92]:
assertFloatList(answers["Q9"], 3)

In [93]:
seA, seB, seC

(1.742012484444442, 2.052681872005889, 1.452063234864505)

In [94]:
### 10

In [95]:
answers["Q10"] = ("describe your solution", itsMSE)

NameError: name 'itsMSE' is not defined

In [None]:
assert type(answers["Q10"][0]) == str
assertFloat(answers["Q10"][1])

In [None]:
f = open("answers_midterm.txt", 'w')
f.write(str(answers) + '\n')
f.close()