In [1]:
import random
from sklearn import linear_model
from matplotlib import pyplot as plt
from collections import defaultdict
import gzip

In [2]:
def assertFloat(x):
    """Check that `x` can be converted to a float.

    A value that `float()` rejects raises from the conversion itself
    (e.g. ValueError for a non-numeric string) before the assert is reached.
    """
    converted = float(x)
    assert isinstance(converted, float)

def assertFloatList(items, N):
    """Check that `items` has exactly `N` entries and each converts to a float.

    Raises AssertionError on a length mismatch; an entry that `float()`
    rejects raises from the conversion itself.
    """
    assert len(items) == N
    assert all(type(float(entry)) == float for entry in items)

In [3]:
answers = {}

In [4]:
def parseData(fname):
    """Lazily yield one parsed record per line of `fname`.

    Each line is expected to hold a single Python literal (the beer dataset
    stores one dict per line).  The file is opened in a `with` block so the
    handle is closed once the generator is exhausted or garbage-collected,
    instead of being leaked.

    Parameters:
        fname: path of the text file to read.

    Yields:
        The object obtained by evaluating each line.
    """
    with open(fname) as f:
        for l in f:
            # SECURITY NOTE: eval() executes arbitrary code found in the file.
            # Only use this on trusted data; for untrusted input prefer
            # ast.literal_eval (or json.loads if the file is real JSON).
            yield eval(l)

In [5]:
data = list(parseData("beer_50000.json"))

In [6]:
data[0]

{'review/appearance': 2.5,
 'beer/style': 'Hefeweizen',
 'review/palate': 1.5,
 'review/taste': 1.5,
 'beer/name': 'Sausa Weizen',
 'review/timeUnix': 1234817823,
 'beer/ABV': 5.0,
 'beer/beerId': '47986',
 'beer/brewerId': '10325',
 'review/timeStruct': {'isdst': 0,
  'mday': 16,
  'hour': 20,
  'min': 57,
  'sec': 3,
  'mon': 2,
  'year': 2009,
  'yday': 47,
  'wday': 0},
 'review/overall': 1.5,
 'review/text': 'A lot of foam. But a lot.\tIn the smell some banana, and then lactic and tart. Not a good start.\tQuite dark orange in color, with a lively carbonation (now visible, under the foam).\tAgain tending to lactic sourness.\tSame for the taste. With some yeast and banana.',
 'user/profileName': 'stcules',
 'review/aroma': 2.0}

In [7]:
# Seed the global RNG so the in-place shuffle below produces the same order on every run.
random.seed(0)
random.shuffle(data)

In [8]:
# Fixed-index train/validation/test slices of the shuffled list:
# 25,000 train, 12,500 validation, everything from index 37,500 onward is test.
# The 50/25/25 proportions assume `data` holds 50,000 records (TODO confirm).
dataTrain = data[:25000]
dataValid = data[25000:37500]
dataTest = data[37500:]

In [9]:
# Binary target per review: True when the beer's ABV is strictly greater than 7.
yTrain = [d['beer/ABV'] > 7 for d in dataTrain]
yValid = [d['beer/ABV'] > 7 for d in dataValid]
yTest = [d['beer/ABV'] > 7 for d in dataTest]

In [10]:
# Number of reviews per beer style, counted over the whole dataset (not just the training split).
categoryCounts = defaultdict(int)
for d in data:
    categoryCounts[d['beer/style']] += 1

In [11]:
# Keep only the styles that have more than 1000 reviews.
categories = [c for c in categoryCounts if categoryCounts[c] > 1000]

In [12]:
# Map each kept style to its position (0, 1, 2, ...) in `categories`.
catID = dict(zip(list(categories),range(len(categories))))

In [13]:
def feat(d):
    """Build the Question 1 feature vector for one review.

    Returns a bias term followed by a one-hot encoding of the beer style with
    len(catID) - 1 slots.  Styles that are not in `categories`, and the style
    whose index in `catID` is 0, both map to the all-zero encoding.
    """
    style = d['beer/style']
    style_index = catID[style] if style in categories else 0
    one_hot = [0] * (len(catID) - 1)
    if style_index > 0:
        one_hot[style_index - 1] = 1
    return [1] + one_hot

In [14]:
def _balanced_error_rate(predictions, labels):
    """Return the balanced error rate: the mean of the false-positive and false-negative rates."""
    TP = TN = FP = FN = 0
    # Single pass over the pairs instead of four separate filtered sums.
    for predicted, actual in zip(predictions, labels):
        correct = predicted == actual
        if actual:
            if correct:
                TP += 1
            else:
                FN += 1
        else:
            if correct:
                TN += 1
            else:
                FP += 1
    return 1 / 2 * (FP / (FP + TN) + FN / (FN + TP))


def pipeline(reg, max_iter=100):
    """Fit a class-balanced logistic regression and report its balanced error rates.

    Features come from the module-level `feat` as it is defined at call time;
    the data comes from the module-level `dataTrain`, `dataValid` and
    `dataTest`.  The label is True when 'beer/ABV' is greater than 7.

    Parameters:
        reg: value passed as `C` (inverse regularisation strength).
        max_iter: iteration cap for the solver.  The default of 100 matches
            scikit-learn's own default, so existing calls behave as before;
            raise it if the fit emits a ConvergenceWarning.

    Returns:
        (model, BER_valid, BER_test)

    Raises:
        ZeroDivisionError: if the validation or test labels contain only one class.

    Side effects:
        Prints the test-set accuracy.
    """
    X = [feat(d) for d in dataTrain]
    y = [d['beer/ABV'] > 7 for d in dataTrain]
    model = linear_model.LogisticRegression(C=reg, class_weight='balanced', max_iter=max_iter)
    model.fit(X, y)

    X_test = [feat(d) for d in dataTest]
    X_valid = [feat(d) for d in dataValid]
    y_test = [d['beer/ABV'] > 7 for d in dataTest]
    y_valid = [d['beer/ABV'] > 7 for d in dataValid]

    # Report the accuracy
    print(model.score(X_test, y_test))

    BER_test = _balanced_error_rate(model.predict(X_test), y_test)
    BER_valid = _balanced_error_rate(model.predict(X_valid), y_valid)
    return model, BER_valid, BER_test

In [15]:
### Question 1

In [16]:
mod, validBER, testBER = pipeline(10)

0.85096


In [17]:
answers['Q1'] = [validBER, testBER]
answers['Q1']

[0.16130237168160533, 0.16078380246088317]

In [18]:
assertFloatList(answers['Q1'], 2)

In [19]:
### Question 2

In [20]:
dataTrain[0]

{'review/appearance': 4.0,
 'beer/style': 'Belgian Pale Ale',
 'review/palate': 4.0,
 'review/taste': 4.0,
 'beer/name': 'La Binchoise Blonde Tradition',
 'review/timeUnix': 1210043435,
 'beer/ABV': 6.5,
 'beer/beerId': '7693',
 'beer/brewerId': '3282',
 'review/timeStruct': {'isdst': 0,
  'mday': 6,
  'hour': 3,
  'min': 10,
  'sec': 35,
  'mon': 5,
  'year': 2008,
  'yday': 127,
  'wday': 1},
 'review/overall': 4.0,
 'review/text': 'From the 11.2oz stubby.\tPours a straw/golden color with a nice head that soon settles to a nice thin crown. Pleasing aroma of yeast and fruit. Taste melds fruit notes with a nice carbonation and peppery hop profile. Slightly dry finish, but satisfying leaving you wanting another drink. Recommended.',
 'user/profileName': 'sinistermadman',
 'review/aroma': 4.0}

In [21]:
# Longest review text (in characters) among the training reviews; the second
# line displays the value as the cell output.
max_length = max([len(d['review/text']) for d in dataTrain])
max_length

4721

In [22]:
def feat(d):
    """Build the Question 2 feature vector for one review.

    Layout: bias term, one-hot beer style (len(catID) - 1 slots; unlisted
    styles and the style with index 0 are all zeros), the five review scores,
    and the review text length divided by `max_length`.
    """
    style = d['beer/style']
    style_index = catID[style] if style in categories else 0
    one_hot = [0] * (len(catID) - 1)
    if style_index > 0:
        one_hot[style_index - 1] = 1

    score_keys = ('review/appearance', 'review/palate', 'review/taste',
                  'review/overall', 'review/aroma')
    scores = [d[key] for key in score_keys]
    relative_length = len(d['review/text']) / max_length

    return [1] + one_hot + scores + [relative_length]

In [23]:
def _balanced_error_rate(predictions, labels):
    """Return the balanced error rate: the mean of the false-positive and false-negative rates."""
    TP = TN = FP = FN = 0
    # Single pass over the pairs instead of four separate filtered sums.
    for predicted, actual in zip(predictions, labels):
        correct = predicted == actual
        if actual:
            if correct:
                TP += 1
            else:
                FN += 1
        else:
            if correct:
                TN += 1
            else:
                FP += 1
    return 1 / 2 * (FP / (FP + TN) + FN / (FN + TP))


def pipeline(reg, max_iter=100):
    """Fit a class-balanced logistic regression and report its balanced error rates.

    Features come from the module-level `feat` as it is defined at call time;
    the data comes from the module-level `dataTrain`, `dataValid` and
    `dataTest`.  The label is True when 'beer/ABV' is greater than 7.

    Parameters:
        reg: value passed as `C` (inverse regularisation strength).
        max_iter: iteration cap for the solver.  The default of 100 matches
            scikit-learn's own default, so existing calls behave as before;
            raise it if the fit emits a ConvergenceWarning.

    Returns:
        (model, BER_valid, BER_test)

    Raises:
        ZeroDivisionError: if the validation or test labels contain only one class.

    Side effects:
        Prints the test-set accuracy.
    """
    X = [feat(d) for d in dataTrain]
    y = [d['beer/ABV'] > 7 for d in dataTrain]
    model = linear_model.LogisticRegression(C=reg, class_weight='balanced', max_iter=max_iter)
    model.fit(X, y)

    X_test = [feat(d) for d in dataTest]
    X_valid = [feat(d) for d in dataValid]
    y_test = [d['beer/ABV'] > 7 for d in dataTest]
    y_valid = [d['beer/ABV'] > 7 for d in dataValid]

    # Report the accuracy
    print(model.score(X_test, y_test))

    BER_test = _balanced_error_rate(model.predict(X_test), y_test)
    BER_valid = _balanced_error_rate(model.predict(X_valid), y_valid)
    return model, BER_valid, BER_test

In [24]:
mod, validBER, testBER = pipeline(10)

0.85704


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
answers['Q2'] = [validBER, testBER]
answers['Q2']

[0.14929168412822388, 0.150572815931349]

In [26]:
assertFloatList(answers['Q2'], 2)

In [27]:
### Question 3

In [28]:
# Model selection over the regularisation strength C.
# The winner is chosen on the VALIDATION balanced error rate only.  The test
# BER is recorded for the winning model but never takes part in the choice,
# so the test set stays a fair estimate of generalisation.
naive_classifier = 0.5  # BER of a trivial classifier; a candidate must beat this to be selected
valid_BER = naive_classifier
test_BER = 0.5
best_C = 0
model = None
for c in [0.001, 0.01, 0.1, 1, 10]:
    mod, validBER, testBER = pipeline(c)
    print(f"Validation BER for C={c} is {validBER}")
    # Strict '<' keeps the earlier (smaller) C on ties.
    if validBER < valid_BER:
        valid_BER = validBER
        test_BER = testBER
        best_C = c
        model = mod

0.802
Validation BER for C=0.001 is 0.1967815722017428
0.856
Validation BER for C=0.01 is 0.15089049054483042


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.85744
Validation BER for C=0.1 is 0.15120531303268894


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.85704
Validation BER for C=1 is 0.15012886140494944
0.85704
Validation BER for C=10 is 0.14929168412822388


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
bestC = best_C
bestC

10

In [30]:
mod, validBER, testBER = model, valid_BER, test_BER

In [31]:
answers['Q3'] = [bestC, validBER, testBER]
answers['Q3']

[10, 0.14929168412822388, 0.150572815931349]

In [32]:
assertFloatList(answers['Q3'], 3)

In [33]:
### Question 4

In [34]:
def feat(d):
    """Ablation feature vector WITHOUT the beer-style one-hot encoding.

    Layout: bias term, the five review scores, and the review text length
    divided by `max_length`.  The style encoding is deliberately not computed
    here, since this variant never returns it.
    """
    review_scores = [d['review/appearance'], d['review/palate'], d['review/taste'],
                    d['review/overall'], d['review/aroma']]

    return [1] + review_scores + [len(d['review/text']) / max_length]

def _balanced_error_rate(predictions, labels):
    """Return the balanced error rate: the mean of the false-positive and false-negative rates."""
    TP = TN = FP = FN = 0
    # Single pass over the pairs instead of four separate filtered sums.
    for predicted, actual in zip(predictions, labels):
        correct = predicted == actual
        if actual:
            if correct:
                TP += 1
            else:
                FN += 1
        else:
            if correct:
                TN += 1
            else:
                FP += 1
    return 1 / 2 * (FP / (FP + TN) + FN / (FN + TP))


def pipeline(reg, max_iter=100):
    """Fit a class-balanced logistic regression and report its balanced error rates.

    Features come from the module-level `feat` as it is defined at call time;
    the data comes from the module-level `dataTrain`, `dataValid` and
    `dataTest`.  The label is True when 'beer/ABV' is greater than 7.

    Parameters:
        reg: value passed as `C` (inverse regularisation strength).
        max_iter: iteration cap for the solver.  The default of 100 matches
            scikit-learn's own default, so existing calls behave as before;
            raise it if the fit emits a ConvergenceWarning.

    Returns:
        (model, BER_valid, BER_test)

    Raises:
        ZeroDivisionError: if the validation or test labels contain only one class.

    Side effects:
        Prints the test-set accuracy.
    """
    X = [feat(d) for d in dataTrain]
    y = [d['beer/ABV'] > 7 for d in dataTrain]
    model = linear_model.LogisticRegression(C=reg, class_weight='balanced', max_iter=max_iter)
    model.fit(X, y)

    X_test = [feat(d) for d in dataTest]
    X_valid = [feat(d) for d in dataValid]
    y_test = [d['beer/ABV'] > 7 for d in dataTest]
    y_valid = [d['beer/ABV'] > 7 for d in dataValid]

    # Report the accuracy
    print(model.score(X_test, y_test))

    BER_test = _balanced_error_rate(model.predict(X_test), y_test)
    BER_valid = _balanced_error_rate(model.predict(X_valid), y_valid)
    return model, BER_valid, BER_test

In [35]:
mod, validBER, testBER_noCat = pipeline(1)

0.68456


In [36]:
def feat(d):
    """Ablation feature vector WITHOUT the review scores.

    Layout: bias term, one-hot beer style (len(catID) - 1 slots; unlisted
    styles and the style with index 0 are all zeros), and the review text
    length divided by `max_length`.  The review scores are deliberately not
    read here, since this variant never returns them.
    """
    cat = d['beer/style']
    ID = catID[cat] if cat in categories else 0
    # one-hot encoding
    encode = [0 for x in range(len(catID) -1)]
    if ID > 0:
        encode[ID - 1] = 1

    return [1] + encode + [len(d['review/text']) / max_length]

def _balanced_error_rate(predictions, labels):
    """Return the balanced error rate: the mean of the false-positive and false-negative rates."""
    TP = TN = FP = FN = 0
    # Single pass over the pairs instead of four separate filtered sums.
    for predicted, actual in zip(predictions, labels):
        correct = predicted == actual
        if actual:
            if correct:
                TP += 1
            else:
                FN += 1
        else:
            if correct:
                TN += 1
            else:
                FP += 1
    return 1 / 2 * (FP / (FP + TN) + FN / (FN + TP))


def pipeline(reg, max_iter=100):
    """Fit a class-balanced logistic regression and report its balanced error rates.

    Features come from the module-level `feat` as it is defined at call time;
    the data comes from the module-level `dataTrain`, `dataValid` and
    `dataTest`.  The label is True when 'beer/ABV' is greater than 7.

    Parameters:
        reg: value passed as `C` (inverse regularisation strength).
        max_iter: iteration cap for the solver.  The default of 100 matches
            scikit-learn's own default, so existing calls behave as before;
            raise it if the fit emits a ConvergenceWarning.

    Returns:
        (model, BER_valid, BER_test)

    Raises:
        ZeroDivisionError: if the validation or test labels contain only one class.

    Side effects:
        Prints the test-set accuracy.
    """
    X = [feat(d) for d in dataTrain]
    y = [d['beer/ABV'] > 7 for d in dataTrain]
    model = linear_model.LogisticRegression(C=reg, class_weight='balanced', max_iter=max_iter)
    model.fit(X, y)

    X_test = [feat(d) for d in dataTest]
    X_valid = [feat(d) for d in dataValid]
    y_test = [d['beer/ABV'] > 7 for d in dataTest]
    y_valid = [d['beer/ABV'] > 7 for d in dataValid]

    # Report the accuracy
    print(model.score(X_test, y_test))

    BER_test = _balanced_error_rate(model.predict(X_test), y_test)
    BER_valid = _balanced_error_rate(model.predict(X_valid), y_valid)
    return model, BER_valid, BER_test

In [37]:
mod, validBER, testBER_noReview = pipeline(1)

0.85


In [38]:
def feat(d):
    """Ablation feature vector WITHOUT the review-length feature.

    Layout: bias term, one-hot beer style (len(catID) - 1 slots; unlisted
    styles and the style with index 0 are all zeros), then the five review
    scores.
    """
    style = d['beer/style']
    style_index = catID[style] if style in categories else 0
    one_hot = [0] * (len(catID) - 1)
    if style_index > 0:
        one_hot[style_index - 1] = 1

    score_keys = ('review/appearance', 'review/palate', 'review/taste',
                  'review/overall', 'review/aroma')
    scores = [d[key] for key in score_keys]

    return [1] + one_hot + scores

def _balanced_error_rate(predictions, labels):
    """Return the balanced error rate: the mean of the false-positive and false-negative rates."""
    TP = TN = FP = FN = 0
    # Single pass over the pairs instead of four separate filtered sums.
    for predicted, actual in zip(predictions, labels):
        correct = predicted == actual
        if actual:
            if correct:
                TP += 1
            else:
                FN += 1
        else:
            if correct:
                TN += 1
            else:
                FP += 1
    return 1 / 2 * (FP / (FP + TN) + FN / (FN + TP))


def pipeline(reg, max_iter=100):
    """Fit a class-balanced logistic regression and report its balanced error rates.

    Features come from the module-level `feat` as it is defined at call time;
    the data comes from the module-level `dataTrain`, `dataValid` and
    `dataTest`.  The label is True when 'beer/ABV' is greater than 7.

    Parameters:
        reg: value passed as `C` (inverse regularisation strength).
        max_iter: iteration cap for the solver.  The default of 100 matches
            scikit-learn's own default, so existing calls behave as before;
            raise it if the fit emits a ConvergenceWarning.

    Returns:
        (model, BER_valid, BER_test)

    Raises:
        ZeroDivisionError: if the validation or test labels contain only one class.

    Side effects:
        Prints the test-set accuracy.
    """
    X = [feat(d) for d in dataTrain]
    y = [d['beer/ABV'] > 7 for d in dataTrain]
    model = linear_model.LogisticRegression(C=reg, class_weight='balanced', max_iter=max_iter)
    model.fit(X, y)

    X_test = [feat(d) for d in dataTest]
    X_valid = [feat(d) for d in dataValid]
    y_test = [d['beer/ABV'] > 7 for d in dataTest]
    y_valid = [d['beer/ABV'] > 7 for d in dataValid]

    # Report the accuracy
    print(model.score(X_test, y_test))

    BER_test = _balanced_error_rate(model.predict(X_test), y_test)
    BER_valid = _balanced_error_rate(model.predict(X_valid), y_valid)
    return model, BER_valid, BER_test

In [39]:
mod, validBER, testBER_noLength = pipeline(1)

0.85376


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [40]:
answers['Q4'] = [testBER_noCat, testBER_noReview, testBER_noLength]
answers['Q4']

[0.31387501072485213, 0.16077986153652415, 0.15419420344633064]

In [41]:
assertFloatList(answers['Q4'], 3)

In [42]:
### Question 5

In [43]:
# Open the gzip-compressed TSV in text mode and read its first line as the
# column header.  `f` is intentionally left open here: the loop that builds
# `dataset` continues reading the remaining lines from this same handle.
path = "amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz"
f = gzip.open(path, 'rt', encoding="utf8")

header = f.readline()
header = header.strip().split('\t')

In [44]:
header

['marketplace',
 'customer_id',
 'review_id',
 'product_id',
 'product_parent',
 'product_title',
 'product_category',
 'star_rating',
 'helpful_votes',
 'total_votes',
 'vine',
 'verified_purchase',
 'review_headline',
 'review_body',
 'review_date']

In [45]:
# Parse the remaining lines of the open TSV handle `f` into one dict per
# review, keeping only the first review seen for each (customer, product) pair.
dataset = []

pairsSeen = set()

try:
    for line in f:
        fields = line.strip().split('\t')
        d = dict(zip(header, fields))
        ui = (d['customer_id'], d['product_id'])
        if ui in pairsSeen:
            print("Skipping duplicate user/item:", ui)
            continue
        pairsSeen.add(ui)
        # Numeric columns arrive as strings; convert them once here.
        d['star_rating'] = int(d['star_rating'])
        d['helpful_votes'] = int(d['helpful_votes'])
        d['total_votes'] = int(d['total_votes'])
        dataset.append(d)
finally:
    # The handle was opened in an earlier cell and is fully consumed here, so
    # release it.  Re-running this cell alone now fails loudly on the closed
    # file instead of silently producing an empty dataset.
    f.close()

Skipping duplicate user/item: ('46953315', 'B00QM3CNN6')
Skipping duplicate user/item: ('31616428', 'B0026RB0G8')
Skipping duplicate user/item: ('47240912', 'B008I653SC')
Skipping duplicate user/item: ('14503091', 'B003FRMRC4')
Skipping duplicate user/item: ('38538360', 'B00HVLUR86')
Skipping duplicate user/item: ('43448024', 'B00HVLUR86')
Skipping duplicate user/item: ('51525270', 'B00HVLUR86')
Skipping duplicate user/item: ('20652160', 'B004OU2IQG')
Skipping duplicate user/item: ('10964440', 'B00HVLUR86')
Skipping duplicate user/item: ('20043677', 'B00HVLUR86')
Skipping duplicate user/item: ('44796499', 'B00HVLUSGM')
Skipping duplicate user/item: ('29066899', 'B0002CZSYO')
Skipping duplicate user/item: ('10385056', 'B004OU2IQG')
Skipping duplicate user/item: ('1658551', 'B00HVLURL8')
Skipping duplicate user/item: ('907433', 'B00N9Q2E5G')
Skipping duplicate user/item: ('39412969', 'B00HVLUR86')
Skipping duplicate user/item: ('4901688', 'B00HVLUR86')
Skipping duplicate user/item: ('234

In [46]:
# Positional 90/10 split in file order (no shuffling): the first 90% of
# `dataset` is the training set, the remainder is the test set.
# Note: this rebinds dataTrain/dataTest, replacing the earlier beer-data splits.
dataTrain = dataset[:int(len(dataset)*0.9)]
dataTest = dataset[int(len(dataset)*0.9):]

In [47]:
dataTrain[0]

{'marketplace': 'US',
 'customer_id': '45610553',
 'review_id': 'RMDCHWD0Y5OZ9',
 'product_id': 'B00HH62VB6',
 'product_parent': '618218723',
 'product_title': 'AGPtek® 10 Isolated Output 9V 12V 18V Guitar Pedal Board Power Supply Effect Pedals with Isolated Short Cricuit / Overcurrent Protection',
 'product_category': 'Musical Instruments',
 'star_rating': 3,
 'helpful_votes': 0,
 'total_votes': 1,
 'vine': 'N',
 'verified_purchase': 'N',
 'review_headline': 'Three Stars',
 'review_body': 'Works very good, but induces ALOT of noise.',
 'review_date': '2015-08-31'}

In [48]:
# Feel free to keep or discard

# Lookup tables used by the similarity-based rating predictors.
usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
itemNames = {}
ratingDict = {} # To retrieve a rating for a specific user/item pair
reviewsPerUser = defaultdict(list)

# The user/item interaction sets are built from the training split only.
for d in dataTrain:
    user,item = d['customer_id'], d['product_id']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    itemNames[item] = d['product_title']
    
# NOTE(review): this loop runs over the FULL dataset, so ratingDict and
# reviewsPerUser also contain the held-out (test) reviews.  Any predictor that
# reads reviewsPerUser therefore sees star ratings of other test reviews by
# the same user -- confirm this is intended and not evaluation leakage.
for d in dataset:
    user,item = d['customer_id'], d['product_id']
    itemNames[item] = d['product_title']
    ratingDict[(user, item)] = d['star_rating']
    reviewsPerUser[user].append(d)

In [49]:
# Mean rating per user and per item.  Both iterate over itemsPerUser /
# usersPerItem, so only pairs recorded in those (training-built) tables are averaged.
userAverages = {}
itemAverages = {}

for u in itemsPerUser:
    rs = [ratingDict[(u, i)] for i in itemsPerUser[u]]
    userAverages[u] = sum(rs) / len(rs)
    
for i in usersPerItem:
    rs = [ratingDict[(u, i)] for u in usersPerItem[i]]
    itemAverages[i] = sum(rs) / len(rs)

# Global mean star rating over the training reviews.
ratingMean = sum([d['star_rating'] for d in dataTrain]) / len(dataTrain)

In [50]:
dataTrain[0]

{'marketplace': 'US',
 'customer_id': '45610553',
 'review_id': 'RMDCHWD0Y5OZ9',
 'product_id': 'B00HH62VB6',
 'product_parent': '618218723',
 'product_title': 'AGPtek® 10 Isolated Output 9V 12V 18V Guitar Pedal Board Power Supply Effect Pedals with Isolated Short Cricuit / Overcurrent Protection',
 'product_category': 'Musical Instruments',
 'star_rating': 3,
 'helpful_votes': 0,
 'total_votes': 1,
 'vine': 'N',
 'verified_purchase': 'N',
 'review_headline': 'Three Stars',
 'review_body': 'Works very good, but induces ALOT of noise.',
 'review_date': '2015-08-31'}

In [51]:
def Jaccard(s1, s2):
    """Jaccard similarity of two sets: |s1 ∩ s2| / |s1 ∪ s2|.

    Returns 0 when both sets are empty (the union has no elements).
    """
    union_size = len(s1.union(s2))
    if union_size == 0:
        return 0
    shared = len(s1.intersection(s2))
    return shared / union_size

In [52]:
def mostSimilar(i, N):
    """Return the `N` items most similar to item `i`.

    Similarity is the Jaccard index between the sets of users who rated each
    item (from `usersPerItem`).  The result is a list of (similarity, item)
    tuples sorted in descending order; item `i` itself is excluded.
    """
    target_users = usersPerItem[i]
    scored = [
        (Jaccard(target_users, usersPerItem[other]), other)
        for other in usersPerItem
        if other != i
    ]
    ranked = sorted(scored, reverse=True)
    return ranked[:N]

In [53]:
query = 'B00KCHRKD6'

In [54]:
ms = mostSimilar(query, 10)

In [55]:
answers['Q5'] = ms
answers['Q5']

[(0.015228426395939087, 'B00H7NFDKA'),
 (0.014492753623188406, 'B00QKVV3HC'),
 (0.014492753623188406, 'B00GXRMD7W'),
 (0.014084507042253521, 'B00H7ILRRI'),
 (0.014084507042253521, 'B0057RUMPO'),
 (0.014084507042253521, 'B000B6DTYW'),
 (0.013888888888888888, 'B00L2708TI'),
 (0.013513513513513514, 'B009Z1KKWI'),
 (0.013513513513513514, 'B000VYINCW'),
 (0.013333333333333334, 'B003F2BDZQ')]

In [56]:
assertFloatList([m[0] for m in ms], 10)

In [57]:
### Question 6

In [58]:
def MSE(y, ypred):
    """Mean squared error between two equally indexed sequences.

    The sequences are paired with zip, so any extra elements of the longer
    one are ignored.  Raises ZeroDivisionError when there are no pairs.
    """
    squared_errors = []
    for predicted, actual in zip(ypred, y):
        squared_errors.append((predicted - actual) ** 2)
    return sum(squared_errors) / len(squared_errors)

In [59]:
def predictRating(user,item):
    """Predict `user`'s rating of `item` with an item-similarity heuristic.

    Starts from the item's average rating and adds a Jaccard-weighted mean of
    how far the user's other ratings deviate from those items' averages.
    Falls back to `ratingMean` when the item has no average, and to the
    item's own average when every similarity weight is zero.
    """
    if item not in itemAverages:
        return ratingMean

    deviations = []
    weights = []
    for review in reviewsPerUser[user]:
        other = review['product_id']
        if other == item:
            continue
        # Deviation from the other item's average, or from the global mean
        # when that item has no average.
        baseline = itemAverages[other] if other in itemAverages else ratingMean
        deviations.append(review['star_rating'] - baseline)
        weights.append(Jaccard(usersPerItem[item], usersPerItem[other]))

    total_weight = sum(weights)
    if total_weight > 0:
        weighted_sum = sum([dev * w for dev, w in zip(deviations, weights)])
        return itemAverages[item] + weighted_sum / total_weight
    return itemAverages[item]

In [60]:
alwaysPredictMean = [ratingMean for d in dataTest]

In [61]:
simPredictions = [predictRating(d['customer_id'], d['product_id']) for d in dataTest]

In [62]:
labels = [d['star_rating'] for d in dataTest]

In [63]:
answers['Q6'] = MSE(simPredictions, labels)
answers['Q6']

1.7275966297419991

In [64]:
assertFloat(answers['Q6'])

In [65]:
### Question 7

In [66]:
dataTrain[0]

{'marketplace': 'US',
 'customer_id': '45610553',
 'review_id': 'RMDCHWD0Y5OZ9',
 'product_id': 'B00HH62VB6',
 'product_parent': '618218723',
 'product_title': 'AGPtek® 10 Isolated Output 9V 12V 18V Guitar Pedal Board Power Supply Effect Pedals with Isolated Short Cricuit / Overcurrent Protection',
 'product_category': 'Musical Instruments',
 'star_rating': 3,
 'helpful_votes': 0,
 'total_votes': 1,
 'vine': 'N',
 'verified_purchase': 'N',
 'review_headline': 'Three Stars',
 'review_body': 'Works very good, but induces ALOT of noise.',
 'review_date': '2015-08-31'}

In [67]:
from datetime import datetime
import math

In [68]:
def decay_function(time_1, time_2, time_scale=1.0):
    """Exponential time-decay weight exp(-|time_1 - time_2| / time_scale).

    Parameters:
        time_1, time_2: the two times, in the same unit.
        time_scale: the gap at which the weight falls to 1/e, in that same
            unit.  The default of 1.0 reproduces the original behaviour.

    Returns:
        A float in [0, 1]; 1.0 when the two times are equal.

    Raises:
        ValueError: if time_scale is not positive.

    NOTE(review): with the default scale and times given in seconds, any gap
    larger than roughly 745 seconds underflows to exactly 0.0, so reviews
    written on different days get zero weight.  Pass a larger scale (for
    example 86400 for days) to get a graded decay.
    """
    if time_scale <= 0:
        raise ValueError("time_scale must be positive")
    return math.exp(-abs(time_1 - time_2) / time_scale)

In [69]:
def predictRatingV2(user,item, item_time):
    """Predict `user`'s rating of `item`, weighting each of the user's other
    reviews by item similarity multiplied by a time-decay factor.

    Parameters:
        user: customer id.
        item: product id being predicted.
        item_time: date of the target review as a "%Y-%m-%d" string.

    Returns:
        `ratingMean` when the item has no average; otherwise the item's
        average plus the weighted mean deviation of the user's other ratings,
        or just the item's average when all weights are zero.

    Dates are parsed as naive datetimes, so `.timestamp()` interprets them in
    the local time zone; both sides are treated the same way.
    """
    if item not in itemAverages:
        return ratingMean

    # Loop-invariant: the target date is parsed once, not once per review.
    item_time_stamp = datetime.strptime(item_time, "%Y-%m-%d").timestamp()

    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['product_id']
        if i2 == item: continue
        i2_time = datetime.strptime(d['review_date'], "%Y-%m-%d").timestamp()
        if i2 in itemAverages:
            ratings.append(d['star_rating'] - itemAverages[i2])
        else:
            ratings.append(d['star_rating'] - ratingMean)
        similarities.append(Jaccard(usersPerItem[item], usersPerItem[i2]) * decay_function(item_time_stamp, i2_time))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings, similarities)]
        return itemAverages[item] + sum(weightedRatings) / sum(similarities)
    else:
        return itemAverages[item]

In [70]:
simPredictions = [predictRatingV2(d['customer_id'], d['product_id'], d['review_date']) for d in dataTest]

In [71]:
labels = [d['star_rating'] for d in dataTest]

In [72]:
def MSE(y, ypred):
    """Mean squared error between two equally indexed sequences.

    The sequences are paired with zip, so any extra elements of the longer
    one are ignored.  Raises ZeroDivisionError when there are no pairs.
    """
    squared_errors = []
    for predicted, actual in zip(ypred, y):
        squared_errors.append((predicted - actual) ** 2)
    return sum(squared_errors) / len(squared_errors)

In [73]:
itsMSE = MSE(simPredictions, labels)
itsMSE

1.7062703182590517

In [74]:
answers['Q7'] = ["The function basically takes in two timestamps calculated by the library. Then, I calculate the absolute difference between two times and then apply the result to the standard decay function model to obtain the weigh.", itsMSE]
answers['Q7']

['The function basically takes in two timestamps calculated by the library. Then, I calculate the absolute difference between two times and then apply the result to the standard decay function model to obtain the weigh.',
 1.7062703182590517]

In [75]:
assertFloat(answers['Q7'][1])

In [77]:
# Persist the collected answers.  The `with` block guarantees the file is
# flushed and closed even if serialising or writing raises.
with open("answers_hw2.txt", 'w') as f:
    f.write(str(answers) + '\n')