In [1]:
import json
from collections import defaultdict
from sklearn import linear_model
import numpy
import random
import gzip
import dateutil.parser
import math

In [2]:
answers = {}

In [3]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [4]:
### Question 1

In [5]:
f = gzip.open("fantasy_10000.json.gz")
dataset = []
for l in f:
    dataset.append(json.loads(l))

In [6]:
dataset[0]

{'user_id': '8842281e1d1347389f2ab93d60773d4d',
 'book_id': '18245960',
 'review_id': 'dfdbb7b0eb5a7e4c26d59a937e2e5feb',
 'rating': 5,
 'review_text': 'This is a special book. It started slow for about the first third, then in the middle third it started to get interesting, then the last third blew my mind. This is what I love about good science fiction - it pushes your thinking about where things can go. \n It is a 2015 Hugo winner, and translated from its original Chinese, which made it interesting in just a different way from most things I\'ve read. For instance the intermixing of Chinese revolutionary history - how they kept accusing people of being "reactionaries", etc. \n It is a book about science, and aliens. The science described in the book is impressive - its a book grounded in physics and pretty accurate as far as I could tell. Though when it got to folding protons into 8 dimensions I think he was just making stuff up - interesting to think about though. \n But what would 

In [7]:
max_len = max([len(d['review_text']) for d in dataset])
max_len

14306

In [8]:
def feature(datum):
    a = len(datum['review_text']) / max_len
    return [1, a]

In [9]:
X = [feature(data) for data in dataset]
Y = [d['rating'] for d in dataset]

In [10]:
model = linear_model.LinearRegression(fit_intercept=False)
model.fit(X, Y)
theta = model.coef_
theta

array([3.68568136, 0.98335392])

In [11]:
# MSE and R^2
y_pred = model.predict(X)

# Sum of squared errors (SSE)
sse = sum([x ** 2 for x in (Y - y_pred)])

MSE = sse / len(Y)
MSE

1.5522086622355353

In [12]:
answers['Q1'] = [theta[0], theta[1], MSE]

In [13]:
assertFloatList(answers['Q1'], 3)

In [14]:
### Question 2

In [15]:
for d in dataset:
    t = dateutil.parser.parse(d['date_added'])
    d['parsed_date'] = t


In [16]:
lst = [d['parsed_date'] for d in dataset]
weekdays = [date.weekday() for date in lst]
months = [date.month for date in lst]
print(f"weekdays interval is {max(weekdays) - min(weekdays)}")
print(f"month interval is {max(months) - min(months)}")

weekdays interval is 6
month interval is 11


In [17]:
max_len = max([len(d['review_text']) for d in dataset])
max_len

14306

In [18]:
def feature(datum):
    length = len(datum['review_text']) / max_len
    
    weekday = datum['parsed_date'].weekday()
    month = datum['parsed_date'].month
    weekday_encode = [0 for x in range(6)]
    month_encode = [0 for x in range(11)]
    
    d_idx = weekday - 1
    if d_idx >= 0:
        weekday_encode[d_idx] = 1
    
    m_idx = month - 2
    if m_idx >= 0:
        month_encode[m_idx] = 1
    return [1, length] + weekday_encode + month_encode

In [19]:
X2 = [feature(d) for d in dataset]
Y2 = [d['rating'] for d in dataset]

In [20]:
model = linear_model.LinearRegression(fit_intercept=False)
model.fit(X2, Y2)
theta = model.coef_
theta

array([ 3.62796241,  0.99449458,  0.05985561,  0.13599879,  0.10506964,
        0.1325751 ,  0.0448498 ,  0.10021126, -0.04593548,  0.05477171,
       -0.12316252, -0.09921527,  0.04069879, -0.01199552,  0.00727535,
       -0.01710954, -0.00457752, -0.10991136, -0.09331423])

In [21]:
# MSE and R^2
y_pred = model.predict(X2)

# Sum of squared errors (SSE)
sse = sum([x ** 2 for x in (Y2 - y_pred)])

mse2 = sse / len(Y)
mse2

1.546631549848752

In [22]:
answers['Q2'] = [X2[0], X2[1]]

In [23]:
assertFloatList(answers['Q2'][0], 19)
assertFloatList(answers['Q2'][1], 19)

In [24]:
### Question 3

In [25]:
def feature(datum):
    length = len(datum['review_text']) / max_len
    
    weekday = datum['parsed_date'].weekday()
    month = datum['parsed_date'].month
    return [1, length, weekday, month]

In [26]:
X3 = [feature(d) for d in dataset]
Y3 = [d['rating'] for d in dataset]

In [27]:
X3[:10]

[[1, 0.14581294561722355, 6, 7],
 [1, 0.10631902698168601, 2, 9],
 [1, 0.1061792254997903, 5, 4],
 [1, 0.1251922270376066, 1, 12],
 [1, 0.12316510555011884, 0, 7],
 [1, 0.0328533482454914, 1, 7],
 [1, 0.05752830980008388, 2, 3],
 [1, 0.03718719418425835, 0, 12],
 [1, 0.04305885642387809, 6, 10],
 [1, 0.03830560603942402, 1, 9]]

In [28]:
model = linear_model.LinearRegression(fit_intercept=False)
model.fit(X3, Y3)
theta = model.coef_
theta

array([ 3.68774185,  0.98716703,  0.00968863, -0.00457118])

In [29]:
# MSE and R^2
y_pred = model.predict(X3)

# Sum of squared errors (SSE)
sse = sum([x ** 2 for x in (Y3 - y_pred)])

mse3 = sse / len(Y3)
mse3

1.5516353711453312

In [30]:
answers['Q3'] = [mse2, mse3]

In [31]:
assertFloatList(answers['Q3'], 2)

In [32]:
### Question 4

In [33]:
random.seed(0)
random.shuffle(dataset)

In [34]:
X2 = [feature(d) for d in dataset]
X3 = [feature(d) for d in dataset]
Y = [d['rating'] for d in dataset]

In [35]:
train2, test2 = X2[:len(X2)//2], X2[len(X2)//2:]
train3, test3 = X3[:len(X3)//2], X3[len(X3)//2:]
trainY, testY = Y[:len(Y)//2], Y[len(Y)//2:]

In [36]:
model = linear_model.LinearRegression(fit_intercept=False)
model.fit(train2, trainY)
theta = model.coef_
print(theta)

# MSE and R^2
y_pred = model.predict(test2)

# Sum of squared errors (SSE)
sse = sum([x ** 2 for x in (testY - y_pred)])

test_mse2 = sse / len(testY)
test_mse2

[ 3.74994449  0.72506625  0.00510598 -0.00464236]


1.6282919476175841

In [37]:
model = linear_model.LinearRegression(fit_intercept=False)
model.fit(train3, trainY)
theta = model.coef_
print(theta)

# MSE and R^2
y_pred = model.predict(test3)

# Sum of squared errors (SSE)
sse = sum([x ** 2 for x in (testY - y_pred)])

test_mse3 = sse / len(testY)
test_mse3

[ 3.74994449  0.72506625  0.00510598 -0.00464236]


1.6282919476175841

In [38]:
answers['Q4'] = [test_mse2, test_mse3]

In [39]:
assertFloatList(answers['Q4'], 2)

In [40]:
### Question 5

In [41]:
f = open("beer_50000.json")
dataset = []
for l in f:
    dataset.append(eval(l))

In [42]:
dataset

[{'review/appearance': 2.5,
  'beer/style': 'Hefeweizen',
  'review/palate': 1.5,
  'review/taste': 1.5,
  'beer/name': 'Sausa Weizen',
  'review/timeUnix': 1234817823,
  'beer/ABV': 5.0,
  'beer/beerId': '47986',
  'beer/brewerId': '10325',
  'review/timeStruct': {'isdst': 0,
   'mday': 16,
   'hour': 20,
   'min': 57,
   'sec': 3,
   'mon': 2,
   'year': 2009,
   'yday': 47,
   'wday': 0},
  'review/overall': 1.5,
  'review/text': 'A lot of foam. But a lot.\tIn the smell some banana, and then lactic and tart. Not a good start.\tQuite dark orange in color, with a lively carbonation (now visible, under the foam).\tAgain tending to lactic sourness.\tSame for the taste. With some yeast and banana.',
  'user/profileName': 'stcules',
  'review/aroma': 2.0},
 {'review/appearance': 3.0,
  'beer/style': 'English Strong Ale',
  'review/palate': 3.0,
  'review/taste': 3.0,
  'beer/name': 'Red Moon',
  'review/timeUnix': 1235915097,
  'beer/ABV': 6.2,
  'beer/beerId': '48213',
  'beer/brewerId':

In [43]:
X = [[1, len(d['review/text'])] for d in dataset]
y = [d['review/overall'] >= 4 for d in dataset]

In [44]:
X[:10]

[[1, 262],
 [1, 338],
 [1, 396],
 [1, 401],
 [1, 1145],
 [1, 728],
 [1, 471],
 [1, 853],
 [1, 472],
 [1, 1035]]

In [45]:
model = linear_model.LogisticRegression(class_weight='balanced')
model.fit(X, y)
theta = model.coef_
theta

array([[-0.12545388,  0.00035402]])

In [46]:
train_predictions = model.predict(X)

In [47]:
TP = sum([a == b for a,b in zip(train_predictions, y) if b == True])
TN = sum([a == b for a,b in zip(train_predictions, y) if b == False])
FP = sum([a != b for a,b in zip(train_predictions, y) if b == False])
FN = sum([a != b for a,b in zip(train_predictions, y) if b == True])

In [48]:
BER = 1 / 2 * (FP / (FP + TN) + FN / (FN + TP))

In [49]:
answers['Q5'] = [TP, TN, FP, FN, BER]

In [50]:
assertFloatList(answers['Q5'], 5)

In [51]:
### Question 6

In [52]:
scores = model.decision_function(X)
scores[:10]

array([-0.15815583, -0.13125069, -0.11071781, -0.10894774,  0.15443946,
        0.00681518, -0.08416669,  0.05106707, -0.08381267,  0.11549781])

In [53]:
scorelabels = list(zip(scores, y))
scorelabels.sort(reverse=True)
scorelabels[:10]

[(1.4203973087838948, True),
 (1.408714811886357, True),
 (1.3478242219961605, True),
 (1.3127767313035474, True),
 (1.2858715869334605, True),
 (1.2582584124483713, True),
 (1.2239189518707605, False),
 (1.1994919129031816, True),
 (1.1803750997981197, True),
 (1.1743568438206002, False)]

In [54]:
sortedlabels = [x[1] for x in scorelabels]
sortedlabels[:10]

[True, True, True, True, True, True, False, True, True, False]

In [55]:
precs = []

In [56]:
for k in [1,100,1000,10000]:
    precs.append(sum(sortedlabels[:k]) / k)

In [57]:
answers['Q6'] = precs

In [58]:
assertFloatList(answers['Q6'], 4)

In [59]:
### Question 7

In [60]:
dataset[0]

{'review/appearance': 2.5,
 'beer/style': 'Hefeweizen',
 'review/palate': 1.5,
 'review/taste': 1.5,
 'beer/name': 'Sausa Weizen',
 'review/timeUnix': 1234817823,
 'beer/ABV': 5.0,
 'beer/beerId': '47986',
 'beer/brewerId': '10325',
 'review/timeStruct': {'isdst': 0,
  'mday': 16,
  'hour': 20,
  'min': 57,
  'sec': 3,
  'mon': 2,
  'year': 2009,
  'yday': 47,
  'wday': 0},
 'review/overall': 1.5,
 'review/text': 'A lot of foam. But a lot.\tIn the smell some banana, and then lactic and tart. Not a good start.\tQuite dark orange in color, with a lively carbonation (now visible, under the foam).\tAgain tending to lactic sourness.\tSame for the taste. With some yeast and banana.',
 'user/profileName': 'stcules',
 'review/aroma': 2.0}

In [61]:
styles = set([d['beer/style'] for d in dataset])
len(styles)

95

In [62]:
years = set([d['review/timeStruct']['year'] for d in dataset])
len(years)

14

In [63]:
def feature(datum):
    length = len(datum['review/text'])
    year = datum['review/timeStruct']['year']
    interval = max(years) - min(years)
    year_encode = [0 for x in range(interval)]
    if year - 1999 > 0:
        year_encode[year-1999-1]=1
    
    month = datum['review/timeStruct']['mon']
    month_encode = [0 for x in range(12)]
    if month - 1 > 0:
        month_encode[month-1-1]=1
    
    return [1, length] + year_encode + month_encode

In [64]:
X = [feature(d) for d in dataset]
y = [d['review/overall'] >= 4 for d in dataset]

In [65]:
model = linear_model.LogisticRegression(class_weight='balanced')
model.fit(X, y)
theta = model.coef_
theta

array([[-0.13097794,  0.00033221,  0.0070328 ,  0.01802125, -0.01425177,
        -0.12342961, -0.15986635, -0.05792455, -0.1928117 , -0.13000516,
         0.04601768,  0.17733399,  0.08005469,  0.14972701,  0.06782372,
         0.05586229,  0.02842379,  0.03325893,  0.0156555 , -0.03258929,
        -0.15867128, -0.07724513, -0.06820338,  0.07367379, -0.07665476,
        -0.03445798,  0.        ]])

In [66]:
train_predictions = model.predict(X)

In [67]:
TP = sum([a == b for a,b in zip(train_predictions, y) if b == True])
TN = sum([a == b for a,b in zip(train_predictions, y) if b == False])
FP = sum([a != b for a,b in zip(train_predictions, y) if b == False])
FN = sum([a != b for a,b in zip(train_predictions, y) if b == True])

In [68]:
BER2 = 1 / 2 * (FP / (FP + TN) + FN / (FN + TP))
BER2

0.45948290766129496

In [71]:
answers['Q7'] = ["I basically replicated the feature engineering part from \
regression to include more features from the dataset and then use \
one-hot encoding to incorporate the data into feature variable.", BER2]

In [72]:
f = open("answers_hw1.txt", 'w')
f.write(str(answers) + '\n')
f.close()