In [16]:
import gzip
import math
import matplotlib.pyplot as plt
import numpy
import random
import sklearn
import string
import json
from collections import defaultdict
#import gensim
#from gensim.models import Word2Vec
from nltk.stem.porter import *
from sklearn import linear_model
from sklearn.manifold import TSNE

In [14]:
# Location of the gzipped Goodreads comics / graphic-novel review dump
dataDir = "/Users/Judy-Ccino412/Desktop/cse158/data/"
path = "".join([dataDir, "goodreads_reviews_comics_graphic.json.gz"])
# Text mode ('rt') so that iterating the handle yields decoded str lines
f = gzip.open(path, mode='rt', encoding="utf8")

In [17]:
# Parse one JSON review per line into `data`.
# The `with` block closes the file even when a line fails to parse; the
# original only reached f.close() if the whole loop succeeded.
data = []
with f:
    for l in f:
        d = json.loads(l)
        data.append(d)

In [18]:
# Keep only the first 20,000 reviews for the experiments below
data = data[:20000]

In [19]:
# First 10,000 reviews are used for training, the remainder for testing
training, test = data[:10000], data[10000:]

In [24]:
# Display the first review to inspect the available fields
data[0]

{'user_id': 'bafc2d50014200cda7cb2b6acd60cd73',
 'book_id': '6315584',
 'review_id': '72f1229aba5a88f9e72f0dcdc007dd22',
 'rating': 4,
 'review_text': "I've never really liked Spider-Man. I am, however, a huge fan of the Dresden Files. Jim Butcher is clever and sarcastic and probably the perfect choice to pen a superhero novel. I really enjoyed this book!",
 'date_added': 'Wed Aug 10 06:06:48 -0700 2016',
 'date_updated': 'Fri Aug 12 08:49:54 -0700 2016',
 'read_at': 'Fri Aug 12 08:49:54 -0700 2016',
 'started_at': 'Wed Aug 10 00:00:00 -0700 2016',
 'n_votes': 0,
 'n_comments': 0}

In [27]:
# Count unigrams over all reviews, case-folded and with punctuation stripped
punctuation = set(string.punctuation)
wordCount = defaultdict(int)
for review in data:
    cleaned = ''.join(ch for ch in review['review_text'].lower() if ch not in punctuation)
    for token in cleaned.split():
        wordCount[token] += 1

# (count, word) pairs ordered from most to least frequent
counts = sorted(((n, token) for token, n in wordCount.items()), reverse=True)

In [30]:
# Vocabulary: the 1,000 most frequent unigrams
words = [token for _, token in counts[:1000]]

### Q1

#### (a) the 1,000 most common unigrams

In [52]:
# Map each vocabulary word to its column index in the feature vector
wordId = {token: idx for idx, token in enumerate(words)}
wordSet = set(words)

In [48]:
# Extract unigrams
def feature(datum):
    """Build the unigram count vector for one review.

    The review text is lower-cased and stripped of punctuation, then each
    token found in the vocabulary increments its column.

    Args:
        datum: review dict with a 'review_text' string.

    Returns:
        A list of len(words) + 1 ints: per-word counts followed by a
        constant 1 that serves as the offset term.
    """
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])

    for w in r.split():
        # O(1) dict lookup; `w in words` scanned the whole list for every token.
        idx = wordId.get(w)
        if idx is not None:
            feat[idx] += 1
    feat.append(1) #offset
    return feat

In [60]:
# Training design matrix (one feature row per review) and rating targets
X_train = list(map(feature, training))
y_train = [review['rating'] for review in training]

In [61]:
# Held-out design matrix and rating targets
X_test = list(map(feature, test))
y_test = [review['rating'] for review in test]

In [62]:
# Ridge regression: squared error plus an L2 penalty of strength 1.0.
# No separate intercept is fitted; the constant 1 appended by feature()
# provides the offset column instead.
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False)
clf.fit(X_train, y_train)
theta = clf.coef_
predictions = clf.predict(X_test)

In [63]:
# Pair each word with its coefficient (theta[:-1] drops the offset weight)
# and order ascending, most negative first
wordSort = sorted(zip(theta[:-1], words))

In [64]:
# five most negative tokens --- q1a
# (wordSort is in ascending coefficient order, so these come first)
wordSort[:5]

[(-0.536916069738369, 'boring'),
 (-0.5182917133361543, 'disappointing'),
 (-0.3460929055366586, 'worst'),
 (-0.3429126218755435, 'says'),
 (-0.3269298229656949, 'netgalley')]

In [65]:
# five most positive tokens --- q1a
# (wordSort is in ascending coefficient order, so these come last)
wordSort[-5:]

[(0.3259992758708088, '5'),
 (0.34507261538475226, 'yourself'),
 (0.3660392608123058, 'beautifully'),
 (0.40150437550777796, 'mix'),
 (0.4124567319120735, 'wait')]

In [67]:
def MSE(predict, rating):
    """Mean squared error between predictions and true ratings.

    Args:
        predict: sized sequence of predicted values.
        rating: sized sequence of true values, same length as `predict`.

    Returns:
        The mean of the squared differences.

    Raises:
        ValueError: if the two inputs differ in length. Previously zip()
            silently dropped the extra items while the divisor stayed
            len(predict), producing a wrong mean.
        ZeroDivisionError: if both inputs are empty.
    """
    if len(predict) != len(rating):
        raise ValueError(
            "predict and rating must have the same length "
            "(got %d and %d)" % (len(predict), len(rating))
        )
    SE = [(a-b)**2 for a,b in zip(predict, rating)]
    return sum(SE) / len(predict)

In [69]:
# q1a answer: test-set MSE of the unigram model
MSE(predictions, y_test)

1.2406880672909828

#### (b) the 1,000 most common bigrams

In [116]:
# Count bigrams (pairs of adjacent tokens) over all reviews
punctuation = set(string.punctuation)
wordCount = defaultdict(int)
for review in data:
    cleaned = ''.join(ch for ch in review['review_text'].lower() if ch not in punctuation)
    tokens = cleaned.split()
    # zip stops at the shorter input, so this pairs every token with its successor
    for first, second in zip(tokens, tokens[1:]):
        wordCount[' '.join((first, second))] += 1

# (count, bigram) pairs ordered from most to least frequent
counts = sorted(((n, token) for token, n in wordCount.items()), reverse=True)

In [117]:
# Vocabulary: the 1,000 most frequent bigrams
words = [token for _, token in counts[:1000]]
words[:10]

['of the',
 'in the',
 'the story',
 'and the',
 'is a',
 'to the',
 'this is',
 'to be',
 'it was',
 'with the']

In [104]:
# Map each vocabulary bigram to its column index in the feature vector
wordId = {token: idx for idx, token in enumerate(words)}
wordSet = set(words)

In [105]:
# Extract bigrams
def feature(datum):
    """Build the bigram count vector for one review.

    The review text is lower-cased and stripped of punctuation; adjacent
    tokens are joined with a space and counted when in the vocabulary.

    Args:
        datum: review dict with a 'review_text' string.

    Returns:
        A list of len(words) + 1 ints: per-bigram counts followed by a
        constant 1 that serves as the offset term.
    """
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
    ws = r.split()
    ws2 = [' '.join(x) for x in zip(ws[:-1], ws[1:])]

    for w in ws2:
        # O(1) dict lookup; `w in words` scanned the whole list for every token.
        idx = wordId.get(w)
        if idx is not None:
            feat[idx] += 1
    feat.append(1) #offset
    return feat

In [106]:
# Training design matrix (one feature row per review) and rating targets
X_train = list(map(feature, training))
y_train = [review['rating'] for review in training]

In [108]:
# Held-out design matrix and rating targets
X_test = list(map(feature, test))
y_test = [review['rating'] for review in test]

In [109]:
# Ridge regression: squared error plus an L2 penalty of strength 1.0.
# No separate intercept is fitted; the constant 1 appended by feature()
# provides the offset column instead.
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False)
clf.fit(X_train, y_train)
theta = clf.coef_
predictions = clf.predict(X_test)

In [110]:
# Pair each bigram with its coefficient (theta[:-1] drops the offset weight)
# and order ascending, most negative first
wordSort = sorted(zip(theta[:-1], words))

In [111]:
# five most negative tokens --- q1b
# (wordSort is in ascending coefficient order, so these come first)
wordSort[:5]

[(-1.0191491159458883, 'tuned for'),
 (-0.670437741494759, 'miss your'),
 (-0.6218927493203671, 'the worst'),
 (-0.5486320274003823, 'a bad'),
 (-0.5397464147555986, 'too many')]

In [112]:
# five most positive tokens --- q1b
# (wordSort is in ascending coefficient order, so these come last)
wordSort[-5:]

[(0.5652500971915658, 'reviews as'),
 (0.5672097729576266, '5 stars'),
 (0.6312080751287221, 'stay tuned'),
 (0.6348684682302738, 'cant wait'),
 (0.7935427771070356, 'forget to')]

In [113]:
# q1b answer: test-set MSE of the bigram model
MSE(predictions, y_test)

1.2931704233245027

#### (c) a model that uses a combination of unigrams and bigrams (i.e., some bigrams will be included if they are more popular than some unigrams, but the model dimensionality will still be 1,000)

In [119]:
# Count unigrams and bigrams together in a single table
punctuation = set(string.punctuation)
wordCount = defaultdict(int)
for review in data:
    cleaned = ''.join(ch for ch in review['review_text'].lower() if ch not in punctuation)
    tokens = cleaned.split()
    # zip stops at the shorter input, so this pairs every token with its successor
    pairs = [' '.join((first, second)) for first, second in zip(tokens, tokens[1:])]

    for token in tokens + pairs:
        wordCount[token] += 1

# (count, token) pairs ordered from most to least frequent
counts = sorted(((n, token) for token, n in wordCount.items()), reverse=True)

In [120]:
# Vocabulary: the 1,000 most frequent tokens, unigrams and bigrams mixed
words = [token for _, token in counts[:1000]]
words[:10]

['the', 'and', 'a', 'of', 'to', 'i', 'is', 'this', 'it', 'in']

In [121]:
# Map each vocabulary token to its column index in the feature vector
wordId = {token: idx for idx, token in enumerate(words)}
wordSet = set(words)

In [122]:
# Extract unigrams and bigrams
def feature(datum):
    """Build the combined unigram + bigram count vector for one review.

    The review text is lower-cased and stripped of punctuation; both the
    single tokens and the space-joined adjacent pairs are counted when they
    appear in the vocabulary.

    Args:
        datum: review dict with a 'review_text' string.

    Returns:
        A list of len(words) + 1 ints: per-token counts followed by a
        constant 1 that serves as the offset term.
    """
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
    ws = r.split()
    ws2 = [' '.join(x) for x in zip(ws[:-1], ws[1:])]

    for w in ws + ws2:
        # O(1) dict lookup; `w in words` scanned the whole list for every token.
        idx = wordId.get(w)
        if idx is not None:
            feat[idx] += 1
    feat.append(1) #offset
    return feat

In [123]:
# Training design matrix (one feature row per review) and rating targets
X_train = list(map(feature, training))
y_train = [review['rating'] for review in training]

In [124]:
# Held-out design matrix and rating targets
X_test = list(map(feature, test))
y_test = [review['rating'] for review in test]

In [125]:
# Ridge regression: squared error plus an L2 penalty of strength 1.0.
# No separate intercept is fitted; the constant 1 appended by feature()
# provides the offset column instead.
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False)
clf.fit(X_train, y_train)
theta = clf.coef_
predictions = clf.predict(X_test)

In [126]:
# Pair each token with its coefficient (theta[:-1] drops the offset weight)
# and order ascending, most negative first
wordSort = sorted(zip(theta[:-1], words))

In [127]:
# five most negative tokens --- q1c
# (wordSort is in ascending coefficient order, so these come first)
wordSort[:5]

[(-0.3782305961695087, 'katies corner'),
 (-0.36508605822423024, 'share'),
 (-0.34896463855317117, 'what is'),
 (-0.32914369041652286, 'least'),
 (-0.2962503425695745, 'able to')]

In [128]:
# five most positive tokens --- q1c
# (wordSort is in ascending coefficient order, so these come last)
wordSort[-5:]

[(0.31253965878410933, 'at least'),
 (0.35679000512083486, 'excellent'),
 (0.3709194034027991, 'wait'),
 (0.5071879004779356, 'able'),
 (0.5074106030159998, 'katies')]

In [129]:
# q1c answer: test-set MSE of the combined unigram + bigram model
MSE(predictions, y_test)

1.2372390162711893

### Q2

In [145]:
# Query review for the tf-idf similarity search: the first review in the data
rev = data[0]

In [140]:
# Count unigrams over the training reviews only
punctuation = set(string.punctuation)
wordCount = defaultdict(int)
for review in training:
    cleaned = ''.join(ch for ch in review['review_text'].lower() if ch not in punctuation)
    for token in cleaned.split():
        wordCount[token] += 1

In [141]:
# (count, word) pairs ordered from most to least frequent
counts = sorted(((n, token) for token, n in wordCount.items()), reverse=True)

In [142]:
# Vocabulary: the 1,000 most frequent training unigrams
words = [token for _, token in counts[:1000]]

In [143]:
# Document frequency (df): number of training reviews that contain each word
df = defaultdict(int)
for review in training:
    cleaned = ''.join(ch for ch in review['review_text'].lower() if ch not in punctuation)
    # set() so a word counts at most once per review
    for token in set(cleaned.split()):
        df[token] += 1

In [172]:
# Term frequency (tf) of the query review
tf = defaultdict(int)
r = ''.join([c for c in rev['review_text'].lower() if not c in punctuation])

for w in r.split():
    # Raw occurrence count; other tf variants (e.g. binary, using `=` instead
    # of `+=`) could be substituted here.
    tf[w] += 1

# tf-idf: term frequency times log2(N / df), restricted to the vocabulary.
# The vector is computed once and reused for the word -> score mapping.
tfidfQuery = [tf[w] * math.log2(len(training) / df[w]) for w in words]
tfidf = dict(zip(words, tfidfQuery))

In [164]:
# Rank vocabulary words for the query review, highest score first,
# by raw term frequency and by tf-idf
maxTf = sorted(((tf[w], w) for w in words), reverse=True)
maxTfIdf = sorted(((tfidf[w], w) for w in words), reverse=True)
maxTfIdf[:10]

[(6.643856189774724, 'spiderman'),
 (6.601211852366231, 'choice'),
 (6.493296513199343, 'clever'),
 (5.9839316313723465, 'huge'),
 (5.423526234895169, 'perfect'),
 (5.227016447861896, 'superhero'),
 (4.787866492466244, 'probably'),
 (4.7027498788282935, 'fan'),
 (4.461163892258535, 'enjoyed'),
 (4.386845571568701, 'liked')]

In [175]:
def Cosine(x1,x2):
    """Cosine similarity of two numeric vectors.

    Elements are paired positionally; if the inputs differ in length the
    extra elements of the longer one are ignored. Returns 0 when the
    product of the squared norms is zero (e.g. an all-zero vector).
    """
    pairs = list(zip(x1, x2))
    dot = sum(u*v for u, v in pairs)
    sq1 = sum(u**2 for u, _ in pairs)
    sq2 = sum(v**2 for _, v in pairs)
    denom = sq1*sq2
    if not denom:
        return 0
    return dot / math.sqrt(denom)

In [183]:
# Find the other reviews in the corpus with the highest
# cosine similarity between tf-idf vectors

# idf depends only on the corpus, so compute it once instead of once per review.
idf = [math.log2(len(training) / df[w]) for w in words]

similarities = []
for rev2 in training:
    # Separate name so the query review's global `tf` is not overwritten.
    tf2 = defaultdict(int)
    r = ''.join([c for c in rev2['review_text'].lower() if not c in punctuation])
    for w in r.split():
        # Raw occurrence count, matching the tf used for the query vector
        tf2[w] += 1
    tfidf2 = [tf2[w] * weight for w, weight in zip(words, idf)]
    similarities.append((Cosine(tfidfQuery, tfidf2), rev2['review_text'], rev2['review_id']))

In [184]:
# q2 answer: second entry after ordering by similarity, highest first
# NOTE(review): presumably index 0 is the query review matching itself -- confirm
similarities = sorted(similarities, reverse=True)
similarities[1]

(0.3372386728480706,
 "Pitch perfect grief. I wasn't a huge fan of Damien, and this even made me miss the little twerp.",
 'dd886dce0714255819a30ec894d29889')