In [1]:
import gzip
import math
import matplotlib.pyplot as plt
import numpy
import random
import sklearn
import string
from collections import defaultdict

from sklearn import linear_model
from sklearn.manifold import TSNE

from pprint import pprint

In [3]:
data = []

with open("finefoods.txt", encoding="latin") as myfile:
    lines = myfile.readlines()
    lines = [line for line in lines]

data_list = []
internal_data = []

In [4]:
for i in lines:
  if i == '\n':
    data_list.append(internal_data)
    internal_data = []
  else:
    internal_data.append(i.strip())

length = defaultdict(int)
columns = ['product/productId', 'review/userId', 'review/profileName', 'review/helpfulness', 'review/score', 'review/time', 'review/summary', 'review/text']
for i in data_list:
  row_dict = defaultdict(str)
  for j in i:
    if j.split(":")[0] in columns:
      row_dict[j.split(":")[0]] += j.split(":")[1]
  data.append(row_dict)

train = int(len(data) * 0.5)
valid = int(len(data) * 0.75)

train_data = data[:train]
valid_data = data[train:valid]
test_data = data[valid:]

In [5]:
unigramWordCount = defaultdict(int)
bigramWordCount = defaultdict(int)
bothWordCount = defaultdict(int)

In [6]:
punctuation = set(string.punctuation)
for d in train_data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])

    unigrams =  r.split()

    for w in unigrams:
        unigramWordCount[w] += 1
        bothWordCount[w] += 1

    for b in list(zip(unigrams[:-1], unigrams[1:])):
        bigram = ' '.join(b)

        bigramWordCount[bigram] += 1
        bothWordCount[bigram] += 1

unigramCounts = [(unigramWordCount[w], w) for w in unigramWordCount]
unigramCounts.sort()
unigramCounts.reverse()

bigramCounts = [(bigramWordCount[w], w) for w in bigramWordCount]
bigramCounts.sort()
bigramCounts.reverse()

bothCounts = [(bothWordCount[w], w) for w in bothWordCount]
bothCounts.sort()
bothCounts.reverse()

In [7]:
def MSE(predictions, labels):
  differences = [(x - y) ** 2 for x, y in zip(predictions, labels)]
  return sum(differences) / len(differences)

unigramWords = [x[1] for x in unigramCounts[:1000]]
unigramWordId = dict(zip(unigramWords, range(len(unigramWords))))
unigramWordSet = set(unigramWords)

bigramWords = [x[1] for x in bigramCounts[:1000]]
bigramWordId = dict(zip(bigramWords, range(len(bigramWords))))
bigramWordSet = set(bigramWords)

bothWords = [x[1] for x in bothCounts[:1000]]
bothWordId = dict(zip(bothWords, range(len(bothWords))))
bothWordSet = set(bothWords)

In [8]:
def feature(datum, words, wordId):
    feat = [0] * len(words)
    r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])

    unigrams = r.split()
    for w in unigrams:
        if w in words:
            feat[wordId[w]] += 1

    for b in list(zip(unigrams[:-1], unigrams[1:])):
        bigram = ' '.join(b)

        if bigram in words:
            feat[wordId[bigram]] += 1

    feat.append(1) # offset
    return feat

In [9]:
# Unigram
X = [feature(d, unigramWords, unigramWordId) for d in train_data]
y = [float(d['review/score']) for d in train_data]

clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_

predictions = clf.predict(X)
print(f'The MSE of the unigram model on training: {MSE(predictions, y)}')

X_valid = [feature(d, unigramWords, unigramWordId) for d in valid_data]
y_valid = [float(d['review/score']) for d in valid_data]

predictions_valid = clf.predict(X_valid)
print(f'The MSE of the unigram model on validation: {MSE(predictions_valid, y_valid)}')

X_test = [feature(d, unigramWords, unigramWordId) for d in test_data]
y_test = [float(d['review/score']) for d in test_data]

predictions_test = clf.predict(X_test)
print(f'The MSE of the unigram model on test: {MSE(predictions_test, y_test)}')

unigramSentiment = list(zip(theta[:-1], unigramWords))
unigramSentiment.sort()
print(f'100 most negative unigrams: ')
pprint(unigramSentiment[:100])
print(f'100 most positive unigrams: ')
pprint(unigramSentiment[-100:])

The MSE of the unigram model on training: 1.066521590180318
The MSE of the unigram model on validation: 1.095352268638909
The MSE of the unigram model on test: 1.0824932197608654
100 most negative unigrams: 
[(-0.8506624657834093, 'awful'),
 (-0.7945820379782581, 'terrible'),
 (-0.7928052315776569, 'horrible'),
 (-0.6359122606676837, 'return'),
 (-0.5659952582749989, 'disappointed'),
 (-0.5246498877323593, 'stale'),
 (-0.5217240792804262, 'unfortunately'),
 (-0.5171274634081543, 'waste'),
 (-0.45484423313957995, 'china'),
 (-0.4381003556323385, 'money'),
 (-0.4215820900076817, 'bland'),
 (-0.4091023564437774, 'hoping'),
 (-0.3740961017111608, 'description'),
 (-0.37102532416275236, 'thinking'),
 (-0.3671808037564827, 'weak'),
 (-0.3622783353990335, 'throw'),
 (-0.34370615200665083, 'changed'),
 (-0.3284030444708304, 'ended'),
 (-0.3237007903981493, 'unless'),
 (-0.2916215622570219, 'excited'),
 (-0.2892988036827173, 'ok'),
 (-0.28880524320237555, 'sick'),
 (-0.28683833480252297, 'avoid

In [11]:
# Bigram
X = [feature(d, bigramWords, bigramWordId) for d in train_data]
y = [float(d['review/score']) for d in train_data]

clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_

predictions = clf.predict(X)
print(f'The MSE of the bigram model on training: {MSE(predictions, y)}')

X_valid = [feature(d, bigramWords, bigramWordId) for d in valid_data]
y_valid = [float(d['review/score']) for d in valid_data]

predictions_valid = clf.predict(X_valid)
print(f'The MSE of the bigram model on validation: {MSE(predictions_valid, y_valid)}')

X_test = [feature(d, bigramWords, bigramWordId) for d in test_data]
y_test = [float(d['review/score']) for d in test_data]

predictions_test = clf.predict(X_test)
print(f'The MSE of the bigram model on test: {MSE(predictions_test, y_test)}')

bigramSentiment = list(zip(theta[:-1], bigramWords))
bigramSentiment.sort()
print(f'100 most negative bigrams: ')
pprint(bigramSentiment[:100])
print(f'100 most positive bigrams: ')
pprint(bigramSentiment[-100:])

The MSE of the bigram model on training: 1.2258086127248826
The MSE of the bigram model on validation: 1.2448124077327882
The MSE of the bigram model on test: 1.2492160807644945
100 most negative bigrams: 
[(-0.7501180765859198, 'wish'),
 (-0.5797350986935143, 'usually'),
 (-0.5766518857410379, 'perhaps'),
 (-0.5092356039918285, 'difficult'),
 (-0.48822452983711384, 'tend'),
 (-0.3814114904561342, 'tastes'),
 (-0.3729295078104443, 'coming'),
 (-0.37281006472156347, 'almonds'),
 (-0.3688825524036575, 'cocoa'),
 (-0.36802004124984444, 'end'),
 (-0.3624981355252347, 'feeling'),
 (-0.35966141541623875, 'grains'),
 (-0.35927164173208764, 'cheap'),
 (-0.35694732834597, 'vet'),
 (-0.35466101617091716, 'order'),
 (-0.3544085250467602, 'reviews'),
 (-0.35423668649902557, 'eaten'),
 (-0.3503514547682615, 'disappointed'),
 (-0.33708949224292234, 'due'),
 (-0.334377493627467, 'same'),
 (-0.32251035403686446, 'ginger'),
 (-0.3129390154865125, 'fish'),
 (-0.3111009965351897, 'immediately'),
 (-0.297

In [12]:
# Both
X = [feature(d, bothWords, bothWordId) for d in train_data]
y = [float(d['review/score']) for d in train_data]

clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_

predictions = clf.predict(X)
print(f'The MSE of the model with both unigram and bigram on training: {MSE(predictions, y)}')

X_valid = [feature(d, bothWords, bothWordId) for d in valid_data]
y_valid = [float(d['review/score']) for d in valid_data]

predictions_valid = clf.predict(X_valid)
print(f'The MSE of the model with both unigram and bigram on validation: {MSE(predictions_valid, y_valid)}')

X_test = [feature(d, bothWords, bothWordId) for d in test_data]
y_test = [float(d['review/score']) for d in test_data]

predictions_test = clf.predict(X_test)
print(f'The MSE of model with both unigram and bigram on test: {MSE(predictions_test, y_test)}')

bothSentiment = list(zip(theta[:-1], bothWords))
bothSentiment.sort()
print(f'100 most negative unigram + bigrams: ')
pprint(bothSentiment[:100])
print(f'100 most positive unigram + bigrams: ')
pprint(bothSentiment[-100:])

The MSE of the model with both unigram and bigram on training: 1.1044519407462328
The MSE of the model with both unigram and bigram on validation: 1.1312849272369578
The MSE of model with both unigram and bigram on test: 1.1189631670151514
100 most negative unigram + bigrams: 
[(-0.5827692538823823, 'disappointed'),
 (-0.5555536965917466, 'money'),
 (-0.392177857103146, 'instead'),
 (-0.35106164628447234, 'will not'),
 (-0.31304553191754214, 'away'),
 (-0.30797817036995945, 'wouldnt'),
 (-0.2945178969088847, 'at all'),
 (-0.2911367289508302, 'ok'),
 (-0.28683490102018866, 'stick'),
 (-0.27241551396381414, 'bad'),
 (-0.23584458292940666, 'maybe'),
 (-0.23579500157085592, 'opened'),
 (-0.23387239653636427, 'guess'),
 (-0.22620696256534475, 'tastes like'),
 (-0.20391143845404297, 'same'),
 (-0.20285310687455574, 'wont'),
 (-0.20122370772557685, 'least'),
 (-0.1940740090997025, 'should'),
 (-0.19362025543516634, 'back to'),
 (-0.18843248911210192, 'cannot'),
 (-0.18714202078419412, 'rather

In [13]:
neg_text = [(-0.8506624657834093, 'awful'),
 (-0.7945820379782581, 'terrible'),
 (-0.7928052315776569, 'horrible'),
 (-0.6359122606676837, 'return'),
 (-0.5659952582749989, 'disappointed'),
 (-0.5246498877323593, 'stale'),
 (-0.5217240792804262, 'unfortunately'),
 (-0.5171274634081543, 'waste'),
 (-0.45484423313957995, 'china'),
 (-0.4381003556323385, 'money'),
 (-0.4215820900076817, 'bland'),
 (-0.4091023564437774, 'hoping'),
 (-0.3740961017111608, 'description'),
 (-0.37102532416275236, 'thinking'),
 (-0.3671808037564827, 'weak'),
 (-0.3622783353990335, 'throw'),
 (-0.34370615200665083, 'changed'),
 (-0.3284030444708304, 'ended'),
 (-0.3237007903981493, 'unless'),
 (-0.2916215622570219, 'excited'),
 (-0.2892988036827173, 'ok'),
 (-0.28880524320237555, 'sick'),
 (-0.28683833480252297, 'avoid'),
 (-0.2840244642974307, 'stick'),
 (-0.2666513639459318, 'wouldnt'),
 (-0.24979631245939396, 'date'),
 (-0.24442827616185228, 'thought'),
 (-0.24413477128266656, 'guess'),
 (-0.2440067951617926, 'away'),
 (-0.23451814471002833, 'touch'),
 (-0.22558558844166338, 'please'),
 (-0.2241152503038634, 'broken'),
 (-0.22334572630386365, 'bad'),
 (-0.21440456396634539, 'idea'),
 (-0.2106685217461198, 'okay'),
 (-0.20484773642968554, 'opened'),
 (-0.20337043220525242, 'mess'),
 (-0.199934545385239, 'maybe'),
 (-0.19887314363992267, 'cannot'),
 (-0.19295553076024685, 'label'),
 (-0.19248206889232128, 'rest'),
 (-0.19246541273408424, 'perhaps'),
 (-0.19238028245975755, 'rather'),
 (-0.18344807519408277, 'star'),
 (-0.18126606944897242, 'contains'),
 (-0.17954652938142324, 'ingredient'),
 (-0.17648262420089728, 'received'),
 (-0.16498642666686897, 'supposed'),
 (-0.16473157182028328, 'wont'),
 (-0.16282822873858493, 'gave'),
 (-0.15946925204831572, 'should'),
 (-0.15790172036811317, 'however'),
 (-0.15785659754778447, 'customer'),
 (-0.15620606240964927, 'otherwise'),
 (-0.15356435243910654, 'item'),
 (-0.15279439439091108, 'feeding'),
 (-0.15271170404801448, 'not'),
 (-0.1485572132948373, 'couldnt'),
 (-0.14842102083577324, 'finish'),
 (-0.14809595185489333, 'company'),
 (-0.14728861022530643, 'tasted'),
 (-0.1448407937580905, 'old'),
 (-0.1447275712823901, 'bought'),
 (-0.14406734043744435, 'batch'),
 (-0.14377709758095242, 'didnt'),
 (-0.14222807723156652, 'nothing'),
 (-0.14138972091555146, 'expected'),
 (-0.13966848004446597, 'aftertaste'),
 (-0.1362465356379665, 'smells'),
 (-0.13520153636030519, 'called'),
 (-0.13486324145400486, '50'),
 (-0.1326314499865796, 'why'),
 (-0.13256685449456274, 'decent'),
 (-0.13154403047449192, 'off'),
 (-0.1294908954940671, 'k'),
 (-0.12919872926722578, 'sent'),
 (-0.1284957249473008, 'purchased'),
 (-0.12802844424400084, 'mouth'),
 (-0.12653201264415379, 'tiny'),
 (-0.12601096364614003, 'extremely'),
 (-0.12355569279083957, 'website'),
 (-0.12222459597741923, 'expecting'),
 (-0.12194260425742191, 'wanted'),
 (-0.11987398513079937, 'cans'),
 (-0.11613702106741439, 'artificial'),
 (-0.1157559739601642, 'someone'),
 (-0.11401456929906599, 'ill'),
 (-0.11138092732310766, 'bite'),
 (-0.10776591818838185, 'salty'),
 (-0.10732705557789861, 'ordered'),
 (-0.10699468642448229, 'instant'),
 (-0.10652715762251345, 'instead'),
 (-0.10466853518302072, 'brewing'),
 (-0.10462793859901663, 'opinion'),
 (-0.10328074823416165, 'switch'),
 (-0.10239130155228597, 'products'),
 (-0.10183757219539082, 'leaves'),
 (-0.1018188361244204, 'sour'),
 (-0.10167892849671809, 'left'),
 (-0.10141875043824428, 'check')]

neg_text_words = [x[1] for x in neg_text]
neg_text_wordId = dict(zip(neg_text_words, range(len(neg_text_words))))

In [14]:
pos_text = [(0.08946812491980566, 'convenient'),
 (0.0901851306102546, 'fun'),
 (0.09183225022252016, 'condition'),
 (0.09264356454462333, 'needed'),
 (0.09301299272146069, 'day'),
 (0.09321535851678167, 'finding'),
 (0.09414475166414032, 'myself'),
 (0.0947801314274065, 'tell'),
 (0.09492481284643108, 'benefits'),
 (0.09730156090338063, 'everything'),
 (0.09815154332890606, 'locally'),
 (0.09841139665093362, 'packaged'),
 (0.09899844240084611, 'find'),
 (0.09924542168308505, 'satisfying'),
 (0.09984399800301887, 'remember'),
 (0.0999218899966003, 'soon'),
 (0.10288510299722599, 'become'),
 (0.10295266240199105, 'liked'),
 (0.10395331651173154, 'without'),
 (0.10421408446187741, 'enjoyed'),
 (0.10526704540311128, 'always'),
 (0.1054298217169893, 'wow'),
 (0.10555145683446521, 'bulk'),
 (0.10707781899544795, 'worked'),
 (0.11235449004087242, 'christmas'),
 (0.11247340842072705, 'yes'),
 (0.1137376659853925, 'plan'),
 (0.11401707283452328, 'works'),
 (0.1154091205383399, 'good'),
 (0.11782996475156722, 'able'),
 (0.11819294282406427, 'hands'),
 (0.1200807302390082, 'anyway'),
 (0.12436534504199688, 'difference'),
 (0.12455635722245124, 'fresh'),
 (0.12628869993945904, 'years'),
 (0.127508904367599, 'issues'),
 (0.12872038675628694, 'healthier'),
 (0.12932519541049387, 'anywhere'),
 (0.13035810641090045, 'crazy'),
 (0.1313158342485924, 'note'),
 (0.1328029698456841, 'ready'),
 (0.13440503721340633, 'moist'),
 (0.1351770076826867, 'delivered'),
 (0.13683889999371068, 'continue'),
 (0.1386143644344962, 'keeps'),
 (0.14045072840072229, 'quickly'),
 (0.14049129535976823, 'tasty'),
 (0.14205217806161685, 'pricey'),
 (0.1424725831507062, 'right'),
 (0.14255980413686373, 'run'),
 (0.14268203234291132, 'easy'),
 (0.14335109366925272, 'future'),
 (0.14341849639119797, 'friends'),
 (0.14456894174401747, 'carry'),
 (0.14606739840614186, 'flavorful'),
 (0.14816132653559091, 'rich'),
 (0.15262253430255474, 'helps'),
 (0.15993183738415068, 'fast'),
 (0.16119726056085662, 'nice'),
 (0.16125564805869913, 'pleasant'),
 (0.1631523760883954, 'definitely'),
 (0.1645744898732084, 'unlike'),
 (0.16465558961004625, 'everyone'),
 (0.16480811072810453, 'impressed'),
 (0.16529629968215825, 'stores'),
 (0.166060673212737, 'reasonable'),
 (0.1660626079516025, 'subtle'),
 (0.16617703669360284, 'loved'),
 (0.16895166166358674, 'picky'),
 (0.17021048391078203, 'love'),
 (0.17072561312516177, 'youll'),
 (0.19262216213199249, 'favorite'),
 (0.19980021597698636, 'happy'),
 (0.20052870771058148, 'easier'),
 (0.20680544816103094, 'hit'),
 (0.21012975489778485, 'smooth'),
 (0.22099081442503396, 'thanks'),
 (0.22335017453148165, 'surprised'),
 (0.2252135954327306, 'wait'),
 (0.22948596922365405, 'refreshing'),
 (0.23550963357017834, 'yummy'),
 (0.24497817749447184, 'loves'),
 (0.24579664586058586, 'thank'),
 (0.24671423550212943, 'perfectly'),
 (0.2566643615749681, 'great'),
 (0.2836664343228445, 'wonderful'),
 (0.2887067469591782, 'highly'),
 (0.29320205758326656, 'perfect'),
 (0.29709634055255096, 'fantastic'),
 (0.298501631193101, 'glad'),
 (0.300798801694409, 'best'),
 (0.3064282320321241, 'exactly'),
 (0.3109836834531771, 'amazing'),
 (0.3134925637261532, 'delicious'),
 (0.3349412421713682, 'excellent'),
 (0.3367320685692052, 'satisfied'),
 (0.3452045085725943, 'awesome'),
 (0.34808942593142, 'beat'),
 (0.35813171145083833, 'hooked'),
 (0.36187356971504525, 'pleased')]

pos_text_words = [x[1] for x in pos_text]
pos_text_wordId = dict(zip(pos_text_words, range(len(pos_text_words))))

In [15]:
neg_summary = [(-2.552097154267836, 'worst'),
 (-2.4704706698398717, 'disgusting'),
 (-2.2970872574982106, 'awful'),
 (-2.256396597110124, 'disappointment'),
 (-2.1908153143084297, 'yuck'),
 (-2.1497838737880235, 'horrible'),
 (-2.1457010417311806, 'nasty'),
 (-2.130588527365515, 'terrible'),
 (-2.0842687272857447, 'dangerous'),
 (-2.0767913173789827, 'tasteless'),
 (-2.0167959756373044, 'disappointed'),
 (-1.974966036608686, 'disappointing'),
 (-1.9245173125014479, 'gross'),
 (-1.8647265361031133, 'sick'),
 (-1.7984602097324063, 'poor'),
 (-1.7841263655212163, 'expired'),
 (-1.7790426859962558, 'rip'),
 (-1.7645663997776173, 'stale'),
 (-1.6974157737924318, 'wheres'),
 (-1.6929294004074023, 'melted'),
 (-1.6099675136143, 'misleading'),
 (-1.5691456207418149, 'false'),
 (-1.5292992632016507, 'china'),
 (-1.4852279561171295, 'beware'),
 (-1.4633448286251425, 'didnt'),
 (-1.4220690246484702, 'broken'),
 (-1.3730242941160327, 'warning'),
 (-1.3592475181689607, 'mediocre'),
 (-1.315000393715326, 'junk'),
 (-1.307683296778811, 'meh'),
 (-1.2941612760562666, 'contains'),
 (-1.2816692204866205, 'bland'),
 (-1.2685350637136836, 'weak'),
 (-1.2619418879292659, 'damaged'),
 (-1.2512287954719077, 'filler'),
 (-1.2272089062883245, 'strange'),
 (-1.2248343180372667, 'empty'),
 (-1.192578509735641, 'plastic'),
 (-1.1856621346754257, 'overpriced'),
 (-1.1751030258388502, 'batch'),
 (-1.1627499614241124, 'not'),
 (-1.154627096097227, 'waste'),
 (-1.1489254063921956, 'soso'),
 (-1.1134886192045408, 'msg'),
 (-1.106327512252196, 'dented'),
 (-1.095845427335176, 'wouldnt'),
 (-1.059363942356808, 'changed'),
 (-1.0427484462091587, 'hate'),
 (-1.0391175427933372, 'watery'),
 (-1.0322612721653757, 'weird'),
 (-1.0279238184664594, 'careful'),
 (-1.0270992674548791, 'wrong'),
 (-1.0145307635113558, 'artificial'),
 (-0.9570261346412929, 'dont'),
 (-0.9534131248805753, 'fake'),
 (-0.9249260370651451, 'doesnt'),
 (-0.9019780859659494, 'away'),
 (-0.9006599276798379, 'wont'),
 (-0.8802065452622888, 'odd'),
 (-0.8740182780768699, 'gave'),
 (-0.8306259454910659, 'edible'),
 (-0.7932755073616592, 'zero'),
 (-0.7914349685681973, 'open'),
 (-0.782511799079603, 'instead'),
 (-0.770370375616778, 'nothing'),
 (-0.767897951692879, 'touch'),
 (-0.7504759941679131, 'received'),
 (-0.7347116997227175, 'short'),
 (-0.733193685321928, 'okay'),
 (-0.7256780431462242, 'salty'),
 (-0.7222787150889306, 'idea'),
 (-0.7195797439904968, 'messy'),
 (-0.7143115287715289, 'advertising'),
 (-0.7139427253975696, 'off'),
 (-0.7124935556087938, 'check'),
 (-0.7012793929624622, 'ok'),
 (-0.6975633504753553, 'leaves'),
 (-0.6926672270528664, 'bring'),
 (-0.6921241142607881, 'average'),
 (-0.6910530254705499, 'tough'),
 (-0.6773313088891505, 'bitter'),
 (-0.6756232588962606, 'missing'),
 (-0.6717617898845725, 'switch'),
 (-0.6510380659802236, 'description'),
 (-0.645727156160756, 'stinky'),
 (-0.6385832099375672, 'date'),
 (-0.6226315194095677, 'least'),
 (-0.6215675124646728, 'wanted'),
 (-0.6212363176953066, 'watch'),
 (-0.6190198919205934, 'control'),
 (-0.6096464674359205, 'aftertaste'),
 (-0.6033614548319146, 'before'),
 (-0.6025668840841893, 'expensive'),
 (-0.6002836360718066, 'ingredients'),
 (-0.5965845508036008, 'mess'),
 (-0.5945171851729958, 'couldnt'),
 (-0.593979393952705, 'pay'),
 (-0.5886507589756871, 'isnt'),
 (-0.5791129649436029, 'longer'),
 (-0.5706178714975829, 'down')]

neg_summary_words = [x[1] for x in neg_summary]
neg_summary_wordId = dict(zip(neg_summary_words, range(len(neg_summary_words))))

In [16]:
pos_summary = [(0.4117241840508589, 'enjoyable'),
 (0.4127168472622979, 'daily'),
 (0.414740569438855, 'stores'),
 (0.421110844498426, 'versatile'),
 (0.4236628881545088, 'super'),
 (0.4270508803368187, 'true'),
 (0.4302308519008908, 'must'),
 (0.4308341962127311, 'find'),
 (0.4313406430731228, 'usa'),
 (0.43283709587909935, 'greenies'),
 (0.43326787543892165, 'unique'),
 (0.4344513957401311, 'life'),
 (0.4380091821135914, 'works'),
 (0.4403767737688451, 'satisfying'),
 (0.4449632018574562, 'effective'),
 (0.4501371649978727, 'expect'),
 (0.45175632993499354, 'tasty'),
 (0.45275567336778916, 'bran'),
 (0.4585244220634663, 'drinker'),
 (0.46594161649985943, 'refreshing'),
 (0.4716461439440986, 'handy'),
 (0.4725443684404369, 'five'),
 (0.47687475905854143, 'loved'),
 (0.48543508189364276, 'keeps'),
 (0.49181752792011785, 'mmmm'),
 (0.4929006100396407, 'homemade'),
 (0.4932249562580302, 'smooth'),
 (0.49553857667634604, 'zukes'),
 (0.4977045005045112, 'crystal'),
 (0.5030008325732109, 'perfectly'),
 (0.5086872753059851, 'saver'),
 (0.5197542765480924, 'goodness'),
 (0.5222804583827481, 'lover'),
 (0.5241036997707729, 'years'),
 (0.5407289760706604, 'top'),
 (0.5407585320037162, 'bulk'),
 (0.5430064016303735, 'memories'),
 (0.5434807159332935, 'paws'),
 (0.5436945387828828, 'difference'),
 (0.5494622768806274, 'helps'),
 (0.5510598235867902, 'classic'),
 (0.5519587526487473, 'without'),
 (0.5545687014788604, 'soothing'),
 (0.5552164853450056, 'home'),
 (0.5555738538940305, 'nutiva'),
 (0.5640393152364213, 'wow'),
 (0.5659219241781156, 'pleasantly'),
 (0.5751034835196445, 'rocks'),
 (0.5819345054305314, 'happy'),
 (0.5847653008318618, 'hit'),
 (0.587066596741028, 'dream'),
 (0.590421877597505, 'staple'),
 (0.5961033689686511, 'mmmmm'),
 (0.5986005956495387, 'glad'),
 (0.6012140580509561, 'love'),
 (0.6127486909494732, 'loves'),
 (0.6180764785056787, 'yum'),
 (0.619629643969523, 'delight'),
 (0.6342552811547707, 'hands'),
 (0.6379898931628903, 'magic'),
 (0.6410938853920092, 'satisfied'),
 (0.6470785205484207, 'lovely'),
 (0.6612955261566108, 'stop'),
 (0.666081830941371, 'beautiful'),
 (0.6816778637480024, 'calm'),
 (0.6867097190080991, 'crack'),
 (0.6887181402503102, 'finally'),
 (0.6939574602310205, 'helped'),
 (0.6960290423287503, 'approved'),
 (0.7013377968653721, 'pleased'),
 (0.7126602655538496, 'winner'),
 (0.7134628767333722, 'favorite'),
 (0.7140588520547229, 'yummy'),
 (0.7245294576970404, 'beat'),
 (0.7264956467138625, 'great'),
 (0.7326376578277495, 'addicted'),
 (0.7438921326003072, 'addictive'),
 (0.7645396844393818, 'favorites'),
 (0.7652723140837316, 'perfect'),
 (0.7687845933768193, 'terrific'),
 (0.7790393174332657, 'superior'),
 (0.7899479486213603, 'delightful'),
 (0.79333915984707, 'delicious'),
 (0.7987454520545216, 'greatest'),
 (0.799646764551584, 'best'),
 (0.8203712379887839, 'wonderful'),
 (0.8256225081253383, 'incredible'),
 (0.8288102951184073, 'excellent'),
 (0.8459818953481701, 'exactly'),
 (0.8565905598371653, 'hooked'),
 (0.8651795810606647, 'amazing'),
 (0.8779203539424978, 'fantastic'),
 (0.8787898613710655, 'delish'),
 (0.880251847395135, 'superb'),
 (0.8825771710584981, 'awesome'),
 (0.8895386511014413, 'addicting'),
 (0.8934181270971598, 'fabulous'),
 (0.9119113676234888, 'perfection'),
 (0.9124099106159597, 'outstanding'),
 (0.9436851544036693, 'heaven')]

pos_summary_words = [x[1] for x in pos_summary]
pos_summary_wordId = dict(zip(pos_summary_words, range(len(pos_summary_words))))

In [26]:
def find_ngram(text, pos, neg, pos_wordId, neg_wordId):
  bag = [0] * (len(neg) + len(pos))

  unigrams = text.split()
  for w in unigrams:
    if w in neg:
      bag[neg_wordId[w]] += 1
    
    if w in pos:
      bag[len(neg) + pos_wordId[w]] += 1
  
  return bag

def new_feature(datum):
    feat = [1] 

    t = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
    feat += find_ngram(t, pos_text_words, neg_text_words, pos_text_wordId, neg_text_wordId)
    
    s = ''.join([c for c in datum['review/summary'].lower() if not c in punctuation])
    feat += find_ngram(s, pos_summary_words, neg_summary_words, pos_summary_wordId, neg_summary_wordId)

    return feat

def new_feature_no_summary(datum):
    feat = [1] 

    t = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
    feat += find_ngram(t, pos_text_words, neg_text_words, pos_text_wordId, neg_text_wordId)
    return feat

def new_feature_no_text(datum):
    feat = [1] 

    s = ''.join([c for c in datum['review/summary'].lower() if not c in punctuation])
    feat += find_ngram(s, pos_summary_words, neg_summary_words, pos_summary_wordId, neg_summary_wordId)
    return feat

In [20]:
# New Feature w/ Text And w/o Summary
X = [new_feature_no_summary(d) for d in train_data]
y = [float(d['review/score']) for d in train_data]

clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_

predictions = clf.predict(X)
print(f'The MSE of the new model with both Text mining on training: {MSE(predictions, y)}')

X_valid = [new_feature_no_summary(d) for d in valid_data]
y_valid = [float(d['review/score']) for d in valid_data]

predictions_valid = clf.predict(X_valid)
print(f'The MSE of the new model with both Text mining on validation: {MSE(predictions_valid, y_valid)}')

X_test = [new_feature_no_summary(d) for d in test_data]
y_test = [float(d['review/score']) for d in test_data]

predictions_test = clf.predict(X_test)
print(f'The MSE of the new model with both Text mining on test: {MSE(predictions_test, y_test)}')

The MSE of the new model with both Text mining on training: 1.148433013739955
The MSE of the new model with both Text mining on validation: 1.173794618316245
The MSE of the new model with both Text mining on test: 1.1644870705603934


In [28]:
# New Feature w/o Text And w/ Summary
X = [new_feature_no_text(d) for d in train_data]
y = [float(d['review/score']) for d in train_data]

clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_

predictions = clf.predict(X)
print(f'The MSE of the new model with Summary mining on training: {MSE(predictions, y)}')

X_valid = [new_feature_no_text(d) for d in valid_data]
y_valid = [float(d['review/score']) for d in valid_data]

predictions_valid = clf.predict(X_valid)
print(f'The MSE of the new model with Summary mining on validation: {MSE(predictions_valid, y_valid)}')

X_test = [new_feature_no_text(d) for d in test_data]
y_test = [float(d['review/score']) for d in test_data]

predictions_test = clf.predict(X_test)
print(f'The MSE of the new model with Summary mining on test: {MSE(predictions_test, y_test)}')

The MSE of the new model with Summary mining on training: 1.0838806790803213
The MSE of the new model with Summary mining on validation: 1.0970519799513556
The MSE of the new model with Summary mining on test: 1.0970493380687834


In [22]:
# New Feature w/ Text And Summary
X = [new_feature(d) for d in train_data]
y = [float(d['review/score']) for d in train_data]

clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_

predictions = clf.predict(X)
print(f'The MSE of the new model with both Text and Summary mining on training: {MSE(predictions, y)}')

X_valid = [new_feature(d) for d in valid_data]
y_valid = [float(d['review/score']) for d in valid_data]

predictions_valid = clf.predict(X_valid)
print(f'The MSE of the new model with both Text and Summary mining on validation: {MSE(predictions_valid, y_valid)}')

X_test = [new_feature(d) for d in test_data]
y_test = [float(d['review/score']) for d in test_data]

predictions_test = clf.predict(X_test)
print(f'The MSE of the new model with both Text and Summary mining on test: {MSE(predictions_test, y_test)}')

The MSE of the new model with both Text and Summary mining on training: 0.8829975830794127
The MSE of the new model with both Text and Summary mining on validation: 0.9015895336341492
The MSE of the new model with both Text and Summary mining on test: 0.8952940661849451


In [25]:
unigramWords200 = [x[1] for x in unigramCounts[:200]]
unigramWordId200 = dict(zip(unigramWords, range(len(unigramWords))))
unigramWordSet200 = set(unigramWords)

# Unigram 200
X = [feature(d, unigramWords200, unigramWordId200) for d in train_data]
y = [float(d['review/score']) for d in train_data]

clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_

predictions = clf.predict(X)
print(f'The MSE of the unigram 200 model on training: {MSE(predictions, y)}')

X_valid = [feature(d, unigramWords200, unigramWordId200) for d in valid_data]
y_valid = [float(d['review/score']) for d in valid_data]

predictions_valid = clf.predict(X_valid)
print(f'The MSE of the unigram 200 model on validation: {MSE(predictions_valid, y_valid)}')

X_test = [feature(d, unigramWords200, unigramWordId200) for d in test_data]
y_test = [float(d['review/score']) for d in test_data]

predictions_test = clf.predict(X_test)
print(f'The MSE of the unigram 200 model on test: {MSE(predictions_test, y_test)}')


The MSE of the unigram 200 model on training: 1.3220192627153153
The MSE of the unigram 200 model on validation: 1.3374376780449913
The MSE of the unigram 200 model on test: 1.3297725042005872


In [29]:
# Linear Regression
X = [new_feature(d) for d in train_data]
y = [float(d['review/score']) for d in train_data]

clf = linear_model.LinearRegression()
clf.fit(X, y)
theta = clf.coef_

predictions = clf.predict(X)
print(f'The MSE of the best model with Linear Regression on training: {MSE(predictions, y)}')

X_valid = [new_feature(d) for d in valid_data]
y_valid = [float(d['review/score']) for d in valid_data]

predictions_valid = clf.predict(X_valid)
print(f'The MSE of the best model with Linear Regression on validation: {MSE(predictions_valid, y_valid)}')

X_test = [new_feature(d) for d in test_data]
y_test = [float(d['review/score']) for d in test_data]

predictions_test = clf.predict(X_test)
print(f'The MSE of the best model with Linear Regression on test: {MSE(predictions_test, y_test)}')

The MSE of the best model with Linear Regression on training: 0.8829952469961759
The MSE of the best model with Linear Regression on validation: 0.901599790092361
The MSE of the best model with Linear Regression on test: 0.8952994143797629


In [31]:
# Trying different constants for regularization strength
reg_consts = [0.01, 0.1, 1, 10, 100, 1000]

best_reg = None
for r in reg_consts:
  # New Feature w/ Text And Summary
  X = [new_feature(d) for d in train_data]
  y = [float(d['review/score']) for d in train_data]

  clf = linear_model.Ridge(r, fit_intercept=False) # MSE + 1.0 l2
  clf.fit(X, y)
  theta = clf.coef_

  X_valid = [new_feature(d) for d in valid_data]
  y_valid = [float(d['review/score']) for d in valid_data]

  predictions_valid = clf.predict(X_valid)
  print(f'The MSE of the best model with ridge regression const {r} on validation: {MSE(predictions_valid, y_valid)}')

The MSE of the best model with ridge regression const 0.01 on validation: 0.9015990585286275
The MSE of the best model with ridge regression const 0.1 on validation: 0.9015979249228075
The MSE of the best model with ridge regression const 1 on validation: 0.9015895336341492
The MSE of the best model with ridge regression const 10 on validation: 0.9017615333965223
The MSE of the best model with ridge regression const 100 on validation: 0.9127789457201247
The MSE of the best model with ridge regression const 1000 on validation: 0.9928541131826174


In [32]:
# Both With Capped Values, Rounded Values, And Both Capped and Rounded Values
X = [new_feature(d) for d in train_data]
y = [float(d['review/score']) for d in train_data]

clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_

predictions = clf.predict(X)

X_valid = [new_feature(d) for d in valid_data]
y_valid = [float(d['review/score']) for d in valid_data]

predictions_valid = clf.predict(X_valid)

X_test = [new_feature(d) for d in test_data]
y_test = [float(d['review/score']) for d in test_data]

predictions_test = clf.predict(X_test)

def Cap(n):
    if n > 5:
      return 5
    if n < 1:
      return 1
    return n

def Round(n):
    return round(n)

capped_predict = list(map(Cap, predictions))
capped_predict_valid = list(map(Cap, predictions_valid))
capped_predict_test = list(map(Cap, predictions_test))

rounded_predict = list(map(Round, predictions))
rounded_predict_valid = list(map(Round, predictions_valid))
rounded_predict_test = list(map(Round, predictions_test))

capped_and_rounded_predict = list(map(Round, capped_predict))
capped_and_rounded_predict_valid = list(map(Round, capped_predict_valid))
capped_and_rounded_predict_test = list(map(Round, capped_predict_test))

print(f'The MSE of the model using capped train: {MSE(capped_predict, y)}')
print(f'The MSE of the model using capped valid: {MSE(capped_predict_valid, y_valid)}')
print(f'The MSE of the model using capped test: {MSE(capped_predict_test, y_test)}')

print(f'The MSE of the model using rounded train: {MSE(rounded_predict, y)}')
print(f'The MSE of the model using rounded valid: {MSE(rounded_predict_valid, y_valid)}')
print(f'The MSE of the model using rounded test: {MSE(rounded_predict_test, y_test)}')

print(f'The MSE of the model using capped and rounded train: {MSE(capped_and_rounded_predict, y)}')
print(f'The MSE of the model using capped and rounded valid: {MSE(capped_and_rounded_predict_valid, y_valid)}')
print(f'The MSE of the model using capped and rounded test: {MSE(capped_and_rounded_predict_test, y_test)}')

The MSE of the model using capped train: 0.8480305811078144
The MSE of the model using capped valid: 0.8666464596108248
The MSE of the model using capped test: 0.8608320287972824
The MSE of the model using rounded train: 0.974899734738405
The MSE of the model using rounded valid: 0.9969399856957911
The MSE of the model using rounded test: 0.9941334637008067
The MSE of the model using capped and rounded train: 0.9388947735317816
The MSE of the model using capped and rounded valid: 0.9602831871225906
The MSE of the model using capped and rounded test: 0.957938383262265


In [39]:
# With 5 Classifiers
X = [new_feature(d) for d in train_data]
y_1 = [int(float(d['review/score'])) == 1 for d in train_data]
y_2 = [int(float(d['review/score'])) == 2 for d in train_data]
y_3 = [int(float(d['review/score'])) == 3 for d in train_data]
y_4 = [int(float(d['review/score'])) == 4 for d in train_data]
y_5 = [int(float(d['review/score'])) == 5 for d in train_data]

mod_1 = linear_model.LogisticRegression()
mod_1.fit(X, y_1)

mod_2 = linear_model.LogisticRegression()
mod_2.fit(X, y_2)

mod_3 = linear_model.LogisticRegression()
mod_3.fit(X, y_3)

mod_4 = linear_model.LogisticRegression()
mod_4.fit(X, y_4)

mod_5 = linear_model.LogisticRegression()
mod_5.fit(X, y_5)

prediction = [
              mod_1.predict_proba(X)[:,1], 
              mod_2.predict_proba(X)[:,1],
              mod_3.predict_proba(X)[:,1],
              mod_4.predict_proba(X)[:,1],
              mod_5.predict_proba(X)[:,1]
]

X_valid = [new_feature(d) for d in valid_data]
y_valid = [float(d['review/score']) for d in valid_data]

prediction_valid = [
              mod_1.predict_proba(X_valid)[:,1], 
              mod_2.predict_proba(X_valid)[:,1],
              mod_3.predict_proba(X_valid)[:,1],
              mod_4.predict_proba(X_valid)[:,1],
              mod_5.predict_proba(X_valid)[:,1]
]

X_test = [new_feature(d) for d in test_data]
y_test = [float(d['review/score']) for d in test_data]

prediction_test = [
              mod_1.predict_proba(X_test)[:,1], 
              mod_2.predict_proba(X_test)[:,1],
              mod_3.predict_proba(X_test)[:,1],
              mod_4.predict_proba(X_test)[:,1],
              mod_5.predict_proba(X_test)[:,1]
]

final_pred = [0] * len(y)
final_pred_valid = [0] * len(y_valid)
final_pred_test = [0] * len(y_test)

for i in range(len(y)):
  current_max = (1, prediction[0][i])
  for j in range(len(prediction)):
    if prediction[j][i] > current_max[1]:
      current_max = (j+1, prediction[j][i])
  final_pred[i] = current_max[0]

for i in range(len(y_valid)):
  current_max = (1, prediction_valid[0][i])
  for j in range(len(prediction_valid)):
    if prediction_valid[j][i] > current_max[1]:
      current_max = (j+1, prediction_valid[j][i])
  final_pred_valid[i] = current_max[0]

for i in range(len(y_test)):
  current_max = (1, prediction_test[0][i])
  for j in range(len(prediction_test)):
    if prediction_test[j][i] > current_max[1]:
      current_max = (j+1, prediction_test[j][i])
  final_pred_test[i] = current_max[0]

print(f'The MSE of the 5 classifier model is train: {MSE(final_pred, y)}')
print(f'The MSE of the 5 classifier model is valid: {MSE(final_pred_valid, y_valid)}')
print(f'The MSE of the 5 classifier model is test: {MSE(final_pred_test, y_test)}')

KeyboardInterrupt: ignored

In [49]:
# Old Baseline on Test
X = [[1, len(d['review/text'])] for d in train_data]
y = [float(d['review/score']) for d in train_data]

mod = linear_model.LinearRegression()
mod.fit(X, y)

def MSE(predictions, labels):
  differences = [(x - y) ** 2 for x, y in zip(predictions, labels)]
  return sum(differences) / len(differences)

X_valid = [[1, len(d['review/text'])] for d in valid_data]
y_valid = [float(d['review/score']) for d in valid_data]

X_test = [[1, len(d['review/text'])] for d in test_data]
y_test = [float(d['review/score']) for d in test_data]

predictions = mod.predict(X)
predictions_valid = mod.predict(X_valid)
predictions_test = mod.predict(X_test)
print(f'The Baseline Model MSE is train: {MSE(predictions, y)}')
print(f'The Baseline Model MSE is valid: {MSE(predictions_valid, y_valid)}')
print(f'The Baseline Model MSE is test: {MSE(predictions_test, y_test)}')

The Baseline Model MSE is train: 1.7060384205759391
The Baseline Model MSE is valid: 1.7306404169234044
The Baseline Model MSE is test: 1.7276612907564608


In [48]:
# New Baseline on Test
glb_mean = sum(float(d['review/score']) for d in train_data) / len(train_data)
print(glb_mean)

y = [float(d['review/score']) for d in train_data]
y_valid = [float(d['review/score']) for d in valid_data]
y_test = [float(d['review/score']) for d in test_data]

print(f'The Baseline Model MSE is train: {MSE([glb_mean] * len(y), y)}')
print(f'The Baseline Model MSE is valid: {MSE([glb_mean] * len(y_valid), y_valid)}')
print(f'The Baseline Model MSE is test: {MSE([glb_mean] * len(y_test), y_test)}')

4.175339724960844
The Baseline Model MSE is train: 1.7178142433412389
The Baseline Model MSE is valid: 1.739734418677278
The Baseline Model MSE is test: 1.738211814923221


In [46]:
print(len(train_data))
print(len(valid_data))
print(len(test_data))

220914
110457
110457
