In [1]:
import numpy
import urllib
import scipy.optimize
import random
from collections import defaultdict
import nltk # natural Language toolkit
import string # various string operations
from nltk.stem.porter import * # stemmer
from sklearn import linear_model
import ast
import gensim 
from gensim.models import Word2Vec # Language model (probably in a couple of weeks...)

In [2]:
def parseData(fname):
    for l in urllib.urlopen(fname):
        yield ast.literal_eval(l)

In [3]:
def parseDataFromFile(fname):
    for l in open(fname):
        yield ast.literal_eval(l)

In [7]:
data_ = list(parseDataFromFile("/Users/youzezheng/Desktop/CSE158-UCSD/input/beer_50000.json"))

In [8]:
data = data_[:5000]

In [9]:
data[0]

{'review/appearance': 2.5,
 'beer/style': 'Hefeweizen',
 'review/palate': 1.5,
 'review/taste': 1.5,
 'beer/name': 'Sausa Weizen',
 'review/timeUnix': 1234817823,
 'beer/ABV': 5.0,
 'beer/beerId': '47986',
 'beer/brewerId': '10325',
 'review/timeStruct': {'isdst': 0,
  'mday': 16,
  'hour': 20,
  'min': 57,
  'sec': 3,
  'mon': 2,
  'year': 2009,
  'yday': 47,
  'wday': 0},
 'review/overall': 1.5,
 'review/text': 'A lot of foam. But a lot.\tIn the smell some banana, and then lactic and tart. Not a good start.\tQuite dark orange in color, with a lively carbonation (now visible, under the foam).\tAgain tending to lactic sourness.\tSame for the taste. With some yeast and banana.',
 'user/profileName': 'stcules',
 'review/aroma': 2.0}

In [10]:
# Count words
wordCount = defaultdict(int)
totalWords = 0

In [11]:
punct = string.punctuation
stemmer = PorterStemmer()

In [12]:
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [14]:
data[0]['review/text'].split() # separated by space/tab/newLine characters

['A',
 'lot',
 'of',
 'foam.',
 'But',
 'a',
 'lot.',
 'In',
 'the',
 'smell',
 'some',
 'banana,',
 'and',
 'then',
 'lactic',
 'and',
 'tart.',
 'Not',
 'a',
 'good',
 'start.',
 'Quite',
 'dark',
 'orange',
 'in',
 'color,',
 'with',
 'a',
 'lively',
 'carbonation',
 '(now',
 'visible,',
 'under',
 'the',
 'foam).',
 'Again',
 'tending',
 'to',
 'lactic',
 'sourness.',
 'Same',
 'for',
 'the',
 'taste.',
 'With',
 'some',
 'yeast',
 'and',
 'banana.']

### 1. How many words are there? ~36K

In [15]:
for d in data:
    r = d['review/text']
    words = r.split()
    for w in words:
        totalWords += 1
        wordCount[w] += 1

In [16]:
totalWords

640169

This is saying we have roughly 100 words per review on average.

In [18]:
len(wordCount) # 36K unique words

36225

Every review we saw, we have 7-8 new words we have never seen before.

In [20]:
wordCount['the'] # across 5000 reviews

22439

In [22]:
wordCount['The']

5081

In [23]:
wordCount['great']

800

In [24]:
wordCount['the?']

0

### 2. What if we remove capitalization/punctuation? ~19K

In [27]:
# Count words
wordCount = defaultdict(int)
totalWords = 0

In [28]:
for d in data:
    r = d['review/text']
    r = r.lower()
    r = ''.join(c for c in r if (c not in punct))
    words = r.split()
    for w in words:
        totalWords += 1
        wordCount[w] += 1

In [29]:
totalWords

636392

In [30]:
len(wordCount)

19426

### 3. What if we merge different inflections of words? ~15K

* drinks   -> drink
* drinking -> drink
* drinker  -> drink

* argue   -> argu
* arguing -> argu
* argues  -> argu
* arguing -> argu
* argus   -> argu

In [31]:
# Count words
wordCount = defaultdict(int)
totalWords = 0

In [32]:
for d in data:
    r = d['review/text']
    r = r.lower()
    r = ''.join(c for c in r if (c not in punct))
    words = r.split()
    for w in words:
        w = stemmer.stem(w)
        totalWords += 1
        wordCount[w] += 1

In [33]:
totalWords

636392

In [34]:
len(wordCount)

14847

* Stemming is **critical** for retrieval-type applications (e.g. we want Google to return pages with the word "cat" when we search for "cats")
* Words like "waste" and "wasted" may have different meanings (in beer reviews), and we're throwing that away by stemming.

### 4. Just discard extremely rare words...
* pretty unsatisfying but at least we can get to some inference now!

In [35]:
# Get back to the second technique without stemmer

# Count words
wordCount = defaultdict(int)
totalWords = 0

for d in data:
    r = d['review/text']
    r = r.lower()
    r = ''.join(c for c in r if (c not in punct))
    words = r.split()
    for w in words:
        totalWords += 1
        wordCount[w] += 1

In [36]:
# Build vector of word counts
counts = [(wordCount[w], w) for w in wordCount]

In [37]:
counts[:10] # look at data structure

[(30695, 'a'),
 (556, 'lot'),
 (15935, 'of'),
 (389, 'foam'),
 (6836, 'but'),
 (7033, 'in'),
 (27569, 'the'),
 (1771, 'smell'),
 (3946, 'some'),
 (151, 'banana')]

In [38]:
counts.sort(reverse=True)

In [39]:
counts[:10]

[(30695, 'a'),
 (27569, 'the'),
 (19512, 'and'),
 (15935, 'of'),
 (12623, 'is'),
 (11298, 'with'),
 (9466, 'to'),
 (9068, 'this'),
 (8471, 'i'),
 (8144, 'it')]

In [98]:
counts[5000:5010]

[(4, 'mist'),
 (4, 'minuscule'),
 (4, 'minimum'),
 (4, 'minimally'),
 (4, 'minerally'),
 (4, 'midtaste'),
 (4, 'midpoint'),
 (4, 'michelob'),
 (4, 'mgd'),
 (4, 'message')]

In [99]:
# Extract top words
nWords = 1000

In [100]:
words = [w[1] for w in counts[:nWords]]

In [102]:
words[:10]

['a', 'the', 'and', 'of', 'is', 'with', 'to', 'this', 'i', 'it']

In [86]:
counts[999]

(59, 'juice')

In [87]:
counts[1000]

(59, 'cherries')

In [88]:
counts[1000:1100]

[(59, 'cherries'),
 (59, 'barleywine'),
 (59, 'appropriate'),
 (59, 'apparent'),
 (59, 'already'),
 (58, 'thinner'),
 (58, 'sugars'),
 (58, 'particularly'),
 (58, 'months'),
 (58, 'hopping'),
 (58, 'frogs'),
 (58, 'forms'),
 (58, 'disappointing'),
 (58, 'cloves'),
 (58, 'area'),
 (57, 'terms'),
 (57, 'picking'),
 (57, 'job'),
 (57, 'inviting'),
 (57, 'heineken'),
 (57, 'disappointed'),
 (57, 'combination'),
 (57, 'bittering'),
 (57, 'barrelaged'),
 (56, 'smokiness'),
 (56, 'sitting'),
 (56, 'expensive'),
 (56, 'buds'),
 (56, 'aspect'),
 (56, 'addition'),
 (55, 'warmer'),
 (55, 'temperature'),
 (55, 'rounded'),
 (55, 'release'),
 (55, 'produced'),
 (55, 'none'),
 (55, 'manalishi'),
 (55, 'heavier'),
 (55, 'ends'),
 (55, 'deal'),
 (55, 'char'),
 (55, 'blast'),
 (54, 'velvety'),
 (54, 'us'),
 (54, 'tinge'),
 (54, 'thicker'),
 (54, 'slightest'),
 (54, 'single'),
 (54, 'lively'),
 (54, 'detected'),
 (54, 'depth'),
 (54, 'choice'),
 (54, 'brought'),
 (54, '94'),
 (53, 'whisky'),
 (53, 'somew

In [103]:
wordID = dict(zip(words, range(nWords)))
wordSet = set(words)

In [90]:
wordID["great"]

88

In [104]:
wordID['murky']

997

### Train a regressor

In [204]:
def feature(datum):
    bow = [0]*nWords # textbook shows how to use sparse matrix
    r = datum['review/text']
    r = r.lower()
    r = ''.join(c for c in r if c not in punct)
    ws = r.strip().split()
    for w in ws:
        if w in wordSet:
            bow[wordID[w]] += 1
    return bow + [1] # with the offset term at the end so word and weight match index

In [205]:
X = [feature(d) for d in data]
y = [d['review/overall'] for d in data]

In [107]:
X[0]

[4,
 3,
 3,
 1,
 0,
 2,
 1,
 0,
 0,
 0,
 2,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 2,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [108]:
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + L2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)

In [111]:
theta[:20]

array([-0.0016723 ,  0.00288483, -0.00804013, -0.00051687, -0.00021117,
        0.00626303, -0.00246684, -0.00060558,  0.00369268,  0.01251837,
        0.01318618, -0.00180721, -0.00229857,  0.00011039,  0.01874699,
        0.06749427, -0.03537173,  0.00259992,  0.01630909,  0.00045103])

In [141]:
weights = list(zip(words + ["offset"], theta))

In [142]:
weights

[('a', -0.0016723019917445298),
 ('the', 0.00288482724430521),
 ('and', -0.008040125740582611),
 ('of', -0.0005168689483639612),
 ('is', -0.00021117082786689204),
 ('with', 0.006263030371554593),
 ('to', -0.002466840592418805),
 ('this', -0.0006055839936335295),
 ('i', 0.0036926765461781203),
 ('it', 0.012518367641352925),
 ('in', 0.013186181160550876),
 ('but', -0.0018072123143477255),
 ('beer', -0.0022985709892135065),
 ('that', 0.00011039493361541513),
 ('very', 0.01874698732545352),
 ('head', 0.06749427065446312),
 ('not', -0.035371729293846195),
 ('as', 0.0025999222994959625),
 ('for', 0.016309089863828445),
 ('on', 0.0004510268300275868),
 ('some', -0.018536492731756903),
 ('was', -0.011416824454330727),
 ('taste', -0.014872046300105982),
 ('nice', 0.04862938618229333),
 ('good', 0.04296364737113438),
 ('hops', 0.009783990197211418),
 ('light', -0.0048497821128991635),
 ('malt', 0.0068494105874552435),
 ('like', -0.009493580657509462),
 ('one', -0.015701951033398005),
 ('from', 0

In [143]:
weightD = dict(weights)

In [144]:
weightD

{'a': -0.0016723019917445298,
 'the': 0.00288482724430521,
 'and': -0.008040125740582611,
 'of': -0.0005168689483639612,
 'is': -0.00021117082786689204,
 'with': 0.006263030371554593,
 'to': -0.002466840592418805,
 'this': -0.0006055839936335295,
 'i': 0.0036926765461781203,
 'it': 0.012518367641352925,
 'in': 0.013186181160550876,
 'but': -0.0018072123143477255,
 'beer': -0.0022985709892135065,
 'that': 0.00011039493361541513,
 'very': 0.01874698732545352,
 'head': 0.06749427065446312,
 'not': -0.035371729293846195,
 'as': 0.0025999222994959625,
 'for': 0.016309089863828445,
 'on': 0.0004510268300275868,
 'some': -0.018536492731756903,
 'was': -0.011416824454330727,
 'taste': -0.014872046300105982,
 'nice': 0.04862938618229333,
 'good': 0.04296364737113438,
 'hops': 0.009783990197211418,
 'light': -0.0048497821128991635,
 'malt': 0.0068494105874552435,
 'like': -0.009493580657509462,
 'one': -0.015701951033398005,
 'from': 0.021107192566927814,
 'its': 0.01756388925028579,
 'at': -0.0

In [131]:
# Get list of weights
weights = list(zip(theta, words + ["offset"]))

In [132]:
weights.sort()

In [133]:
weights[-10:]

[(0.1559141004754918, 'easy'),
 (0.15603351131457754, 'keeps'),
 (0.16107702547600733, 'easily'),
 (0.16924664116619556, 'drank'),
 (0.17783412437243887, 'impressed'),
 (0.1785401820456934, 'summer'),
 (0.18035500950870711, 'wonderful'),
 (0.24608779528359034, 'always'),
 (0.24811453474900624, 'exceptional'),
 (3.505465514082514, 'offset')]

In [134]:
weights[:10]

[(-0.3794498090380151, 'skunk'),
 (-0.32472279919832603, 'skunky'),
 (-0.32183754597527015, 'bland'),
 (-0.2840036695545637, 'oh'),
 (-0.24921539254478614, 'water'),
 (-0.21551919066011638, 'visible'),
 (-0.2019952880495691, 'straw'),
 (-0.1893003552381132, 'flat'),
 (-0.183200966104203, 'unfortunately'),
 (-0.18097582750605248, 'recommend')]

In [136]:
wordID['exceptional']

990

In [137]:
wordCount['exceptional']

60

In [145]:
weightD['and']

-0.008040125740582611

In [146]:
weightD['it']

0.012518367641352925

In [147]:
weightD['this']

-0.0006055839936335295

In [203]:
def feature(datum):
    bow = [0]*nWords
    r = datum['review/text'].lower()
    r = ''.join(c for c in r if c not in punct)
    ws = r.strip().split()
    for w in ws:
        if w in wordSet:
            bow[wordID[w]] += 1
    return bow + [1]

### Fix the issue that "and", "of", etc have non-zero values
* Remove stopwords (as they do not convey information)
* n-grams representation

### n-grams

In [158]:
wordCount = defaultdict(int)
totalWords = 0

In [159]:
for d in data:
    r = d['review/text'].lower()
    r = ''.join(c for c in r if c not in punct)
    ws = r.split()
    bigrams = [' '.join(a) for a in zip(ws[:-1], ws[1:])]
    trigrams = [' '.join(a) for a in zip(ws[:-2], ws[1:-1], ws[2:])]
    for w in ws + bigrams + trigrams:
        totalWords += 1
        wordCount[w] += 1

In [160]:
counts = [(wordCount[w], w) for w in wordCount]

In [161]:
counts[:10]

[(30695, 'a'),
 (556, 'lot'),
 (15935, 'of'),
 (389, 'foam'),
 (6836, 'but'),
 (7033, 'in'),
 (27569, 'the'),
 (1771, 'smell'),
 (3946, 'some'),
 (151, 'banana')]

In [162]:
counts.sort(reverse=True)

In [163]:
counts[:10]

[(30695, 'a'),
 (27569, 'the'),
 (19512, 'and'),
 (15935, 'of'),
 (12623, 'is'),
 (11298, 'with'),
 (9466, 'to'),
 (9068, 'this'),
 (8471, 'i'),
 (8144, 'it')]

In [164]:
counts[100:110]

[(1228, 'white'),
 (1225, 'flavors'),
 (1217, 'by'),
 (1205, 'coffee'),
 (1192, 'pretty'),
 (1186, 'so'),
 (1173, 'this one'),
 (1171, 'black'),
 (1148, 'brown'),
 (1126, 'or')]

In [165]:
counts[1000:1010]

[(140, 'of pine'),
 (140, 'fluffy'),
 (140, 'buy'),
 (140, 'bold'),
 (139, 'orange with'),
 (139, 'malt backbone'),
 (139, 'hops with'),
 (139, 'highly'),
 (139, 'cinnamon and'),
 (139, 'apple')]

### Train a classifier

In [206]:
y_class =[d['beer/style'] == 'Hefeweizen' for d in data]

In [207]:
sum(y_class)

86

In [208]:
from sklearn.linear_model import LogisticRegression

In [209]:
clf = LogisticRegression()

In [210]:
clf.fit(X, y_class)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [211]:
theta = clf.coef_[0]
theta

array([ 0.0191795 ,  0.01778549, -0.01344871, ..., -0.13898004,
        0.18973879,  0.0014653 ])

In [212]:
weights = list(zip(theta, words + ['constant_feat']))

In [213]:
weights.sort()

In [214]:
weights[-10:]

[(0.666445702276773, 'super'),
 (0.6957535928008107, 'fruity'),
 (0.7170097046696343, 'yeasty'),
 (0.7534865170620338, 'yeast'),
 (0.8127129612864595, 'summer'),
 (0.9449896448808793, 'goes'),
 (1.0398536100825095, 'cloudy'),
 (1.0730339089618843, 'clove'),
 (1.839341424473342, 'wheat'),
 (1.9855558873756582, 'banana')]

### Multi-class classification

In [227]:
y_class = []

In [228]:
for d in data:
    if 'Ale' in d['beer/style']: y_class.append(0)
    elif 'IPA' in d['beer/style']: y_class.append(1)
    else: y_class.append(2)

In [229]:
from sklearn.linear_model import LogisticRegression

In [230]:
clf = LogisticRegression()

In [231]:
clf.fit(X, y_class)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [232]:
theta = clf.coef_[0]

In [233]:
weights = list(zip(theta, words + ['constant_feat']))

In [234]:
weights.sort()

In [235]:
weights[-10:]

[(1.0400930386144103, 'together'),
 (1.0436791961132668, 'tart'),
 (1.0831845447176731, 'noticed'),
 (1.0920262172309316, 'caldera'),
 (1.2615273422566264, 'ale'),
 (1.2730417430534757, 'ales'),
 (1.4558550438105546, 'pumpkin'),
 (1.4563883164182359, 'wee'),
 (1.7166492991251967, 'scotch'),
 (2.5592988346185828, 'apa')]