In [2]:
import numpy
import urllib
import scipy.optimize
import random
from collections import defaultdict
import nltk # natural Language tool
import string # various string operations
from nltk.stem.porter import * # stemmer
from sklearn import linear_model
import ast
import gensim
from gensim.models import Word2Vec # Language model (probably in a couple of weeks...)

In [4]:
def parseData(fname):
    """Stream records from a URL, one Python-literal value per line.

    Parameters
    ----------
    fname : str
        URL to read from (anything ``urllib.request.urlopen`` accepts,
        e.g. ``http://``, ``https://`` or ``file://``).

    Yields
    ------
    object
        The result of ``ast.literal_eval`` on each non-blank line.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a line is not a valid
        Python literal.
    """
    # Imported here because a bare ``import urllib`` does not guarantee that
    # the ``urllib.request`` submodule is loaded on Python 3.
    import urllib.request

    # ``urllib.urlopen`` only exists on Python 2; ``urllib.request.urlopen``
    # is the Python 3 equivalent. The ``with`` block closes the connection
    # once the generator is exhausted or garbage-collected.
    with urllib.request.urlopen(fname) as response:
        for l in response:
            # The response yields bytes, which literal_eval does not accept.
            # NOTE(review): assumes the payload is UTF-8 encoded - confirm.
            line = l.decode("utf-8").strip()
            if line:  # tolerate blank / trailing empty lines
                yield ast.literal_eval(line)

In [5]:
def parseDataFromFile(fname):
    """Stream records from a local file, one Python-literal value per line.

    Parameters
    ----------
    fname : str
        Path of the file to read.

    Yields
    ------
    object
        The result of ``ast.literal_eval`` on each non-blank line.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a line is not a valid
        Python literal.
    """
    # The ``with`` block guarantees the file handle is released once the
    # generator is exhausted or closed, instead of leaking it.
    with open(fname) as f:
        for l in f:
            line = l.strip()
            if line:  # a blank or trailing empty line is not a record
                yield ast.literal_eval(line)

In [9]:
# Read every record of the local file into memory. parseDataFromFile is a
# generator, so list() is what actually drives the read. Despite the .json
# extension, each line is parsed as a Python literal (ast.literal_eval).
data_ = list(parseDataFromFile("beer_50000.json"))

In [10]:
# Work on the first 5000 records only; the full list stays available as data_.
# NOTE(review): presumably a subset to keep the cells below fast - confirm.
data = data_[:5000]

In [11]:
# Inspect one record to see which fields are available.
data[0]

{'review/appearance': 2.5,
 'beer/style': 'Hefeweizen',
 'review/palate': 1.5,
 'review/taste': 1.5,
 'beer/name': 'Sausa Weizen',
 'review/timeUnix': 1234817823,
 'beer/ABV': 5.0,
 'beer/beerId': '47986',
 'beer/brewerId': '10325',
 'review/timeStruct': {'isdst': 0,
  'mday': 16,
  'hour': 20,
  'min': 57,
  'sec': 3,
  'mon': 2,
  'year': 2009,
  'yday': 47,
  'wday': 0},
 'review/overall': 1.5,
 'review/text': 'A lot of foam. But a lot.\tIn the smell some banana, and then lactic and tart. Not a good start.\tQuite dark orange in color, with a lively carbonation (now visible, under the foam).\tAgain tending to lactic sourness.\tSame for the taste. With some yeast and banana.',
 'user/profileName': 'stcules',
 'review/aroma': 2.0}

In [44]:
# Accumulators for the word tally:
#   totalWords - number of word tokens seen so far (repeats included)
#   wordCount  - word -> number of occurrences; unseen words read as 0
totalWords = 0
wordCount = defaultdict(int)

In [45]:
# Characters to strip from the review text before splitting it into words.
punct = string.punctuation
# Porter stemmer (name comes from the `nltk.stem.porter` star import).
# Its only use in the counting loop is currently commented out.
stemmer = PorterStemmer()

In [46]:
# Show exactly which characters will be removed from the text.
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [47]:
# Tally word frequencies over all selected reviews.
# Both accumulators are re-created here so that re-running this cell starts
# from zero instead of adding a second pass on top of the previous counts.
wordCount = defaultdict(int)
totalWords = 0
# Translation table that deletes every character listed in `punct`;
# built once rather than testing each character against the string.
punctTable = str.maketrans('', '', punct)
for d in data:
    # Normalise the text: lower-case, strip punctuation, split on whitespace.
    r = d['review/text'].lower().translate(punctTable)
    words = r.split()
    for w in words:
        # Stemming is switched off; enable the next line to count stems
        # instead of surface forms.
        # w = stemmer.stem(w)
        totalWords += 1
        wordCount[w] += 1

In [48]:
# Total number of word tokens counted across the reviews (repeats included).
totalWords

636392

In [49]:
# Number of distinct words seen.
len(wordCount)

19426

In [50]:
# Spot-check the count of a very common word.
# Caution: wordCount is a defaultdict, so looking up a word that never
# occurred inserts it with count 0 and increases len(wordCount).
wordCount['the']

27569

In [51]:
# Turn the tally into a list of (count, word) pairs. The count comes first
# so that ordering the pairs orders them by frequency.
counts = [(n, word) for word, n in wordCount.items()]

In [52]:
# Preview the first ten pairs. Nothing has been sorted yet, so they appear
# in the dict's iteration order.
counts[:10]

[(30695, 'a'),
 (556, 'lot'),
 (15935, 'of'),
 (389, 'foam'),
 (6836, 'but'),
 (7033, 'in'),
 (27569, 'the'),
 (1771, 'smell'),
 (3946, 'some'),
 (151, 'banana')]

In [53]:
# Sort in place, most frequent first. Tuples compare by count and then by
# word, so words with equal counts end up in reverse alphabetical order.
# Because this mutates `counts`, any preview displayed earlier is now stale.
counts.sort(reverse=True)

In [56]:
# The ten most frequent words after sorting.
counts[:10]

[(30695, 'a'),
 (27569, 'the'),
 (19512, 'and'),
 (15935, 'of'),
 (12623, 'is'),
 (11298, 'with'),
 (9466, 'to'),
 (9068, 'this'),
 (8471, 'i'),
 (8144, 'it')]

In [57]:
# Extract top words
# nWords is the vocabulary size: how many of the most frequent words to keep.
nWords = 1000

In [58]:
# Keep just the word from each of the first nWords (count, word) pairs.
# These are the most frequent words only if `counts` has already been
# sorted in descending order.
words = [word for _, word in counts[:nWords]]

In [59]:
# Last pair that still makes the cut. Indexed via nWords rather than a
# literal so this stays correct if the vocabulary size is changed.
counts[nWords - 1]

(59, 'juice')

In [61]:
# The next 100 pairs, i.e. the most frequent words that were NOT kept.
# Sliced relative to nWords so it tracks the vocabulary size.
# If the first of these has the same count as counts[nWords - 1], the cut
# falls inside a run of tied words, and which of them are kept is decided
# only by the reverse-alphabetical tiebreak of the tuple sort.
counts[nWords:nWords + 100]

[(59, 'cherries'),
 (59, 'barleywine'),
 (59, 'appropriate'),
 (59, 'apparent'),
 (59, 'already'),
 (58, 'thinner'),
 (58, 'sugars'),
 (58, 'particularly'),
 (58, 'months'),
 (58, 'hopping'),
 (58, 'frogs'),
 (58, 'forms'),
 (58, 'disappointing'),
 (58, 'cloves'),
 (58, 'area'),
 (57, 'terms'),
 (57, 'picking'),
 (57, 'job'),
 (57, 'inviting'),
 (57, 'heineken'),
 (57, 'disappointed'),
 (57, 'combination'),
 (57, 'bittering'),
 (57, 'barrelaged'),
 (56, 'smokiness'),
 (56, 'sitting'),
 (56, 'expensive'),
 (56, 'buds'),
 (56, 'aspect'),
 (56, 'addition'),
 (55, 'warmer'),
 (55, 'temperature'),
 (55, 'rounded'),
 (55, 'release'),
 (55, 'produced'),
 (55, 'none'),
 (55, 'manalishi'),
 (55, 'heavier'),
 (55, 'ends'),
 (55, 'deal'),
 (55, 'char'),
 (55, 'blast'),
 (54, 'velvety'),
 (54, 'us'),
 (54, 'tinge'),
 (54, 'thicker'),
 (54, 'slightest'),
 (54, 'single'),
 (54, 'lively'),
 (54, 'detected'),
 (54, 'depth'),
 (54, 'choice'),
 (54, 'brought'),
 (54, '94'),
 (53, 'whisky'),
 (53, 'somew