In [13]:
import numpy as np
import pandas as pd
import string
from collections import defaultdict
import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords
from numpy.random import shuffle

# Get the Data

In [3]:
print "Reading data..."
data_df = pd.read_json("winemag-data-130k-v2.json",dtype={
    'points': np.int32,
    'price': np.float32,
})
print "done"

Reading data...
done


In [4]:
data = data_df.to_dict('records')

In [5]:
variety_num = data_df['variety'].value_counts()
print("The number of grape types used to make the wine: " + str(len(variety_num)))

The number of grape types used to make the wine: 707


In [6]:
print variety_num[:10]

Pinot Noir                  13272
Chardonnay                  11753
Cabernet Sauvignon           9472
Red Blend                    8946
Bordeaux-style Red Blend     6915
Riesling                     5189
Sauvignon Blanc              4967
Syrah                        4142
Rosé                         3564
Merlot                       3102
Name: variety, dtype: int64


In [7]:
variety_set = {u'Pinot Noir',u'Chardonnay',u'Cabernet Sauvignon',u'Red Blend',u'Bordeaux-style Red Blend',u'Riesling',u'Sauvignon Blanc',u'Syrah',u'Rosé',u'Merlot'}

In [8]:
data_new = []
for d in data:
    if d['variety'] in variety_set:
        data_new.append(d)

In [9]:
data = data_new
len(data)

71322

In [10]:
punctuation = set(string.punctuation)
number = set(['0','1','2','3','4','5','6','7','8','9'])
#stemmer = PorterStemmer()

In [12]:
wordCount = defaultdict(int)
count = 0
for d in data:
    count += 1
    r = ''.join([c for c in d['description'].lower() if not c in punctuation.union(number)])
    for w in r.split():
        #w = stemmer.stem(w) # with stemming
        if w not in set(stopwords.words('english')):
            wordCount[w] += 1
    if count%1000==0:
        print count

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000


KeyboardInterrupt: 

In [266]:
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

words = [x[1] for x in counts[0:100]]

In [267]:
words

[u'wine',
 u'flavors',
 u'fruit',
 u'aromas',
 u'palate',
 u'acidity',
 u'finish',
 u'tannins',
 u'drink',
 u'cherry',
 u'ripe',
 u'black',
 u'notes',
 u'red',
 u'spice',
 u'rich',
 u'fresh',
 u'nose',
 u'oak',
 u'berry',
 u'dry',
 u'plum',
 u'soft',
 u'fruits',
 u'blend',
 u'apple',
 u'crisp',
 u'blackberry',
 u'offers',
 u'sweet',
 u'texture',
 u'white',
 u'shows',
 u'light',
 u'citrus',
 u'dark',
 u'bright',
 u'vanilla',
 u'well',
 u'cabernet',
 u'full',
 u'pepper',
 u'juicy',
 u'fruity',
 u'good',
 u'raspberry',
 u'firm',
 u'green',
 u'peach',
 u'touch',
 u'lemon',
 u'character',
 u'chocolate',
 u'dried',
 u'balanced',
 u'pear',
 u'years',
 u'structure',
 u'sauvignon',
 u'spicy',
 u'smooth',
 u'pinot',
 u'made',
 u'concentrated',
 u'herb',
 u'tannic',
 u'also',
 u'note',
 u'herbal',
 u'tart',
 u'like',
 u'wood',
 u'flavor',
 u'hint',
 u'licorice',
 u'mineral',
 u'fine',
 u'bit',
 u'long',
 u'still',
 u'mouth',
 u'give',
 u'merlot',
 u'creamy',
 u'theres',
 u'currant',
 u'clean',
 u

In [245]:
#Description
#1. Remove punctuation
#2. Remove number
#3. Stemming
#4. Remove stop words

# Some functions

In [None]:
def cosine_similarity(x,y):
    numerator = np.dot(x,y)
    denominator = np.linalg.norm(x)*np.linalg.norm(y)
    return round(numerator/float(denominator),3)

def check_word(word):
    feat = [0]*len(data)
    for i in range(len(data)):
        r = ''.join([c for c in data[i]['description'].lower() if not c in punctuation.union(number)])
        if word in set(r.split()):
            feat[i] = 1
    return sum(feat)

def idf(word):
    return -math.log10(check_word(word)*1.0/len(data))

def tf(word,datum):
    r = ''.join([c for c in datum['description'].lower() if not c in punctuation])
    return r.split().count(word)

def tf_idf(word,datum):
    return tf(word,datum)*idf(word)


wordId = dict(zip(words, range(len(words))))
wordSet = set(words)
def feature(datum):
  feat = [0]*len(words)
  r = ''.join([c for c in d['description'].lower() if not c in punctuation.union(number)])
  for w in r.split():
    if w in words:
      feat[wordId[w]] += 1
  feat.append(1) #offset
  return feat

# Split data

In [17]:
#shuffle then split
#train, valid, test: 1/3, 1/3, 1/3
shuffle(data)
train = data[:len(data)/3]
valid = data[len(data)/3:2*len(data)/3]
test  = data[2*len(data)/3:]


{u'country': u'US',
 u'description': u"Primary, concentrated raspberry and cherry flavors pierce through the core of this rich, fruity Pinot. Subtle oak aging lends delicate vanilla, spice and nut tones, but it's an elegant, finely structured wine with crisp acidity and fine, persistent tannins. Drinks well now but should improve through 2021.",
 u'designation': u'Shared Table Farm',
 u'points': 91,
 u'price': 50.0,
 u'province': u'New York',
 u'region_1': u'North Fork of Long Island',
 u'region_2': u'Long Island',
 u'taster_name': u'Anna Lee C. Iijima',
 u'taster_twitter_handle': None,
 u'title': u'Anthony Nappa 2014 Shared Table Farm Pinot Noir (North Fork of Long Island)',
 u'variety': u'Pinot Noir',
 u'winery': u'Anthony Nappa'}

# Baseline 1：
if a comment includes words of the category name, then return that category.

In [26]:
#if a comment includes words of the category name, then return that category.
catDict = {
    u'Pinot Noir': 0,
    u'Chardonnay': 1,
    u'Cabernet Sauvignon': 2,
    u'Red Blend': 3,
    u'Bordeaux-style Red Blend': 4,
    u'Riesling': 5,
    u'Sauvignon Blanc': 6,
    u'Syrah': 7,
    u'Rosé': 8,
    u'Merlot': 9
}
baseline_predictions = []
#baseline model:
for d in test:
    words = d[u'description'].lower()
    cat = catDict[u'Pinot Noir']
    if 'pinot' in words or 'noir' in words:
        cat = catDict[u'Pinot Noir']
    if 'chardonnay' in words:
        cat = catDict[u'Chardonnay']
    if 'cabernet' in words or 'sauvignon' in words:
        cat = catDict[u'Cabernet Sauvignon']
    if 'red' in words or 'blend':
        cat = catDict[u'Red Blend']
    if 'bordeaux' in words and ('red' in words or 'blend'): #special
        cat = catDict[u'Bordeaux-style Red Blend']
    if 'riesling' in words:
        cat = catDict[u'Riesling']
    if 'sauvignon' in words or 'blanc' in words:
        cat = catDict[u'Sauvignon Blanc']
    if 'syrah' in words:
        cat = catDict[u'Syrah']
    if 'ros' in words:
        cat = catDict[u'Rosé']
    if 'merlot' in words:
        cat = catDict[u'Merlot']
    baseline_predictions.append(cat)

#evaluate baseline:
corrects = []
for d,p in zip(test,baseline_predictions):
    cat = catDict[d[u'variety']]
    corrects.append(cat==p)
acc = sum(corrects)*1.0 / len(corrects)
print acc

0.182005552284


In [28]:
corrects = []
for d,p in zip(test,baseline_predictions):
    cat = catDict[d[u'variety']]
    corrects.append(cat==0)
acc = sum(corrects)*1.0 / len(corrects)
print acc

0.187599899049


# Baseline 2:
frequency of 1000 most frequen unigrams as freature + linearSVC

In [None]:
train