In [1]:
import numpy as np
import pandas as pd
import string
from collections import defaultdict
import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords
from numpy.random import shuffle

# Get the Data

In [2]:
print "Reading data..."
data_df = pd.read_json("winemag-data-130k-v2.json",dtype={
    'points': np.int32,
    'price': np.float32,
})
print "done"

Reading data...
done


In [3]:
data = data_df.to_dict('records')

In [4]:
variety_num = data_df['variety'].value_counts()
print("The number of grape types used to make the wine: " + str(len(variety_num)))

The number of grape types used to make the wine: 707


In [5]:
print variety_num[:10]

Pinot Noir                  13272
Chardonnay                  11753
Cabernet Sauvignon           9472
Red Blend                    8946
Bordeaux-style Red Blend     6915
Riesling                     5189
Sauvignon Blanc              4967
Syrah                        4142
Rosé                         3564
Merlot                       3102
Name: variety, dtype: int64


In [6]:
variety_set = {u'Pinot Noir',u'Chardonnay',u'Cabernet Sauvignon',u'Red Blend',u'Bordeaux-style Red Blend',u'Riesling',u'Sauvignon Blanc',u'Syrah',u'Rosé',u'Merlot'}

In [7]:
data_new = []
for d in data:
    if d['variety'] in variety_set:
        data_new.append(d)

In [8]:
data = data_new
len(data)

71322

In [9]:
punctuation = set(string.punctuation)
number = set(['0','1','2','3','4','5','6','7','8','9'])
#stemmer = PorterStemmer()

# Split data

In [10]:
#shuffle then split
#train, valid, test: 1/3, 1/3, 1/3
shuffle(data)
train = data[:len(data)/3]
valid = data[len(data)/3:2*len(data)/3]
test  = data[2*len(data)/3:]

In [11]:
#Description
#1. Remove punctuation
#2. Remove number
#3. Stemming (NOT IN THIS CASE)
#4. Remove stop words

In [12]:
wordCount = defaultdict(int)
count = 0
for d in train:
    count += 1
    r = ''.join([c for c in d['description'].lower() if not c in punctuation.union(number)])
    for w in r.split():
        #w = stemmer.stem(w) # with stemming
        if w not in set(stopwords.words('english')):
            wordCount[w] += 1
    if count%10000==0:
        print count

10000
20000


In [13]:
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
words = [x[1] for x in counts[0:1000]]

In [14]:
words

[u'wine',
 u'flavors',
 u'fruit',
 u'acidity',
 u'finish',
 u'palate',
 u'aromas',
 u'tannins',
 u'cherry',
 u'drink',
 u'black',
 u'ripe',
 u'oak',
 u'red',
 u'notes',
 u'rich',
 u'dry',
 u'spice',
 u'cabernet',
 u'nose',
 u'fresh',
 u'soft',
 u'blend',
 u'fruits',
 u'blackberry',
 u'plum',
 u'texture',
 u'shows',
 u'sweet',
 u'crisp',
 u'light',
 u'berry',
 u'apple',
 u'vanilla',
 u'dark',
 u'well',
 u'sauvignon',
 u'good',
 u'offers',
 u'full',
 u'merlot',
 u'raspberry',
 u'pepper',
 u'juicy',
 u'citrus',
 u'green',
 u'fruity',
 u'chocolate',
 u'years',
 u'bright',
 u'touch',
 u'firm',
 u'lemon',
 u'character',
 u'structure',
 u'white',
 u'balanced',
 u'pinot',
 u'currant',
 u'tart',
 u'spicy',
 u'dried',
 u'smooth',
 u'syrah',
 u'vineyard',
 u'peach',
 u'tannic',
 u'concentrated',
 u'bit',
 u'also',
 u'herbal',
 u'wood',
 u'made',
 u'flavor',
 u'toast',
 u'long',
 u'fine',
 u'chardonnay',
 u'herb',
 u'new',
 u'style',
 u'balance',
 u'theres',
 u'give',
 u'age',
 u'like',
 u'still',

In [15]:
#Description
#1. Remove punctuation
#2. Remove number
#3. Stemming
#4. Remove stop words

# Some functions

In [16]:
def cosine_similarity(x,y):
    numerator = np.dot(x,y)
    denominator = np.linalg.norm(x)*np.linalg.norm(y)
    return round(numerator/float(denominator),3)

def check_word(word):
    feat = [0]*len(data)
    for i in range(len(data)):
        r = ''.join([c for c in data[i]['description'].lower() if not c in punctuation.union(number)])
        if word in set(r.split()):
            feat[i] = 1
    return sum(feat)

def idf(word):
    return -math.log10(check_word(word)*1.0/len(data))

def tf(word,datum):
    r = ''.join([c for c in datum['description'].lower() if not c in punctuation])
    return r.split().count(word)

def tf_idf(word,datum):
    return tf(word,datum)*idf(word)

def performance(predictions, y):
    correct = [(a==b) for (a,b) in zip(predictions,y)]
    acc = sum(correct) * 1.0 / len(correct)
    return acc

# Baseline

In [29]:
#if a comment includes words of the category name, then return that category.
catDict = {
    u'Pinot Noir': 0,
    u'Chardonnay': 1,
    u'Cabernet Sauvignon': 2,
    u'Red Blend': 3,
    u'Bordeaux-style Red Blend': 4,
    u'Riesling': 5,
    u'Sauvignon Blanc': 6,
    u'Syrah': 7,
    u'Rosé': 8,
    u'Merlot': 9
}

In [32]:
baseline_predictions = []
#baseline model:
for d in test:
    words = d[u'description'].lower()
    cat = catDict[u'Pinot Noir']
    if 'pinot' in words or 'noir' in words:
        cat = catDict[u'Pinot Noir']
    elif 'chardonnay' in words:
        cat = catDict[u'Chardonnay']
    elif 'cabernet' in words or 'sauvignon' in words:
        cat = catDict[u'Cabernet Sauvignon']
    elif 'red' in words or 'blend' in words:
        cat = catDict[u'Red Blend']
    elif 'bordeaux' in words and ('red' in words or 'blend'): #special
        cat = catDict[u'Bordeaux-style Red Blend']
    elif 'riesling' in words:
        cat = catDict[u'Riesling']
    elif 'sauvignon' in words or 'blanc' in words:
        cat = catDict[u'Sauvignon Blanc']
    elif 'syrah' in words:
        cat = catDict[u'Syrah']
    elif 'ros' in words:
        cat = catDict[u'Rosé']
    elif 'merlot' in words:
        cat = catDict[u'Merlot']
    baseline_predictions.append(cat)

#evaluate baseline:
corrects = []
for d,p in zip(test,baseline_predictions):
    cat = catDict[d[u'variety']]
    corrects.append(cat==p)
sum(corrects)*1.0/len(test)

0.3345671742239421

# Multi-SVM 

In [17]:
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

In [26]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)
def feature(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['description'].lower() if not c in punctuation.union(number)])
    for w in r.split():
        if w in wordSet:
            #feat[wordId[w]] += 1
            feat[wordId[w]] += 1
    feat.append(1) #offset
    return feat

In [27]:
X_train = [feature(r) for r in train]
X_valid = [feature(r) for r in valid]
X_test = [feature(r) for r in test]

y_train = [catDict[r['variety']] for r in train]
y_valid = [catDict[r['variety']] for r in valid]
y_test = [catDict[r['variety']] for r in test]

In [24]:
#multi class SVM:

clf_rbf = SVC(kernel='rbf')
clf_rbf.fit(X_train+X_valid, y_train+y_valid)

rbf_predicts = clf_rbf.predict(X_test)
rbf_acc = [a==b for a,b in zip(rbf_predic ts, y_test)]
rbf_acc = sum(rbf_acc)*1.0/len(rbf_acc)
print " rbf accuracy is ", rbf_acc

 rbf accuracy is  0.696979894002


In [None]:
#evaluate SVM
index = 0
for clf_rbf in clf_rbfs:
    rbf_predicts = clf_rbf.predict(X_test)
    rbf_acc = [a==b for a,b in zip(rbf_predicts, y_test)]
    rbf_acc = sum(rbf_acc)*1.0/len(rbf_acc)
    print "Gamma ", Gs[index], " rbf accuracy is ", rbf_acc
    index += 1

In [None]:
print 'hello world'

In [None]:
clfs = {}
for cat in range(10):
  y_trainC = [catDict[r['variety']] == cat for r in train]
  y_validC = [catDict[r['variety']] == cat for r in valid]
  bestAcc = 0
  bestCLF = None
  for c in 0.01, 0.1, 1, 10, 100, 3000:
    clf = SVC(C = c)
    clf.fit(X_train, y_trainC)
    predictions = [x for x in clf.predict(X_valid)]
    acc = [(x == y) for (x,y) in zip(predictions, y_validC)]
    acc = sum(acc) * 1.0 / len(acc)
    print("cat = " + str(cat) + ", C = " + str(c) + ": validation accuracy = " + str(acc))
    if acc > bestAcc:
      bestAcc = acc
      bestCLF = clf
  clfs[cat] = bestCLF

In [56]:
confidences = {}
for cat in range(10):
    confidences[cat] = clfs[cat].decision_function(X_valid)

predictions = []
for i in range(len(confidences[0])):
    cs = [(confidences[c][i],c) for c in range(10)]
    cs.sort()
    mostConfidentCategory = cs[-1][1]
    predictions.append(mostConfidentCategory)

validAcc = [(x == y) for (x,y) in zip(predictions, y_valid)]
validAcc = sum(validAcc) * 1.0 / len(validAcc)

print("Multi-SVM valid accuracy = " + str(validAcc))

Multi-SVM valid accuracy = 0.757676453268


In [57]:
confidences = {}
for cat in range(10):
    confidences[cat] = clfs[cat].decision_function(X_test)

predictions = []
for i in range(len(confidences[0])):
    cs = [(confidences[c][i],c) for c in range(10)]
    cs.sort()
    mostConfidentCategory = cs[-1][1]
    predictions.append(mostConfidentCategory)

testAcc = [(x == y) for (x,y) in zip(predictions, y_test)]
testAcc = sum(testAcc) * 1.0 / len(testAcc)

print("Multi-SVM test accuracy = " + str(testAcc))

Multi-SVM test accuracy = 0.757592327753
