In [1]:
import numpy as np
import pandas as pd
import string
from collections import defaultdict
import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords
from numpy.random import shuffle
import math

# Get the Data

In [3]:
print "Reading data..."
data_df = pd.read_json("winemag-data-130k-v2.json",dtype={
    'points': np.int32,
    'price': np.float32,
})
print "done"

Reading data...
done


In [4]:
data = data_df.to_dict('records')

In [5]:
variety_num = data_df['variety'].value_counts()
print("The number of grape types used to make the wine: " + str(len(variety_num)))

The number of grape types used to make the wine: 707


In [6]:
print variety_num[:10]

Pinot Noir                  13272
Chardonnay                  11753
Cabernet Sauvignon           9472
Red Blend                    8946
Bordeaux-style Red Blend     6915
Riesling                     5189
Sauvignon Blanc              4967
Syrah                        4142
Rosé                         3564
Merlot                       3102
Name: variety, dtype: int64


In [7]:
variety_set = {u'Pinot Noir',u'Chardonnay',u'Cabernet Sauvignon',u'Red Blend',u'Bordeaux-style Red Blend',u'Riesling',u'Sauvignon Blanc',u'Syrah',u'Rosé',u'Merlot'}

In [8]:
data_new = []
for d in data:
    if d['variety'] in variety_set:
        data_new.append(d)

In [9]:
data = data_new
len(data)

71322

In [10]:
punctuation = set(string.punctuation)
number = set(['0','1','2','3','4','5','6','7','8','9'])
#stemmer = PorterStemmer()

# Split data

In [11]:
#shuffle then split
#train, valid, test: 1/3, 1/3, 1/3
shuffle(data)
train = data[:len(data)/3]
valid = data[len(data)/3:2*len(data)/3]
test  = data[2*len(data)/3:]

In [12]:
#Description
#1. Remove punctuation
#2. Remove number
#3. Stemming (NOT IN THIS CASE)
#4. Remove stop words

In [13]:
wordCount = defaultdict(int)
count = 0
for d in train:
    count += 1
    r = ''.join([c for c in d['description'].lower() if not c in punctuation.union(number)])
    for w in r.split():
        #w = stemmer.stem(w) # with stemming
        if w not in set(stopwords.words('english')):
            wordCount[w] += 1
    if count%10000==0:
        print count

10000
20000


In [14]:
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
words = [x[1] for x in counts[0:4000]]

In [15]:
#Description
#1. Remove punctuation
#2. Remove number
#3. Stemming
#4. Remove stop words

# Some functions

In [16]:
def cosine_similarity(x,y):
    numerator = np.dot(x,y)
    denominator = np.linalg.norm(x)*np.linalg.norm(y)
    return round(numerator/float(denominator),3)

def check_word(word):
    feat = [0]*len(data)
    for i in range(len(data)):
        r = ''.join([c for c in data[i]['description'].lower() if not c in punctuation.union(number)])
        if word in set(r.split()):
            feat[i] = 1
    return sum(feat)

def idf(word):
    return -math.log10(check_word(word)*1.0/len(data))

def tf(word,datum):
    r = ''.join([c for c in datum['description'].lower() if not c in punctuation])
    return r.split().count(word)

def tf_idf(word,datum):
    return tf(word,datum)*idf(word)

def performance(predictions, y):
    correct = [(a==b) for (a,b) in zip(predictions,y)]
    acc = sum(correct) * 1.0 / len(correct)
    return acc

# Baseline

In [17]:
#if a comment includes words of the category name, then return that category.
catDict = {
    u'Pinot Noir': 0,
    u'Chardonnay': 1,
    u'Cabernet Sauvignon': 2,
    u'Red Blend': 3,
    u'Bordeaux-style Red Blend': 4,
    u'Riesling': 5,
    u'Sauvignon Blanc': 6,
    u'Syrah': 7,
    u'Rosé': 8,
    u'Merlot': 9
}
baseline_predictions = []
#baseline model:
for d in test:
    words = d[u'description'].lower()
    cat = catDict[u'Pinot Noir']
    if 'pinot' in words or 'noir' in words:
        cat = catDict[u'Pinot Noir']
    if 'chardonnay' in words:
        cat = catDict[u'Chardonnay']
    if 'cabernet' in words or 'sauvignon' in words:
        cat = catDict[u'Cabernet Sauvignon']
    if 'red' in words or 'blend':
        cat = catDict[u'Red Blend']
    if 'bordeaux' in words and ('red' in words or 'blend'): #special
        cat = catDict[u'Bordeaux-style Red Blend']
    if 'riesling' in words:
        cat = catDict[u'Riesling']
    if 'sauvignon' in words or 'blanc' in words:
        cat = catDict[u'Sauvignon Blanc']
    if 'syrah' in words:
        cat = catDict[u'Syrah']
    if 'ros' in words:
        cat = catDict[u'Rosé']
    if 'merlot' in words:
        cat = catDict[u'Merlot']
    baseline_predictions.append(cat)

#evaluate baseline:
corrects = []
for d,p in zip(test,baseline_predictions):
    cat = catDict[d[u'variety']]
    corrects.append(cat==p)
sum(corrects)*1.0/len(test)

0.18103810885841676

# Multi-SVM 

In [18]:
from sklearn import svm
from sklearn.svm import LinearSVC

In [19]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)
def feature(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['description'].lower() if not c in punctuation.union(number)])
    for w in r.split():
        if w in wordSet:
            #feat[wordId[w]] += 1
            feat[wordId[w]] = 1
    feat.append(1) #offset
    return feat

In [178]:
X_train = [feature(r) for r in train]
X_valid = [feature(r) for r in valid]
X_test = [feature(r) for r in test]

y_train = [catDict[r['variety']] for r in train]
y_valid = [catDict[r['variety']] for r in valid]
y_test = [catDict[r['variety']] for r in test]

In [179]:
clfs = {}
for cat in range(10):
  y_trainC = [catDict[r['variety']] == cat for r in train]
  y_validC = [catDict[r['variety']] == cat for r in valid]
  bestAcc = 0
  bestCLF = None
  for c in 0.01, 0.1, 1, 10, 100, 3000:
    clf = svm.LinearSVC(C = c)
    clf.fit(X_train, y_trainC)
    predictions = [x for x in clf.predict(X_valid)]
    acc = [(x == y) for (x,y) in zip(predictions, y_validC)]
    acc = sum(acc) * 1.0 / len(acc)
    print("cat = " + str(cat) + ", C = " + str(c) + ": validation accuracy = " + str(acc))
    if acc > bestAcc:
      bestAcc = acc
      bestCLF = clf
  clfs[cat] = bestCLF

cat = 0, C = 0.01: validation accuracy = 0.918692689493
cat = 0, C = 0.1: validation accuracy = 0.921132329436
cat = 0, C = 1: validation accuracy = 0.920585513586
cat = 0, C = 10: validation accuracy = 0.918398250189
cat = 0, C = 100: validation accuracy = 0.892445528729
cat = 0, C = 3000: validation accuracy = 0.894001850761
cat = 1, C = 0.01: validation accuracy = 0.94754774123
cat = 1, C = 0.1: validation accuracy = 0.948220745352
cat = 1, C = 1: validation accuracy = 0.945696979894
cat = 1, C = 10: validation accuracy = 0.943131151678
cat = 1, C = 100: validation accuracy = 0.925548918987
cat = 1, C = 3000: validation accuracy = 0.921973584588
cat = 2, C = 0.01: validation accuracy = 0.920543450829
cat = 2, C = 0.1: validation accuracy = 0.91873475225
cat = 2, C = 1: validation accuracy = 0.917430806764
cat = 2, C = 10: validation accuracy = 0.915159417851
cat = 2, C = 100: validation accuracy = 0.898334314798
cat = 2, C = 3000: validation accuracy = 0.882981408261
cat = 3, C = 0.

In [180]:
confidences = {}
for cat in range(10):
    confidences[cat] = clfs[cat].decision_function(X_valid)

predictions = []
for i in range(len(confidences[0])):
    cs = [(confidences[c][i],c) for c in range(10)]
    cs.sort()
    mostConfidentCategory = cs[-1][1]
    predictions.append(mostConfidentCategory)

validAcc = [(x == y) for (x,y) in zip(predictions, y_valid)]
validAcc = sum(validAcc) * 1.0 / len(validAcc)

print("Multi-SVM valid accuracy = " + str(validAcc))

Multi-SVM valid accuracy = 0.757970892572


In [181]:
confidences = {}
for cat in range(10):
    confidences[cat] = clfs[cat].decision_function(X_test)

predictions = []
for i in range(len(confidences[0])):
    cs = [(confidences[c][i],c) for c in range(10)]
    cs.sort()
    mostConfidentCategory = cs[-1][1]
    predictions.append(mostConfidentCategory)

testAcc = [(x == y) for (x,y) in zip(predictions, y_test)]
testAcc = sum(testAcc) * 1.0 / len(testAcc)

print("Multi-SVM test accuracy = " + str(testAcc))

Multi-SVM test accuracy = 0.750483721713


# tf_idf feature

In [31]:
wordCount2 = defaultdict(int)
count2 = 0
#stemmer = PorterStemmer()
for d in train:
    count2 += 1
    r = ''.join([c for c in d['description'].lower() if not c in punctuation.union(number)])
    for w in r.split():
        #w = stemmer.stem(w) # with stemming
        if w not in set(stopwords.words('english')):
            wordCount2[w] += 1
        #if count2%1000==0:
            #print count2
        
counts2 = [(wordCount2[w], w) for w in wordCount2]
counts2.sort()
counts2.reverse()

words2 = [x[1] for x in counts2[0:1000]]
wordId2 = dict(zip(words2, range(len(words2))))
wordSet2 = set(words2)

In [32]:
def check_word(word):
    feat = [0]*len(train)
    for i in range(len(train)):
        r = ''.join([c for c in train[i]['description'].lower() if not c in punctuation.union(number)])
        if word in set(r.split()):
            feat[i] = 1
    return sum(feat)

def idf(word):
    return math.log10(len(train)*1.0/check_word(word))

def tf(word,datum):
    r = ''.join([c for c in datum['description'].lower() if not c in punctuation.union(number)])
    return r.split().count(word)

def tf_idf(word,datum):
    return tf(word,datum)*idf(word)

def getIDF(datum,word):
    count = 0
    for d in datum:
        r = ''.join([c for c in d['description'].lower() if not c in punctuation])
        rList = r.split()
        if word in rList:
            count += 1
        #if count == 0:
            #print word
    return math.log(len(datum)/count*1.0,10)

def getTF(data,word):
    count = 0
    r = ''.join([c for c in data['description'].lower() if not c in punctuation])
    rList = r.split()
    for l in rList:
        if l == word:
            count += 1
    return count * 1.0

In [34]:
#word_idf = [getIDF(train, w) for w in words2]

In [44]:
def feature_tf_idf(datum):
    feat = [0]*len(words2)
    for w in words2:
        feat[wordId2[w]] = getTF(datum, w)*(word_idf[wordId2[w]])
    feat.append(1) #offset
    return feat

In [59]:
y_train = [catDict[r['variety']] for r in train]
print 'finish y train'
y_valid = [catDict[r['variety']] for r in valid]
print 'finish y valid'
y_test = [catDict[r['variety']] for r in test]
print 'finish y test'


X_train = [feature_tf_idf(r) for r in train]
print 'finish x train'
X_valid = [feature_tf_idf(r) for r in valid]
print 'finish x valid'
X_test = [feature_tf_idf(r) for r in test]
print 'finish x test'

finish y train
finish y valid
finish y test
finish x train
finish x valid
finish x test


In [60]:
clfs = {}
for cat in range(10):
  y_trainC = [catDict[r['variety']] == cat for r in train]
  y_validC = [catDict[r['variety']] == cat for r in valid]
  bestAcc = 0
  bestCLF = None
  for c in 0.01, 0.1, 1, 10, 100, 3000:
    clf = svm.LinearSVC(C = c)
    clf.fit(X_train, y_trainC)
    predictions = [x for x in clf.predict(X_valid)]
    acc = [(x == y) for (x,y) in zip(predictions, y_validC)]
    acc = sum(acc) * 1.0 / len(acc)
    print("cat = " + str(cat) + ", C = " + str(c) + ": validation accuracy = " + str(acc))
    if acc > bestAcc:
      bestAcc = acc
      bestCLF = clf
  clfs[cat] = bestCLF

cat = 0, C = 0.01: validation accuracy = 0.917304618491
cat = 0, C = 0.1: validation accuracy = 0.916841928157
cat = 0, C = 1: validation accuracy = 0.91642130058
cat = 0, C = 10: validation accuracy = 0.912425338605
cat = 0, C = 100: validation accuracy = 0.89438041558
cat = 0, C = 3000: validation accuracy = 0.888407503996
cat = 1, C = 0.01: validation accuracy = 0.946327921259
cat = 1, C = 0.1: validation accuracy = 0.946496172289
cat = 1, C = 1: validation accuracy = 0.943257339951
cat = 1, C = 10: validation accuracy = 0.942331959283
cat = 1, C = 100: validation accuracy = 0.928745688567
cat = 1, C = 3000: validation accuracy = 0.929418692689
cat = 2, C = 0.01: validation accuracy = 0.92016488601
cat = 2, C = 0.1: validation accuracy = 0.918145873643
cat = 2, C = 1: validation accuracy = 0.917683183309
cat = 2, C = 10: validation accuracy = 0.914149911668
cat = 2, C = 100: validation accuracy = 0.867165811391
cat = 2, C = 3000: validation accuracy = 0.856397745436
cat = 3, C = 0.0

In [61]:
confidences = {}
for cat in range(10):
    confidences[cat] = clfs[cat].decision_function(X_valid)

predictions = []
for i in range(len(confidences[0])):
    cs = [(confidences[c][i],c) for c in range(10)]
    cs.sort()
    mostConfidentCategory = cs[-1][1]
    predictions.append(mostConfidentCategory)

validAcc = [(x == y) for (x,y) in zip(predictions, y_valid)]
validAcc = sum(validAcc) * 1.0 / len(validAcc)

print("Multi-SVM valid accuracy = " + str(validAcc))

Multi-SVM valid accuracy = 0.753932867839


In [62]:
confidences = {}
for cat in range(10):
    confidences[cat] = clfs[cat].decision_function(X_test)

predictions = []
for i in range(len(confidences[0])):
    cs = [(confidences[c][i],c) for c in range(10)]
    cs.sort()
    mostConfidentCategory = cs[-1][1]
    predictions.append(mostConfidentCategory)

testAcc = [(x == y) for (x,y) in zip(predictions, y_test)]
testAcc = sum(testAcc) * 1.0 / len(testAcc)

print("Multi-SVM test accuracy = " + str(testAcc))

Multi-SVM test accuracy = 0.759022461513
