In [None]:
# use kaggle data to train models
# then will test on my scraped dataset

#### Objective: 
I plan to use the Word2Vec algorithm to train a new set of embeddings representative of the wine domain, then use those embeddings to fit a LTSM classification model. I also will plan on implementing a more simple, baseline model such as an SVM. 

#### Prep

In [31]:
# set working directory
import os
import sys
project_root = '/Users/kgedney/Documents/georgetown/anly580/anly580-wine-project'
os.chdir(project_root)

In [42]:
# install packages
import numpy as np
import pandas as pd


from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [33]:
# kaggle load data 
# ref: https://www.kaggle.com/zynicide/wine-reviews#winemag-data_first150k.csv
df_full = pd.read_csv('data/winemag-data_first150k.csv', encoding='utf8')

In [34]:
df_full.columns

Index(['Unnamed: 0', 'country', 'description', 'designation', 'points',
       'price', 'province', 'region_1', 'region_2', 'variety', 'winery'],
      dtype='object')

#### Define Classes

In [35]:
# select wines with over 1050 tasting notes (to make an even 30 varities)
variety_counts = pd.DataFrame(df_full['variety'].value_counts())
varieties      = list(variety_counts.index[variety_counts['variety'] > 1050])
len(varieties)

30

In [36]:
df = df_full[df_full['variety'].isin(varieties)]

In [37]:
df.variety.value_counts()

Chardonnay                       14482
Pinot Noir                       14291
Cabernet Sauvignon               12800
Red Blend                        10062
Bordeaux-style Red Blend          7347
Sauvignon Blanc                   6320
Syrah                             5825
Riesling                          5524
Merlot                            5070
Zinfandel                         3799
Sangiovese                        3345
Malbec                            3208
White Blend                       2824
Rosé                              2817
Tempranillo                       2556
Nebbiolo                          2241
Portuguese Red                    2216
Sparkling Blend                   2004
Shiraz                            1970
Corvina, Rondinella, Molinara     1682
Rhône-style Red Blend             1505
Barbera                           1365
Pinot Gris                        1365
Cabernet Franc                    1363
Sangiovese Grosso                 1346
Pinot Grigio             

In [38]:
# map classes to integers
df['variety_code'] = pd.Categorical(df.variety).codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [40]:
# set classes
x = df.description.values
y = df.variety_code.values

In [43]:
# split test and train
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=522)

#### Baseline: Linear SVM

In [51]:
# majority class prediction
np.mean(y_test == 6)

0.11562917662306103

In [44]:
# data preprocess
tfidf_vec   = TfidfVectorizer()
x_train_vec = tfidf_vec.fit_transform(x_train)
x_test_vec  = tfidf_vec.transform(x_test)
x_train_vec.shape

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(98761, 26413)

In [45]:
# fit model
from sklearn.svm import LinearSVC
clf             = LinearSVC().fit(x_train_vec, y_train)
predicted       = clf.predict(x_test_vec)
predicted_score = clf.decision_function(x_test_vec)
print('accuracy', metrics.accuracy_score(y_test, predicted))

accuracy 0.8066906970151068


In [54]:
# grid search on C

def fit_svc(C=1):
    clf             = LinearSVC(C=C).fit(x_train_vec, y_train)
    predicted       = clf.predict(x_test_vec)
    predicted_score = clf.decision_function(x_test_vec)
    return metrics.accuracy_score(y_test, predicted)

for C in [0.1, 0.5, 1, 2, 4, 8]:
    print(fit_svc(C=C))


0.7500708760277024
0.7957150378680491
0.8066906970151068
0.8143048074197076
0.8170183467660281
0.8142238062451905


#### Preprocess Data for Keras Models

In [56]:
import keras
from keras import optimizers
import tensorflow as tf

from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

from keras.models import Model, Sequential
from keras.layers import Input, Dense, CuDNNLSTM, LSTM, Embedding, Bidirectional, GlobalAveragePooling1D, Conv1D, Activation, Flatten, Dropout, MaxPooling1D, Embedding, GlobalMaxPooling1D

from keras.layers.core import Dense, Dropout

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [57]:
# data preprocess
x = df.description.values
y = df.variety_code.values

# create sequences
max_features = 25000
tokenizer    = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(x)
x_sequences  = tokenizer.texts_to_sequences(x)

# pad each sequence to be max length
maxlen = max(len(x) for x in x_sequences)
print(maxlen)
x_sequences = sequence.pad_sequences(x_sequences, maxlen)

136


In [61]:
# split test and train
x_train, x_test, y_train, y_test = train_test_split(x_sequences, y, test_size=0.20, random_state=22)

#### 1. Simple Model

In [62]:
model1 = Sequential()
model1.add(Embedding(input_dim=max_features,
                    output_dim=128))

# we add a GlobalAveragePooling1D, which will average the embeddings
# of all words in the document
model1.add(GlobalAveragePooling1D())
model1.add(Dropout(0.5))

model1.add(Dense(100, activation="softmax"))

In [63]:
opt = keras.optimizers.Adam(lr=0.01) # speed up optimization
model1.compile(optimizer=opt, loss="sparse_categorical_crossentropy", metrics=["acc"])

In [64]:
history1 = model1.fit(x_train, y_train,
            batch_size=256,
            epochs=16,
            validation_data=(x_test, y_test))

Train on 98761 samples, validate on 24691 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [65]:
print('accuracy', model1.evaluate(x_test, y_test)[1])

accuracy 0.8191648778834874


In [66]:
print('train accuracy', model1.evaluate(x_train, y_train)[1])

train accuracy 0.9514383207952491


#### Fast Text