In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from __future__ import unicode_literals

In [2]:
temp = pd.read_csv('winemag.csv', index_col = False)

In [3]:
from sklearn.model_selection import train_test_split
X = temp
X_train, X_test = train_test_split(X, test_size=0.7, random_state=42)

In [4]:
df = X_train

In [5]:
remove = ['Unnamed: 0', 'region_1', 'region_2', 'designation','taster_twitter_handle', 'province', 
          'title', 'winery','variety']
df = df.drop(remove,1)

In [6]:
df = df.reset_index(drop=True)

In [7]:
print(df.head())

  country                                        description  points  price  \
0   Italy  Pecorino is the white wine of the moment for t...      87   16.0   
1      US  Though not one of the producer's single-vineya...      94   38.0   
2      US  This is relatively light, with straightforward...      84   12.0   
3   Italy  Here's a powerfully fruit-driven Barbera d'Alb...      86    NaN   
4   Spain  Aromas of cola, raisin and cassis are ripe and...      91   45.0   

         taster_name  
0                NaN  
1      Matt Kettmann  
2         Jim Gordon  
3                NaN  
4  Michael Schachner  


In [8]:
#Fill NA of price with average price
df['price'].fillna((df['price'].mean()), inplace=True)

In [9]:
#Drop all other rows with country as NA.
df = df.dropna()

In [10]:
def text_cleaner(text):
    text = re.sub(r'--', '', text)
    text = re.sub('[\[].*?![\]]', "", text)
    text = ' '.join(text.split())
    return text

In [11]:
df['description'] = df['description'].apply(text_cleaner)

In [12]:
remove2 = ['country','price','points']
df = df.drop(remove2,1)

In [13]:
df.dtypes
print(df.head())

                                         description        taster_name
1  Though not one of the producer's single-vineya...      Matt Kettmann
2  This is relatively light, with straightforward...         Jim Gordon
4  Aromas of cola, raisin and cassis are ripe and...  Michael Schachner
5  This soft wine is a blend of 80% Malbec and 20...         Roger Voss
6  This Cab opens with soy and molasses and then ...     Virginie Boone


In [14]:
taster_names = df.taster_name.unique()

In [15]:
taster_reviews = []
for name in taster_names:
    temp_taster_reviews = df[df['taster_name']==name]
    temp_taster_reviews = " \n".join(temp_taster_reviews['description'])
    taster_reviews.append(temp_taster_reviews)

In [16]:
nlp = spacy.load('en')

In [17]:
taster_doc = []
for i in range(len(taster_names)):
    taster_doc.append(nlp(taster_reviews[i]))

In [18]:
taster_sents = []
for i in range(len(taster_doc)):
    taster_sents.append([[sent, taster_names[i]] for sent in taster_doc[i].sents])

In [19]:
sentences = []
for taster_sent in taster_sents:
    sentences = sentences+taster_sent
sentences = pd.DataFrame(sentences)

In [20]:
print(sentences.head())

                                                   0              1
0  (Though, not, one, of, the, producer, 's, sing...  Matt Kettmann
1  (Tart, mulberry, ,, Kalamata, olive, ,, hibisc...  Matt Kettmann
2  (It, delivers, an, excellent, combination, of,...  Matt Kettmann
3  (Perhaps, the, palest, Central, Coast, rosé, o...  Matt Kettmann
4  (The, strong, acidity, tantalizes, the, mouth,...  Matt Kettmann


In [21]:
sentences['sentence_length']=sentences[0].str.len()
print(sentences.head())

                                                   0              1  \
0  (Though, not, one, of, the, producer, 's, sing...  Matt Kettmann   
1  (Tart, mulberry, ,, Kalamata, olive, ,, hibisc...  Matt Kettmann   
2  (It, delivers, an, excellent, combination, of,...  Matt Kettmann   
3  (Perhaps, the, palest, Central, Coast, rosé, o...  Matt Kettmann   
4  (The, strong, acidity, tantalizes, the, mouth,...  Matt Kettmann   

   sentence_length  
0               24  
1               35  
2               13  
3               43  
4               21  


In [22]:
from collections import Counter
# Utility function to create a list of the 300 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(300)]
#item 0 is the word, item 1 is the count.

In [23]:
#Create a df with features for each word in our common word set.
#Each value is the count of the times the word appears.
#BOW is bag of words

def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df.loc[:, 'punctuation_length'] = 0
    df.loc[:, 'unique_words'] = 0
    df['taster_name'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]

        example_words = [token.lemma_
                        for token in sentence 
                        if (not token.is_punct)]         
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        puncts = [token for token in sentence if (token.is_punct)]
        df.loc[i,'punctuation_length'] += len(puncts)
  
        #Populate row with unique_words
        unique_words = [token for token in example_words]
        df.loc[i, 'unique_words'] += len(unique_words)
        

        # This counter is just to make sure the kernel didn't hang.
        if i % 1000 == 0:
            print("Processing row {}".format(i))
    return df

In [24]:
taster_words = []
for doc in taster_doc:
    taster_words.append(bag_of_words(doc))

In [25]:
common_words = []
for taster_word in taster_words:
    common_words = common_words+taster_word
common_words = set(common_words)

In [26]:
word_counts = bow_features(sentences,common_words)
word_counts.head()

Processing row 0
Processing row 1000
Processing row 2000
Processing row 3000
Processing row 4000
Processing row 5000
Processing row 6000
Processing row 7000
Processing row 8000
Processing row 9000
Processing row 10000
Processing row 11000
Processing row 12000
Processing row 13000
Processing row 14000
Processing row 15000
Processing row 16000
Processing row 17000
Processing row 18000
Processing row 19000
Processing row 20000
Processing row 21000
Processing row 22000
Processing row 23000
Processing row 24000
Processing row 25000
Processing row 26000
Processing row 27000
Processing row 28000
Processing row 29000
Processing row 30000
Processing row 31000
Processing row 32000
Processing row 33000
Processing row 34000
Processing row 35000
Processing row 36000
Processing row 37000
Processing row 38000
Processing row 39000
Processing row 40000
Processing row 41000
Processing row 42000
Processing row 43000
Processing row 44000
Processing row 45000
Processing row 46000
Processing row 47000
Proce

Unnamed: 0,freesia,grainy,butterscotch,toasty,distinctive,lychee,menthol,paso,noir,spring,...,touriga,lasting,zestiness,day,tone,pinot,text_sentence,punctuation_length,unique_words,taster_name
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(Though, not, one, of, the, producer, 's, sing...",3,21,Matt Kettmann
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(Tart, mulberry, ,, Kalamata, olive, ,, hibisc...",8,27,Matt Kettmann
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(It, delivers, an, excellent, combination, of,...",2,11,Matt Kettmann
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,1,"(Perhaps, the, palest, Central, Coast, rosé, o...",4,39,Matt Kettmann
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(The, strong, acidity, tantalizes, the, mouth,...",2,19,Matt Kettmann


In [28]:
#Concat the number of words in that sentence into the df.
word_counts = pd.concat([word_counts, sentences['sentence_length']], axis=1)

In [29]:
#Include column with number of words in the previous sentence.
#Create new column. Set to NULL
#word_counts['previous_length'] = word_counts['sentence_length']
word_counts['previous_length'] = None

for i in range(1,word_counts.shape[0]):
    word_counts.loc[i,'previous_length'] = word_counts.loc[i-1,'sentence_length']

In [30]:
word_counts['diff'] = (word_counts.taster_name.ne(word_counts.taster_name.shift())).astype(int)

In [31]:
word_counts.loc[word_counts['diff'] == 1, 'previous_length'] = None

In [32]:
word_counts['next_length'] = None
for i in range(0,word_counts.shape[-1]):
    word_counts.loc[i,'next_length'] = word_counts.loc[i+1,'sentence_length']

In [33]:
word_counts['diff2'] = (word_counts.taster_name.ne(word_counts.taster_name.shift(-1))).astype(int)
word_counts.loc[word_counts['diff2'] == 1, 'next_length'] = None

In [34]:
print(word_counts.head())

   freesia  grainy  butterscotch  toasty  distinctive  lychee  menthol  paso  \
0        0       0             0       0            0       0        0     0   
1        0       0             0       0            0       0        0     0   
2        0       0             0       0            0       0        0     0   
3        0       0             0       0            0       0        0     0   
4        0       0             0       0            0       0        0     0   

   noir  spring  ...    pinot  \
0     0       0  ...        0   
1     0       0  ...        0   
2     0       0  ...        0   
3     1       0  ...        1   
4     0       0  ...        0   

                                       text_sentence  punctuation_length  \
0  (Though, not, one, of, the, producer, 's, sing...                   3   
1  (Tart, mulberry, ,, Kalamata, olive, ,, hibisc...                   8   
2  (It, delivers, an, excellent, combination, of,...                   2   
3  (Perhaps, the

In [35]:
word_counts['previous_length'] = word_counts['previous_length'].fillna(word_counts['previous_length'].mean())
word_counts['next_length'] = word_counts['next_length'].fillna(word_counts['next_length'].mean())

In [36]:
#Trying random forest
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
y = word_counts['taster_name']
X = np.array(word_counts.drop(['text_sentence','taster_name','diff','diff2'],1))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
train = rfc.fit(X_train, y_train)

print('Training set score', rfc.score(X_train, y_train))
print('\nTest set score', rfc.score(X_test, y_test))

from sklearn.model_selection import cross_val_score
rf_scores = cross_val_score(rfc, X, y, cv=5)
print(rf_scores)
print ('\nMean cross validation score is: ' + str(np.mean(rf_scores)))

Training set score 0.972130141191

Test set score 0.632429446158




[ 0.64124245  0.63742522  0.6276259   0.61963155  0.6261876 ]

Mean cross validation score is: 0.630422542815


In [37]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
train = lr.fit(X_train, y_train)

print('Training set score', lr.score(X_train, y_train))
print('\nTest set score', lr.score(X_test, y_test))

from sklearn.model_selection import cross_val_score
lr_scores = cross_val_score(lr, X, y, cv=5)
print(lr_scores)
print ('\nMean cross validation score is: ' + str(np.mean(lr_scores)))

Training set score 0.797498465316

Test set score 0.7685189448




[ 0.76422203  0.76719972  0.76500719  0.76776051  0.76714458]

Mean cross validation score is: 0.766266807958


In [38]:
#Naive Bayes Classifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
    
#Testing Naive Bayes Classifier
BNB = BernoulliNB()
BNB.fit(X_train, y_train)

print('Training set score', BNB.score(X_train, y_train))
print('\nTest set score', BNB.score(X_test, y_test))

BNB_scores = cross_val_score(BNB, X, y, cv=5)
print(BNB_scores)
print ('\nMean cross validation score is: ' + str(np.mean(BNB_scores)))

Training set score 0.729619398404

Test set score 0.723079047926




[ 0.72269198  0.72359641  0.72155396  0.71876799  0.71854667]

Mean cross validation score is: 0.721031400585


In [47]:
#Testing the model on MLP Classifier. 

from sklearn.neural_network import MLPClassifier #Multi Layer Perceptron

#Establish and fit the model, with mutiple 500 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(800, 800), max_iter=100, batch_size=500, learning_rate_init=0.0001)
mlp.fit(X_train, y_train)

print('Training set score', mlp.score(X_train, y_train))
print('Test set score', mlp.score(X_test, y_test))

Training set score 0.962845303867
Test set score 0.750563970351


In [48]:
MLP_score = cross_val_score(mlp, X, y, cv=5)



array([ 0.76853609,  0.76984584,  0.77835971,  0.7716753 ,  0.76806587])

#max_iter=100, batch_size=500
#single layer (500, ) - Training set score 0.928959484346, Test set score 0.745637861977
#double layer (500, 500) - Training set score 0.958425414365, Test set score 0.754891579577
#double layer (500,500) with LR 0.01 - Training set score 0.907642725599, Test set score 0.740527600018
#double layer (500, 500) with LR 0.0001 - Training set score 0.964840392879, Test set score 0.744578978868
#double layer (300, 300) with LR 0.0001 -Training set score 0.964825046041, Test set score 0.737489065881
#double layer (800, 800) with LR 0.0001 - Training set score 0.962845303867, Test set score 0.750563970351

In [None]:
#Higher alpha overfits less.

In [53]:
#MLP classifier adjusting for alpha.
mlp = MLPClassifier(hidden_layer_sizes=(800, 800), max_iter=100, batch_size=100, 
                    learning_rate_init=0.001, alpha=0.6)
mlp.fit(X_train, y_train)

print('Training set score', mlp.score(X_train, y_train))
print('Test set score', mlp.score(X_test, y_test))

Training set score 0.825352977287
Test set score 0.766401178583


In [None]:
#double layer (500, 500), max_iter=100, batch_size=100, learning_rate_init=0.001, alpha=0.6 - Training set score 0.828376304481, Test set score 0.767644215276

In [54]:
MLP_scores = cross_val_score(mlp, X, y, cv=5)
print(MLP_scores)
print ('\nMean cross validation score is: ' + str(np.mean(MLP_scores)))



[ 0.7029048   0.70887023  0.68166906  0.69447323  0.70184833]

Mean cross validation score is: 0.697953131197
