In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from __future__ import unicode_literals

In [2]:
temp = pd.read_csv('winemag.csv', index_col = False)

In [3]:
from sklearn.model_selection import train_test_split
X = temp
X_train, X_test = train_test_split(X, test_size=0.8, random_state=42)

In [4]:
df = X_train

In [5]:
remove = ['Unnamed: 0', 'region_1', 'region_2', 'designation','taster_twitter_handle', 'province', 
          'title', 'winery','variety']
df = df.drop(remove,1)

In [6]:
df = df.reset_index(drop=True)

In [7]:
print(df.head())

     country                                        description  points  \
0  Argentina  Like almost every wine in Trapiche's single-vi...      93   
1         US  From Paso Robles's warmer eastside, this wine ...      92   
2      Italy  This is a bright and perky red wine with well-...      89   
3         US  This wine is a blend of 40% each Mourvèdre and...      88   
4     Israel  Light cassis and black currant aromas softly a...      84   

   price        taster_name  
0   50.0  Michael Schachner  
1   38.0      Matt Kettmann  
2   25.0                NaN  
3   20.0   Sean P. Sullivan  
4   22.0      Lauren Buzzeo  


In [8]:
#Fill NA of price with average price
df['price'].fillna((df['price'].mean()), inplace=True)

In [9]:
#Drop all other rows with country as NA.
df = df.dropna()

In [10]:
def text_cleaner(text):
    text = re.sub(r'--', '', text)
    text = re.sub('[\[].*?![\]]', "", text)
    text = ' '.join(text.split())
    return text

In [11]:
df['description'] = df['description'].apply(text_cleaner)

In [12]:
def quartile(value):
    if value > 91:
        return 1
    elif value > 88:
        return 2
    elif value > 86:
        return 3
    elif value > 79:
        return 4

In [13]:
df['quartile'] = df['points'].apply(quartile)

In [14]:
remove2 = ['country','price','points','taster_name']
df = df.drop(remove2,1)

In [15]:
df.dtypes
print(df.head())

                                         description  quartile
0  Like almost every wine in Trapiche's single-vi...         1
1  From Paso Robles's warmer eastside, this wine ...         1
3  This wine is a blend of 40% each Mourvèdre and...         3
4  Light cassis and black currant aromas softly a...         4
6  Concentrated flavors and a richly tannic textu...         2


In [16]:
q = []
for i in range(1,5):
    qtemp = df[df['quartile']==i]
    qtemp = " \n".join(qtemp['description'])
    q.append(qtemp)

In [17]:
q1 = df[df['quartile']==1]
q1=" \n".join(q1['description'])

In [18]:
q2 = df[df['quartile']==2]
q2=" \n".join(q2['description'])

In [19]:
q3 = df[df['quartile']==3]
q3=" \n".join(q3['description'])

In [20]:
q4 = df[df['quartile']==4]
q4=" \n".join(q4['description'])

In [21]:
nlp = spacy.load('en')

In [22]:
q1_doc = nlp(q1)

In [23]:
q2_doc = nlp(q2)

In [24]:
q3_doc = nlp(q3)

In [25]:
q4_doc = nlp(q4)

In [26]:
#Group into sentences
q1_sents = [[sent, "1"] for sent in q1_doc.sents]
q2_sents = [[sent, '2'] for sent in q2_doc.sents]
q3_sents = [[sent, "3"] for sent in q3_doc.sents]
q4_sents = [[sent, '4'] for sent in q4_doc.sents]

#Combine the sentences from the 4 quartiles into one df.
sentences = pd.DataFrame(q1_sents + q2_sents + q3_sents + q4_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Like, almost, every, wine, in, Trapiche, 's, ...",1
1,"(This, feels, layered, and, plush, ,, while, f...",1
2,"(Drink, through, 2018, ., \n)",1
3,"(From, Paso, Robles, 's, warmer, eastside, ,, ...",1
4,"(It, 's, immediately, delicious, once, sipped,...",1


In [27]:
sentences['sentence_length']=sentences[0].str.len()
print(sentences.head())

                                                   0  1  sentence_length
0  (Like, almost, every, wine, in, Trapiche, 's, ...  1               33
1  (This, feels, layered, and, plush, ,, while, f...  1               27
2                      (Drink, through, 2018, ., \n)  1                5
3  (From, Paso, Robles, 's, warmer, eastside, ,, ...  1               23
4  (It, 's, immediately, delicious, once, sipped,...  1               31


In [28]:
from collections import Counter
# Utility function to create a list of the 300 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(300)]
#item 0 is the word, item 1 is the count.

In [29]:
#Create a df with features for each word in our common word set.
#Each value is the count of the times the word appears.
#BOW is bag of words

def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df.loc[:, 'punctuation_length'] = 0
    df['quartile'] = sentences[1]
    df.loc[:, common_words] = 0
    df.loc[:, 'unique_words'] = 0
    #loc you use the column name, iloc you use index (column number)
    #: means all rows. columns within common_words would be 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]

        example_words = [token.lemma_
                        for token in sentence 
                        if (not token.is_punct)]       
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        puncts = [token for token in sentence if (token.is_punct)]
        df.loc[i,'punctuation_length'] += len(puncts)
        
        #Populate row with unique_words
        unique_words = [token for token in example_words]
        df.loc[i, 'unique_words'] += len(unique_words)
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 1000 == 0:
            print("Processing row {}".format(i))
    return df

In [30]:
q1words = bag_of_words(q1_doc)
q2words = bag_of_words(q2_doc)
q3words = bag_of_words(q3_doc)
q4words = bag_of_words(q4_doc)
common_words = set(q1words + q2words + q3words + q4words)

In [31]:
word_counts = bow_features(sentences,common_words)
word_counts.head()

Processing row 0
Processing row 1000
Processing row 2000
Processing row 3000
Processing row 4000
Processing row 5000
Processing row 6000
Processing row 7000
Processing row 8000
Processing row 9000
Processing row 10000
Processing row 11000
Processing row 12000
Processing row 13000
Processing row 14000
Processing row 15000
Processing row 16000
Processing row 17000
Processing row 18000
Processing row 19000
Processing row 20000
Processing row 21000
Processing row 22000
Processing row 23000
Processing row 24000
Processing row 25000
Processing row 26000
Processing row 27000
Processing row 28000
Processing row 29000
Processing row 30000
Processing row 31000
Processing row 32000
Processing row 33000
Processing row 34000
Processing row 35000
Processing row 36000
Processing row 37000
Processing row 38000
Processing row 39000
Processing row 40000
Processing row 41000
Processing row 42000
Processing row 43000
Processing row 44000
Processing row 45000
Processing row 46000
Processing row 47000
Proce

Unnamed: 0,bold,flat,winery,make,good,franc,basic,candy,acid,banana,...,prune,coffee,attractive,pinot,focus,burn,text_sentence,punctuation_length,quartile,unique_words
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(Like, almost, every, wine, in, Trapiche, 's, ...",6,1,27
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,"(This, feels, layered, and, plush, ,, while, f...",4,1,23
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(Drink, through, 2018, ., \n)",1,1,4
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(From, Paso, Robles, 's, warmer, eastside, ,, ...",3,1,20
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(It, 's, immediately, delicious, once, sipped,...",5,1,26


In [32]:
#Concat the number of words in that sentence into the df.
word_counts = pd.concat([word_counts, sentences['sentence_length']], axis=1)

In [33]:
word_counts['previous_length'] = None

for i in range(1,word_counts.shape[0]):
    word_counts.loc[i,'previous_length'] = word_counts.loc[i-1,'sentence_length']

In [34]:
word_counts['diff'] = (word_counts.quartile.ne(word_counts.quartile.shift())).astype(int)
word_counts.loc[word_counts['diff'] == 1, 'previous_length'] = None

In [35]:
word_counts['next_length'] = None
for i in range(0,word_counts.shape[-1]):
    word_counts.loc[i,'next_length'] = word_counts.loc[i+1,'sentence_length']

In [36]:
word_counts['diff2'] = (word_counts.quartile.ne(word_counts.quartile.shift(-1))).astype(int)
word_counts.loc[word_counts['diff2'] == 1, 'next_length'] = None

In [37]:
word_counts['previous_length'] = word_counts['previous_length'].fillna(word_counts['previous_length'].mean())
word_counts['next_length'] = word_counts['next_length'].fillna(word_counts['next_length'].mean())

In [None]:
Y = word_counts['quartile']
X = np.array(word_counts.drop(['text_sentence','quartile','diff','diff2'],1))

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

In [38]:
#Trying random forest
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
train = rfc.fit(X_train, y_train)

print('Training set score', rfc.score(X_train, y_train))
print('\nTest set score', rfc.score(X_test, y_test))

from sklearn.model_selection import cross_val_score
cross_val_score(rfc, X, Y, cv=5)

Training set score 0.95700409704

Test set score 0.381205551336


array([ 0.38534565,  0.37645637,  0.3866402 ,  0.3813552 ,  0.38340672])

In [39]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score', lr.score(X_train, y_train))
print('\nTest set score', lr.score(X_test, y_test))

from sklearn.model_selection import cross_val_score
cross_val_score(lr, X, Y, cv=5)

(43446, 426) (43446,)
Training set score 0.451848271417

Test set score 0.427673824484


array([ 0.43281263,  0.42616726,  0.43557435,  0.4246871 ,  0.42415609])

In [None]:
Random forest shows major overfitting of data. Seems like words do not seem to be good predictor of which quartile 
the wine would belong in as it only shows ~40% accuracy if we were to use Quartile as a target variable.


In [None]:
Running the Tfidf Model. 

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(word_counts, test_size=0.25, random_state=0)

vectorizer = TfidfVectorizer(
                            stop_words='english', 
                            lowercase=True, #convert everything to lower case
                            use_idf=True, #use inverse document frequencies in our weighting
                            norm=u'l2', #applies a correction factor so short and long paragraphs are treated equally
                            smooth_idf=True, #adds 1 to all document frequencies, prevents divide by 0 errors
                            ngram_range=(0,3),
                            analyzer='word'
                           )

#applying the vectorizer
word_counts_tfidf = vectorizer.fit_transform(word_counts)
print('Number of features: %d' % word_counts_tfidf.get_shape()[1])

#Split into train and test.
X_train_tfidf, X_test_tfidf = train_test_split(word_counts_tfidf, test_size=0.25, random_state=42)
#Reshape the vectorizer
X_train_tfidf_csr = X_train_tfidf.tocsr() #Return a copy of this matrix in Compressed Sparse Row format

#Number of paragraphs
n = X_train_tfidf_csr.shape[0]

#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]

#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, list the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

Number of features: 411


In [62]:
from sklearn.decomposition import TruncatedSVD #Singular Value Decomposition
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#Reduce the feature space to 150
svd = TruncatedSVD(150)
lsa = make_pipeline(svd, Normalizer(copy=False)) #LSA is latent semantic analysis

#Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained = svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print('Percent variance captured by all components:', total_variance*100)

Percent variance captured by all components: 50.1021083294


In [None]:
#Analyze what sorts of paragraphs our solution considers similar, for the first five identified topics.
paras_by_component = pd.DataFrame(X_train_lsa, index=X_train)

for i in range(5):
    print('Component {}'.format(i))
print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])