# NLP on the descriptions of wine reviews

In [12]:
import string
from collections import Counter
from functools import lru_cache

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [None]:
@lru_cache(maxsize=1)
def _stemming_resources():
    """Build the stemmer, stop-word set and punctuation table once and reuse them.

    Building the stop-word set loads the NLTK corpus, so doing it for every
    row the function is applied to is wasted work. The cache is filled lazily
    on the first call.
    """
    stemmer = PorterStemmer()
    stop_words = frozenset(stopwords.words('english'))
    punctuation_table = str.maketrans('', '', string.punctuation)
    return stemmer, stop_words, punctuation_table


def filtered_sequence(input_sequence):
    """Return the stemmed, stop-word-free tokens of a text.

    Steps: delete ASCII punctuation, tokenize with ``word_tokenize``, drop
    English stop words (compared case-insensitively), then Porter-stem what
    remains.

    Parameters
    ----------
    input_sequence : str
        Raw text. Any non-string value (e.g. None/NaN for a missing
        description) yields an empty list instead of raising.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    if not isinstance(input_sequence, str):
        return []

    stemmer, stop_words, punctuation_table = _stemming_resources()

    # Punctuation is deleted, not replaced by a space, so hyphenated words are
    # fused ("well-drained" -> "welldrained") before tokenization.
    cleaned_text = input_sequence.translate(punctuation_table)
    tokens = word_tokenize(cleaned_text)

    return [
        stemmer.stem(token)
        for token in tokens
        if token.lower() not in stop_words
    ]

In [13]:
# Smoke-test the tokenizer/stemmer on a short sentence; the result is echoed as the cell output.
example_sent = """This is a sample sentence,
                  showing off the stop words filtration."""

filtered_sequence(example_sent)

['sampl', 'sentenc', 'show', 'stop', 'word', 'filtrat']

In [14]:
# Load the wine reviews, using the CSV's 'index' column as the DataFrame index.
df = pd.read_csv('data/winedata.csv', index_col='index')
# Preview the first rows.
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,polarity_text,subjectivity_text
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,0.133333,0.733333
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,0.22,0.46
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,0.025,0.358333
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,0.166667,0.333333
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,0.306667,0.448718


In [15]:
# Tokenize, filter and stem every description; each cell of the new column holds a list of stems.
df['Stem word desc'] = df['description'].apply(filtered_sequence)

In [16]:
# Inspect the stemmed token lists (pandas truncates the display of a long Series).
df['Stem word desc']

0         [aroma, includ, tropic, fruit, broom, brimston...
1         [ripe, fruiti, wine, smooth, still, structur, ...
2         [tart, snappi, flavor, lime, flesh, rind, domi...
3         [pineappl, rind, lemon, pith, orang, blossom, ...
4         [much, like, regular, bottl, 2012, come, acros...
                                ...                        
129966    [note, honeysuckl, cantaloup, sweeten, delici,...
129967    [citat, given, much, decad, bottl, age, prior,...
129968    [welldrain, gravel, soil, give, wine, crisp, d...
129969    [dri, style, pinot, gri, crisp, acid, also, we...
129970    [big, rich, offdri, power, intens, spici, roun...
Name: Stem word desc, Length: 129971, dtype: object

In [17]:
# Count the occurrences of each stem across all descriptions.
# Counter does the tallying; dict(...) keeps `count` a plain dict, with keys in
# first-seen order exactly as the manual loop produced them.
count = dict(
    Counter(stem for stems in df['Stem word desc'] for stem in stems)
)

In [18]:
# Rank the stems from most to least frequent. The sort is stable, so stems with
# equal counts keep their first-seen order.
count_sorted = sorted(count.items(), key=lambda pair: pair[1], reverse=True)

# Preview only the head of the ranking; echoing the whole vocabulary floods the notebook output.
count_sorted[:50]

[('wine', 82799),
 ('flavor', 70022),
 ('fruit', 58680),
 ('aroma', 41019),
 ('finish', 40428),
 ('acid', 39249),
 ('palat', 38282),
 ('tannin', 32813),
 ('drink', 32635),
 ('cherri', 31641),
 ('ripe', 28737),
 ('black', 25422),
 ('note', 25248),
 ('dri', 24429),
 ('spice', 23000),
 ('rich', 21833),
 ('fresh', 21402),
 ('red', 19110),
 ('show', 18541),
 ('berri', 17827),
 ('oak', 17146),
 ('nose', 16934),
 ('offer', 16128),
 ('plum', 15498),
 ('textur', 15468),
 ('blend', 14961),
 ('blackberri', 14923),
 ('sweet', 14856),
 ('appl', 14615),
 ('balanc', 14454),
 ('soft', 14120),
 ('crisp', 13611),
 ('age', 13219),
 ('structur', 12928),
 ('white', 12476),
 ('fruiti', 11921),
 ('light', 11811),
 ('dark', 11414),
 ('hint', 11382),
 ('citru', 11375),
 ('bright', 11337),
 ('give', 11117),
 ('raspberri', 10848),
 ('miner', 10836),
 ('herb', 10809),
 ('cabernet', 10699),
 ('vanilla', 10690),
 ('well', 10618),
 ('touch', 10556),
 ('full', 10216),
 ('juici', 10157),
 ('pepper', 10083),
 ('good', 

In [19]:
# Keep only the most frequent words.
# TOP_N_WORDS is the number of top-ranked stems retained as features; tune it here.
TOP_N_WORDS = 20
count_sorted = count_sorted[:TOP_N_WORDS]
count_sorted

[('wine', 82799),
 ('flavor', 70022),
 ('fruit', 58680),
 ('aroma', 41019),
 ('finish', 40428),
 ('acid', 39249),
 ('palat', 38282),
 ('tannin', 32813),
 ('drink', 32635),
 ('cherri', 31641),
 ('ripe', 28737),
 ('black', 25422),
 ('note', 25248),
 ('dri', 24429),
 ('spice', 23000),
 ('rich', 21833),
 ('fresh', 21402),
 ('red', 19110),
 ('show', 18541),
 ('berri', 17827)]

In [20]:
# Extract just the stems (dropping their counts) from the ranked (stem, count) pairs.
most_frequent_words = [word for word, _ in count_sorted]
most_frequent_words

['wine',
 'flavor',
 'fruit',
 'aroma',
 'finish',
 'acid',
 'palat',
 'tannin',
 'drink',
 'cherri',
 'ripe',
 'black',
 'note',
 'dri',
 'spice',
 'rich',
 'fresh',
 'red',
 'show',
 'berri']

In [21]:
# Keep only the most frequent stems in each description.
# Membership is tested against a set so each lookup is O(1) instead of a scan
# of the list for every token of every row.
frequent_word_set = set(most_frequent_words)
df['Stem word desc'] = df['Stem word desc'].apply(
    lambda stems: [stem for stem in stems if stem in frequent_word_set]
)

df['Stem word desc']

0                 [aroma, fruit, dri, palat, dri, acid]
1         [ripe, wine, tannin, red, berri, fruit, acid]
2                          [flavor, acid, flavor, wine]
3                          [aroma, palat, note, finish]
4                                                [wine]
                              ...                      
129966                              [note, palat, acid]
129967                   [drink, cherri, flavor, fruit]
129968            [wine, dri, ripe, spice, wine, drink]
129969          [dri, acid, spice, flavor, wine, drink]
129970                             [rich, fruit, drink]
Name: Stem word desc, Length: 129971, dtype: object

In [22]:
# Build one 0/1 indicator column per frequent stem: 1 when the stem occurs in
# the row's token list, 0 otherwise. Each column is named after its stem.
for word in most_frequent_words:
    df[word] = df['Stem word desc'].apply(lambda stems: int(word in stems))
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,...,ripe,black,note,dri,spice,rich,fresh,red,show,berri
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,...,0,0,0,1,0,0,0,0,0,0
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,...,1,0,0,0,0,0,0,1,0,1
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,...,0,0,0,0,0,0,0,0,0,0
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,...,0,0,1,0,0,0,0,0,0,0
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Take an independent copy of every column from position 15 onwards.
# NOTE(review): the offset is positional, so it depends on the column layout of
# the input CSV -- confirm that column 15 is the intended first column to keep.
new_df = df.iloc[:, 15:].copy()

In [28]:
# Persist the selected columns to disk.
# NOTE(review): any list-valued column still present in new_df is written as its
# string representation -- confirm that downstream readers expect that.
new_df.to_csv('data/stem.csv')