In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Read `winemag-150k-reviews.csv` into a data frame, loading only the
# four columns that the questions below rely on.
filename = '../data/winemag-150k-reviews.csv'

df = pd.read_csv(
    filename,
    usecols=['country', 'province', 'description', 'variety'],
)
df.head()

Unnamed: 0,country,description,province,variety
0,US,This tremendous 100% varietal wine hails from ...,California,Cabernet Sauvignon
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Northern Spain,Tinta de Toro
2,US,Mac Watson honors the memory of a wine once ma...,California,Sauvignon Blanc
3,US,"This spent 20 months in 30% new French oak, an...",Oregon,Pinot Noir
4,France,"This is the top wine from La Bégude, named aft...",Provence,Provence red blend


In [3]:
# What are the 10 most common words containing 5 or more letters in the wine descriptions?
# Turn all words into lowercase, and remove all punctuation and symbols at the start or end of each word,
# for easier comparison.

# Also: remove the words flavors, aromas, finish, drink, and palate.

def top_10_words(s, n=10, min_length=5, excluded_words=None,
                 strip_chars=',$.?!%'):
    """Return the most frequent words in a Series of text.

    Each string is lowercased and split on whitespace, and every resulting
    word has the characters in `strip_chars` removed from both ends.
    Words shorter than `min_length`, and words listed in `excluded_words`,
    are then dropped before counting.

    Parameters
    ----------
    s : pandas.Series
        Series of strings (accessed through the `.str` accessor).
    n : int, default 10
        Number of words to return.
    min_length : int, default 5
        Minimum length, measured after stripping, for a word to be counted.
    excluded_words : list-like of str, optional
        Words to leave out of the count. Defaults to
        ['flavors', 'aromas', 'finish', 'drink', 'palate'].
    strip_chars : str, default ',$.?!%'
        Characters removed from the start and end of each word. The default
        covers only these six characters, so a word such as 'tannins;'
        keeps its semicolon; pass `string.punctuation` to strip every
        ASCII punctuation mark.

    Returns
    -------
    pandas.Series
        Word counts indexed by word, in descending order of count,
        with at most `n` entries.
    """
    if excluded_words is None:
        excluded_words = ['flavors', 'aromas',
                          'finish', 'drink', 'palate']

    # One row per word: explode() flattens the per-description word lists.
    words = (
        s
        .str.lower()
        .str.split()
        .explode()
        .str.strip(strip_chars)
        )

    # Missing values have length NaN, which fails the >= comparison, so
    # they are filtered out here together with the short words.
    return (
        words
        .loc[(words.str.len() >= min_length) &
             (~words.isin(excluded_words))]
        .value_counts()
        .head(n)
    )



# Compact form of the same function (same steps as single expressions), kept for reference:
# def top_10_words(s):
#     words = s.str.lower().str.split().explode().str.strip(',$.?!$%')
#     common_wine_words = ['flavors', 'aromas', 'finish', 'drink', 'palate']
#     return words[(words.str.len()>=5) & (~words.isin(common_wine_words))].value_counts().head(10)

# Count words across every description in the data set.
top_10_words(df.loc[:, 'description'])

description
fruit      56327
acidity    32536
tannins    32098
cherry     30639
black      24568
spice      22601
sweet      21243
notes      19581
fresh      17641
berry      17083
Name: count, dtype: int64

In [4]:
# What are the 10 most common words for non-California wines?
# Keeps every row whose province is anything other than 'California'.
top_10_words(
    df
    .loc[df['province'].ne('California'), 'description']
)

description
fruit      46371
acidity    22270
tannins    21929
cherry     19440
spice      18522
black      17758
notes      16569
fresh      16200
berry      15478
sweet      12708
Name: count, dtype: int64

In [5]:
# What are the 10 most common words for French wines?
# Keeps only the rows whose country is exactly 'France'.
top_10_words(
    df
    .loc[df['country'].eq('France'), 'description']
)

description
fruit        8688
acidity      8632
tannins      6491
fruits       5449
fresh        4213
character    3494
black        3119
texture      3069
years        2880
crisp        2875
Name: count, dtype: int64

In [6]:
# What are the 10 most common words for white wines?
# "White" is taken here to mean the three varieties listed below.
top_10_words(
    df.loc[
        df['variety'].isin(['Chardonnay', 'Sauvignon Blanc', 'Riesling']),
        'description',
    ]
)

description
fruit         9133
acidity       8346
apple         5879
citrus        5368
crisp         4903
chardonnay    4871
green         4177
notes         4021
sweet         3850
pineapple     3847
Name: count, dtype: int64

In [7]:
# What are the 10 most common words for red wines?
# "Red" is taken here to mean the five varieties listed below.
top_10_words(
    df.loc[
        df['variety'].isin(['Pinot Noir', 'Cabernet Sauvignon',
                            'Syrah', 'Merlot', 'Zinfandel']),
        'description',
    ]
)

description
fruit         15011
cherry        14024
tannins       13138
black          9535
blackberry     6764
acidity        6338
pinot          6326
sweet          5982
cherries       5370
shows          5337
Name: count, dtype: int64

In [8]:
# What are the 10 most common words for rosé wines?
# Keeps only the rows whose variety is exactly 'Rosé'.
top_10_words(
    df
    .loc[df['variety'].eq('Rosé'), 'description']
)

description
acidity       1135
fruit          696
crisp          669
fresh          622
strawberry     534
light          514
raspberry      509
cherry         469
fruity         428
fruits         419
Name: count, dtype: int64

In [9]:
# Show the 10 most common words for the 5 most common wine varieties.
# The descriptions of all five varieties are pooled into a single count.

top_10_words(
    df.loc[
        df['variety'].isin(df['variety'].value_counts().index[:5]),
        'description',
    ]
)

description
fruit       22784
tannins     15968
cherry      13974
acidity     12496
black       11219
cabernet     9433
spice        7890
sweet        7873
blend        7563
shows        7264
Name: count, dtype: int64