In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [9]:
# Open the file `alice-in-wonderland.txt`, and read it into a `pandas` series
# or data frame, such that each word is a separate value.
filename = '../data/alice-in-wonderland.txt'

s = Series(open(filename).read().split())
s.head()

0         ﻿The
1      Project
2    Gutenberg
3        EBook
4           of
dtype: object

In [10]:
s.value_counts().head(10)

the     732
and     362
a       321
to      311
of      300
in      211
she     197
was     160
said    129
it      122
Name: count, dtype: int64

In [11]:
(
    s
    .str
    .lower()
    .value_counts()
    .head(10)
)

the    792
and    379
a      325
to     318
of     313
she    232
in     222
was    160
you    141
it     136
Name: count, dtype: int64

In [6]:
# s = Series('this is a test 123 456'.split())
# # Getting the lengths of all strings in a series with .str.len()
# s.str.len()

0    4
1    2
2    1
3    4
4    3
5    3
dtype: int64

In [12]:
import string 

(
    s
    .str
    .strip(string.punctuation)
    .value_counts()
    .head(10)
)

the      735
and      384
a        322
to       320
of       303
in       214
she      201
was      167
Alice    166
it       164
Name: count, dtype: int64

In [14]:
# How many capitalized words does the book contain?

(
    s
    .loc[s.str.contains('^[A-Z]\w*$',
                        regex=True)]
    .count()
)

np.int64(1100)

In [15]:
words = {one_word.strip() for one_word in open('../data/words.txt')}


In [16]:
(
    s
    .str.strip(string.punctuation)      # Strip punctuation
    .loc[lambda s_: s_.str.isalpha()]   # Keep only those with letters
    .loc[lambda s_: ~s_.isin(words)]    # Now keep those *not* in the dictionary, and find the most common ones
    .value_counts()
)

Project      83
She          36
Rabbit       28
Queen        27
Gutenberg    27
             ..
reasons       1
knocked       1
curls         1
From          1
includes      1
Name: count, Length: 758, dtype: int64

In [17]:
# Open the file `winemag-150k-reviews.csv`, and read it into a data frame
filename = '../data/winemag-150k-reviews.csv'

df = pd.read_csv(filename,
                usecols=['country','province','description', 'variety'])
df.head()

Unnamed: 0,country,description,province,variety
0,US,This tremendous 100% varietal wine hails from ...,California,Cabernet Sauvignon
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Northern Spain,Tinta de Toro
2,US,Mac Watson honors the memory of a wine once ma...,California,Sauvignon Blanc
3,US,"This spent 20 months in 30% new French oak, an...",Oregon,Pinot Noir
4,France,"This is the top wine from La Bégude, named aft...",Provence,Provence red blend


In [18]:
# What are the 10 most common words containing 5 or more letters in the wine descriptions?
# Turn all words into lowercase, and remove all punctuation and symbols at the start or end of each word,
# for easier comparison.

# Also: remove the words flavors, aromas, finish, and drink.

def top_10_words(s):
    common_wine_words = ['flavors', 'aromas',
            'finish', 'drink', 'palate']
    
    words = (
        s
        .str.lower()
        .str.split()
        .explode()
        .str.strip(',$.?!$%')
    )

    return (
        words
        .loc[(words.str.len() >=5) &
             (~words.isin(common_wine_words))]
        .value_counts()
        .head(10)
    )

top_10_words(df['description'])
# How many wines are there per country?



description
fruit      56327
acidity    32536
tannins    32098
cherry     30639
black      24568
spice      22601
sweet      21243
notes      19581
fresh      17641
berry      17083
Name: count, dtype: int64

In [19]:
# What are the 10 most common words for non-California wines?
top_10_words(df.loc[df['province'] != 'California', 'description'])

description
fruit      46371
acidity    22270
tannins    21929
cherry     19440
spice      18522
black      17758
notes      16569
fresh      16200
berry      15478
sweet      12708
Name: count, dtype: int64

In [20]:
# What are the 10 most common words for French wines?
top_10_words(df.loc[df['country'] == 'France', 'description'])

description
fruit        8688
acidity      8632
tannins      6491
fruits       5449
fresh        4213
character    3494
black        3119
texture      3069
years        2880
crisp        2875
Name: count, dtype: int64

In [21]:
# What are the 10 most common words for white wines?

top_10_words(
    df
    .loc[df['variety']
         .isin(['Chardonnay', 
                'Sauvignon Blanc', 
                'Riesling']), 
    'description']
)

description
fruit         9133
acidity       8346
apple         5879
citrus        5368
crisp         4903
chardonnay    4871
green         4177
notes         4021
sweet         3850
pineapple     3847
Name: count, dtype: int64