In [1]:
import pandas as pd
import numpy as np
import re
from ast import literal_eval
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()
%matplotlib notebook

In [2]:
# Base path of the dataset, without the file extension.
file = '../data/spain-red'

# 'foods' and 'highlights' are stored as Python-literal text in the CSV;
# literal_eval turns each cell back into the Python object it spells out.
literal_columns = ('foods', 'highlights')
cell_parsers = {column: literal_eval for column in literal_columns}
df = pd.read_csv(f'{file}.csv', converters=cell_parsers)

In [3]:
# --- Text clean-up ----------------------------------------------------------
# The CSV still contains numeric HTML character references. Map every reference
# handled by this notebook back to the character it encodes (one entry per key).
# NOTE(review): html.unescape from the standard library would decode any
# reference, not just the ones listed here — consider switching if new ones appear.
diacritics = {
    '&#186;': 'º', '&#192;': 'À', '&#193;': 'Á', '&#210;': 'Ò',
    '&#224;': 'à', '&#225;': 'á', '&#231;': 'ç', '&#232;': 'è',
    '&#233;': 'é', '&#237;': 'í', '&#239;': 'ï', '&#241;': 'ñ',
    '&#243;': 'ó', '&#244;': 'ô', '&#250;': 'ú', '&#252;': 'ü',
    '&#960;': 'π', '&#8217;': '’', '&#8364;': '€',
}
# Substring replacement in every string cell; assignment instead of inplace=True
# keeps the step explicit.
df = df.replace(diacritics, regex=True)

# Insert a space in front of a capital A-Z that starts a new word inside a run of
# glued-together words: either it follows a lowercase letter, or it follows any
# non-space character and is itself followed by a lowercase letter.
# Requiring a non-space character before the capital (instead of merely "not at
# the start of the string") avoids adding a second space where one already
# exists, so re-applying the rule leaves the text unchanged.
# Lowercase connecting words are not detected, which is why values such as
# 'Riberadel Duero' remain in the outputs further down.
camel_boundary = re.compile(r'((?<=[a-z_à-ÿ])[A-Z]|(?<=\S)[A-Z](?=[a-z_à-ÿ]))')

for column in ['winery', 'vintage', 'region']:
    df[column] = df[column].apply(lambda text: camel_boundary.sub(r' \1', text))


def _tidy_vintage(text):
    """Normalise spacing around parentheses and numbers in a vintage name.

    Parameters
    ----------
    text : str
        Vintage name after camel-case splitting.

    Returns
    -------
    str
        The name with a space before each opening parenthesis, no padding
        inside parentheses, numbers separated from neighbouring text, and
        every run of whitespace collapsed to a single space.
    """
    text = re.sub(r'(\S)\(', r'\1 (', text)               # "Name(x)"  -> "Name (x)"
    text = re.sub(r'\(\s*(.*?)\s*\)', r'(\1)', text)      # "( x )"    -> "(x)"
    text = re.sub(r'([0-9]+(\.[0-9]+)?)', r' \1 ', text)  # pad integers / decimals
    return re.sub(r'\s+', ' ', text.strip())


df['vintage'] = df['vintage'].apply(_tidy_vintage)

# --- Numeric columns --------------------------------------------------------
numeric_columns = ['year', 'rating', 'ratings_count', 'country_rank', 'region_rank',
                   'winery_rank', 'global_rank', 'alcohol', 'body', 'acidity', 'price']
# errors='coerce' turns values that cannot be parsed as numbers into NaN.
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# --- Unused columns ---------------------------------------------------------
# errors='ignore' lets the cell be re-run after the columns are already gone.
df = df.drop(columns=['drink_from', 'drink_until'], errors='ignore')

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,rating,ratings_count,country_rank,region_rank,winery_rank,global_rank,alcohol,body,acidity,price
count,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,1656.0,1900.0,1900.0,2013.0
mean,4.147992,744.478433,3.93059,5.692117,15.629152,4.140803,14.081582,4.271579,2.963158,87.068867
std,0.24401,1714.88318,3.456955,5.452801,12.844511,3.42667,1.634316,0.507897,0.188424,234.235746
min,3.7,25.0,1.0,1.0,1.0,1.0,0.0,3.0,2.0,4.9
25%,3.9,86.0,1.0,2.0,6.0,1.0,14.0,4.0,3.0,18.95
50%,4.1,227.0,3.0,4.0,12.0,3.0,14.5,4.0,3.0,31.13
75%,4.3,659.0,5.0,8.0,22.0,6.0,14.5,5.0,3.0,61.95
max,4.9,32378.0,22.0,38.0,81.0,20.0,16.5,5.0,3.0,3404.94


In [5]:
# Best-value wines: priced below 100, rated above the median rating (4.1) and
# with more ratings than the median ratings count (227).
# They make up 15.8% (318) of the 2017 wines in the dataset.

is_affordable = df['price'] < 100
is_highly_rated = df['rating'] > 4.1
is_widely_rated = df['ratings_count'] > 227

best_value = df[is_affordable & is_highly_rated & is_widely_rated]
best_value.describe()

Unnamed: 0,rating,ratings_count,country_rank,region_rank,winery_rank,global_rank,alcohol,body,acidity,price
count,318.0,318.0,318.0,318.0,318.0,318.0,296.0,302.0,302.0,318.0
mean,4.307233,1337.666667,1.081761,2.097484,11.789308,1.279874,14.137162,4.334437,2.966887,49.219277
std,0.106468,1828.049177,0.274432,1.384887,9.8872,0.449645,1.944424,0.479555,0.179228,21.600254
min,4.2,228.0,1.0,1.0,1.0,1.0,0.0,3.0,2.0,7.3
25%,4.2,379.5,1.0,1.0,4.0,1.0,14.0,4.0,3.0,31.6875
50%,4.3,653.0,1.0,2.0,9.0,1.0,14.5,4.0,3.0,44.98
75%,4.4,1339.25,1.0,3.0,16.75,2.0,14.5,5.0,3.0,63.5
max,4.7,16395.0,2.0,6.0,46.0,2.0,16.5,5.0,3.0,99.5


In [6]:
# Best-value wines with an alcohol content of at least 13.5 (rows with missing
# alcohol fail the comparison and are left out).
has_high_alcohol = best_value['alcohol'] >= 13.5
full_bodied = best_value.loc[has_high_alcohol]
full_bodied.describe()

Unnamed: 0,rating,ratings_count,country_rank,region_rank,winery_rank,global_rank,alcohol,body,acidity,price
count,279.0,279.0,279.0,279.0,279.0,279.0,279.0,264.0,264.0,279.0
mean,4.307527,1405.139785,1.0681,2.064516,11.691756,1.27957,14.449104,4.344697,2.965909,48.770358
std,0.108525,1875.240836,0.252371,1.350245,9.890678,0.449594,0.484052,0.484091,0.181807,21.620336
min,4.2,231.0,1.0,1.0,1.0,1.0,13.5,3.0,2.0,7.3
25%,4.2,395.5,1.0,1.0,4.0,1.0,14.0,4.0,3.0,31.45
50%,4.3,695.0,1.0,2.0,9.0,1.0,14.5,4.0,3.0,44.98
75%,4.4,1501.5,1.0,3.0,16.0,2.0,14.5,5.0,3.0,62.725
max,4.7,16395.0,2.0,6.0,46.0,2.0,16.5,5.0,3.0,99.5


In [7]:
# 88% of best-value wines (279 of 318) are full-bodied.
# Here, any red wine with at least 13.5 percent alcohol is considered a full-bodied wine.
# Full-bodied wines have more complex flavors and a richer mouthfeel.

In [8]:
# Distribution of best-value prices over fixed price bands.
bv_prices = best_value['price']
# Band edges; pd.cut builds right-closed intervals by default: (7, 20], (20, 30], ...
bins = [7, 20, 30, 40, 50, 60, 70, 80, 90, 100]
cats = pd.cut(bv_prices, bins)
# Series.value_counts replaces the deprecated top-level pd.value_counts helper;
# the result is the same: number of wines per band, most populated band first.
cats.value_counts()

(30, 40]     63
(40, 50]     61
(20, 30]     57
(50, 60]     39
(60, 70]     24
(70, 80]     24
(90, 100]    20
(80, 90]     19
(7, 20]      11
Name: price, dtype: int64

In [9]:
# Best-value wines in the most populated price range, 30 to 60 with both ends included.
in_popular_range = best_value['price'].between(30, 60)
pop_prices = best_value[in_popular_range]
pop_prices.describe()

Unnamed: 0,rating,ratings_count,country_rank,region_rank,winery_rank,global_rank,alcohol,body,acidity,price
count,163.0,163.0,163.0,163.0,163.0,163.0,155.0,156.0,156.0,163.0
mean,4.306748,1450.331288,1.09816,2.09816,12.601227,1.245399,13.930968,4.391026,2.967949,44.091104
std,0.093709,1781.914916,0.298447,1.370837,10.959262,0.431649,2.348765,0.489552,0.176704,8.848998
min,4.2,228.0,1.0,1.0,1.0,1.0,0.0,4.0,2.0,30.43
25%,4.2,389.0,1.0,1.0,4.0,1.0,14.0,4.0,3.0,36.3
50%,4.3,680.0,1.0,2.0,9.0,1.0,14.5,4.0,3.0,44.41
75%,4.4,1698.5,1.0,3.0,17.0,1.0,14.5,5.0,3.0,49.925
max,4.6,10267.0,2.0,6.0,46.0,2.0,16.0,5.0,3.0,59.95


In [10]:
# 51% of best-value wines (163 of 318) cost between 30 and 60 euros.

In [11]:
# Number of best-value wines per winery, most frequent first; show the top five.
bv_wineries = best_value.loc[:, 'winery'].value_counts()
bv_wineries.head(5)

Álvaro Palacios     18
La Rioja Alta       13
Clos Mogador        10
Remírezde Ganuza    10
Emilio Moro          9
Name: winery, dtype: int64

In [12]:
# Number of best-value wines per vintage name, most frequent first; show the top five.
bv_vintages = best_value.loc[:, 'vintage'].value_counts()
bv_vintages.head(5)

Rioja Reserva      16
Priorat            13
Finca Dofí         11
Riberadel Duero    10
Tinto               9
Name: vintage, dtype: int64

In [13]:
# Number of best-value wines per year, most frequent first; show the top five.
bv_years = best_value.loc[:, 'year'].value_counts()
bv_years.head(5)

2017    53
2016    52
2015    40
2018    37
2014    29
Name: year, dtype: int64

In [14]:
# Number of best-value wines per region, most frequent first; show the top five.
bv_regions = best_value.loc[:, 'region'].value_counts()
bv_regions.head(5)

Rioja              112
Riberadel Duero     80
Priorat             51
Toro                18
Castillay León       9
Name: region, dtype: int64

In [15]:
# 35% of best-value wines (112 of 318) are from Rioja.

In [16]:
# Sort every food-pairing list in place so that the same foods listed in a
# different order count as one combination. The lists are mutated directly,
# exactly as before, so anything that reads df['foods'] later still sees them sorted.
for pairing in df['foods']:
    pairing.sort()

# Lists are unhashable, so counting them directly is not reliable across pandas
# versions. Count a hashable key instead: the sorted foods joined into one string.
# Sorting again inside the key keeps the grouping correct even if best_value
# does not hold the very same list objects as df.
bv_foods = (
    best_value['foods']
    .apply(lambda pairing: ', '.join(sorted(pairing)))
    .value_counts()
)
bv_foods.head()

[Beef, Lamb, Poultry, Veal]                    126
[Beef, Game (deer, venison), Lamb]              98
[Beef, Game (deer, venison), Lamb, Poultry]     57
[Beef, Pasta, Poultry, Veal]                    24
[Beef, Lamb, Poultry]                            4
Name: foods, dtype: int64

In [17]:
# 40% of best-value wines (126 of 318) are paired with "Beef, Lamb, Poultry, Veal".

In [18]:
# Number of best-value wines per wine style, most frequent first; show the top five.
bv_styles = best_value.loc[:, 'style'].value_counts()
bv_styles.head(5)

Spanish Rioja Red               116
Spanish Ribera Del Duero Red     80
Spanish Priorat Red              51
Spanish Red                      24
Spanish Toro Red                 18
Name: style, dtype: int64

In [19]:
# Number of best-value wines per acidity value (rows with missing acidity are
# not counted); the full table is shown because it is short.
bv_acidity = best_value.loc[:, 'acidity'].value_counts()
bv_acidity

3.0    292
2.0     10
Name: acidity, dtype: int64

In [20]:
# 91.8% of best-value wines (292 of 318) have an acidity of 3.

# Typically, the pH level of a wine ranges from 3 to 4. 
# Red wines with higher acidity are more likely to be a bright ruby color, 
# as the lower pH gives them a red hue. 
# Higher pH, less-acidic red wines can take on a blue or purple hue.

In [21]:
# Number of best-value wines per grape name, most frequent first (full table).
# NOTE(review): several names look like regional synonyms of the same grape
# (e.g. Garnacha / Grenache; Tinto Fino, Tinta del Pais, Tinta de toro vs.
# Tempranillo) — confirm before relying on per-grape shares.
bv_grapes = best_value.loc[:, 'grapes'].value_counts()
bv_grapes

Tempranillo           167
Garnacha               27
Shiraz/Syrah           24
Tinto Fino             17
Cariñena               12
Merlot                 10
Tinta de toro           9
Cabernet Sauvignon      8
Tinta del Pais          6
Grenache                4
Monastrell              3
Pinot Noir              2
Mourvedre               2
Graciano                2
Bobal                   2
Callet                  1
Garnacha Tintorera      1
Mencia                  1
Mazuelo                 1
Name: grapes, dtype: int64

In [22]:
# 52.5% of best-value wines (167 of 318) are made from Tempranillo grapes.