In [1]:
import pandas as pd
import numpy as np
import re
from ast import literal_eval
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()
%matplotlib notebook

In [2]:
file = '../data/spain-red'
df = pd.read_csv(file + '.csv', converters={'foods': literal_eval, 'highlights': literal_eval})

In [3]:
diacritics = {'&#237;': 'í', '&#243;': 'ó', '&#250;': 'ú', '&#241;': 'ñ', '&#232;': 'è', \
    '&#8364;': '€', '&#193;': 'Á', '&#192;': 'À', '&#233;': 'é', '&#224;': 'à', \
    '&#239;': 'ï', '&#231;': 'ç', '&#193;': 'Á', '&#252;': 'ü', '&#225;': 'á', '&#186;': 'º', \
    '&#244;': 'ô', '&#8217;': '’', '&#960;': 'π', '&#210;': 'Ò'}
df.replace(diacritics, regex=True, inplace=True)
df['winery'] = df['winery'].apply(lambda x: re.sub(r'((?<=[a-z_à-ÿ])[A-Z]|(?<!\A)[A-Z](?=[a-z_à-ÿ]))', r' \1', x))
df['vintage'] = df['vintage'].apply(lambda x: re.sub(r'((?<=[a-z_à-ÿ])[A-Z]|(?<!\A)[A-Z](?=[a-z_à-ÿ]))', r' \1', x))
df['region'] = df['region'].apply(lambda x: re.sub(r'((?<=[a-z_à-ÿ])[A-Z]|(?<!\A)[A-Z](?=[a-z_à-ÿ]))', r' \1', x))
df['vintage'] = df['vintage'].apply(lambda x: re.sub(r"(\S)\(", r'\1 (', x))
df['vintage'] = df['vintage'].apply(lambda x: re.sub('\(\s*(.*?)\s*\)', r'(\1)', x))
df['vintage'] = df['vintage'].apply(lambda x: re.sub(r"([0-9]+(\.[0-9]+)?)",r" \1 ", x).strip())
df['vintage'] = df['vintage'].replace(r'\s+', ' ', regex=True)

numeric_columns = ['year', 'rating', 'ratings_count', 'country_rank', 'region_rank', \
                   'winery_rank', 'global_rank', 'alcohol', 'body', 'acidity', 'price']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

df = df.drop(['drink_from', 'drink_until'], axis = 1)

In [4]:
plot_color = '#ad1a33'

In [5]:
highest_prices = df.sort_values(by='price', ascending=False)[['winery', 'vintage', 'year', 'price']].head(10)
highest_prices

Unnamed: 0,winery,vintage,year,price
542,Clos Erasmus,Priorat,2003.0,3404.94
710,Dominiode Pingus,Pingus,2004.0,3392.24
656,Descendientesde J. Palacios,La Faraona Bierzo (Corullón),2010.0,3119.08
712,Dominiode Pingus,Pingus,2012.0,2693.46
706,Dominiode Pingus,Pingus,2014.0,2655.35
709,Dominiode Pingus,Pingus,2001.0,1842.23
707,Dominiode Pingus,Pingus,1996.0,1746.94
714,Dominiode Pingus,Pingus,1999.0,1715.18
715,Dominiode Pingus,Pingus,2000.0,1715.18
717,Dominiode Pingus,Pingus,2015.0,1683.41


In [6]:
lowest_prices = df.sort_values(by='price', ascending=True)[['winery', 'vintage', 'year', 'price']].head(10)
lowest_prices

Unnamed: 0,winery,vintage,year,price
818,Más Que Vinos,Ercavio Tempranillo Roble,2015.0,4.9
334,Pinord,Closde Torribas Crianza Tempranillo,2017.0,5.39
1998,Rio Lindo,Syrah,2020.0,5.95
389,Caminode Seda,Selección Especial Jumilla,2020.0,5.99
1158,La Sastrería,Garnacha Roble,2020.0,6.45
340,Bodegas Piqueras,Castillode Almansa Colección Garnacha Tintorera,2020.0,6.55
591,Condadode Oriza,Tempranillo Riberadel Duero,2019.0,6.95
1838,Torre Oria,Casablanca Viñas Viejas Monastrell,2019.0,6.95
339,Bodegas Piqueras,Black Label Syrah- Monastrell,2019.0,6.95
874,EGO,Gorú,2019.0,6.99


In [7]:
highest_ranking = df[df['rating'] >= 4.8][['winery', 'vintage', 'year', 'rating']]
highest_ranking.sort_values(by='rating', ascending=False)

Unnamed: 0,winery,vintage,year,rating
166,Artadi,Viña El Pison,2018.0,4.9
88,Álvaro Palacios,L' Ermita Velles Vinyes Priorat,2017.0,4.8
1811,Teso La Monja,Tinto,2012.0,4.8
1894,Vega Sicilia,Unico Reserva Especial Edición,2005.0,4.8
1893,Vega Sicilia,Unico Reserva Especial Edición,2020.0,4.8
1891,Vega Sicilia,Unico Reserva Especial Edición,2015.0,4.8
1889,Vega Sicilia,Unico Reserva Especial Edición,2016.0,4.8
1888,Vega Sicilia,Unico,2010.0,4.8
1886,Vega Sicilia,Unico,1996.0,4.8
1884,Vega Sicilia,Unico,1995.0,4.8


In [8]:
lowest_ranking = df[df['rating'] < 3.85][['winery', 'vintage', 'year', 'rating']]
lowest_ranking.sort_values(by='rating', ascending=True)

Unnamed: 0,winery,vintage,year,rating
60,Altos Ibéricos,Reserva,2015.0,3.7
34,Alfredo Arribas,Gotesdel Priorat,2017.0,3.8
1292,Martin Berdugo,Crianza,2015.0,3.8
1295,Martinez Lacuesta,Rioja Crianza,2016.0,3.8
1296,Mas Asturias,Massuria,2012.0,3.8
...,...,...,...,...
675,Vivanco,Crianza Rioja,2017.0,3.8
687,Dominiode Atauta,Paradade Atauta,2016.0,3.8
751,El Coto,Cotode Imaz Rioja Reserva,2015.0,3.8
753,El Coto,Cotode Imaz Rioja Reserva,2017.0,3.8


In [9]:
highest_alcohol = df[df['alcohol'] >= 16][['winery', 'vintage', 'year', 'alcohol']]
highest_alcohol.sort_values(by='alcohol', ascending=False)

Unnamed: 0,winery,vintage,year,alcohol
1118,Juan Gil,Jumilla Blue Label,2018.0,16.5
54,Alto Moncayo,Aquilon Garnacha,2015.0,16.0
55,Alto Moncayo,Garnacha,2017.0,16.0
737,Dominiodel Bendito,Las Sabias,2017.0,16.0
940,Espectacledel Montsant,Tinto,2016.0,16.0
1417,Olivares,Dulce Monastrell,2016.0,16.0
1581,Quintadela Quietud,La Muladela Quietud,2015.0,16.0


In [10]:
lowest_alcohol = df[((df['alcohol'] <= 12.5) & (df['alcohol'] != 0))][['winery', 'vintage', 'year', 'alcohol']]
lowest_alcohol.sort_values(by='alcohol', ascending=True)

Unnamed: 0,winery,vintage,year,alcohol
1020,4 Kilos Vinícola,4 Kilos,2019.0,12.0
971,Fedellosdo Couto,Bastarda,2019.0,12.0
1413,Bodegas Olarra,Cerro Añon Crianza Rioja,2017.0,12.0
451,Castillo Clavijo,Rioja Gran Reserva,2011.0,12.0
1599,R. Lópezde Heredia,Viña Tondonia Reserva,1973.0,12.0
1152,La Rioja Alta,Viña Arana Reserva,2012.0,12.0
1021,4 Kilos Vinícola,4 Kilos,2017.0,12.0
1733,Envínate,Táganan Tinto,2017.0,12.0
1601,R. Lópezde Heredia,Viña Tondonia Reserva,2009.0,12.5
1626,Raúl Pérez,Ultreia Mencía,2017.0,12.5


In [11]:
top_global = df[df['global_rank'] <= 1][['winery', 'vintage', 'year', 'global_rank']]
top_global.sort_values(by='winery', ascending=False)

Unnamed: 0,winery,vintage,year,global_rank
823,Áster,Finca El Otero,2016.0,1
95,Álvaro Palacios,L' Ermita Velles Vinyes Priorat,2018.0,1
87,Álvaro Palacios,L' Ermita Velles Vinyes Priorat,1995.0,1
70,Álvaro Palacios,Finca Dofí,2010.0,1
71,Álvaro Palacios,Finca Dofí,2007.0,1
...,...,...,...,...
4,Aalto,PS (Pagos Seleccionados) Riberadel Duero,2010.0,1
3,Aalto,PS (Pagos Seleccionados) Riberadel Duero,2009.0,1
1,Aalto,PS (Pagos Seleccionados) Riberadel Duero,2019.0,1
2,Aalto,PS (Pagos Seleccionados) Riberadel Duero,2018.0,1


In [12]:
top_country = df[df['country_rank'] <= 1][['winery', 'vintage', 'year', 'country_rank']]
top_country.sort_values(by='winery', ascending=False)

Unnamed: 0,winery,vintage,year,country_rank
823,Áster,Finca El Otero,2016.0,1
85,Álvaro Palacios,L' Ermita Velles Vinyes Priorat,2015.0,1
93,Álvaro Palacios,L' Ermita Velles Vinyes Priorat,1998.0,1
77,Álvaro Palacios,Finca Dofí,2015.0,1
79,Álvaro Palacios,Finca Dofí,2017.0,1
...,...,...,...,...
1,Aalto,PS (Pagos Seleccionados) Riberadel Duero,2019.0,1
4,Aalto,PS (Pagos Seleccionados) Riberadel Duero,2010.0,1
3,Aalto,PS (Pagos Seleccionados) Riberadel Duero,2009.0,1
2,Aalto,PS (Pagos Seleccionados) Riberadel Duero,2018.0,1


In [13]:
top_region = df[df['region_rank'] <= 1][['winery', 'vintage', 'year', 'region_rank']]
top_region.sort_values(by='winery', ascending=False)

Unnamed: 0,winery,vintage,year,region_rank
93,Álvaro Palacios,L' Ermita Velles Vinyes Priorat,1998.0,1
92,Álvaro Palacios,L' Ermita Velles Vinyes Priorat,2004.0,1
85,Álvaro Palacios,L' Ermita Velles Vinyes Priorat,2015.0,1
86,Álvaro Palacios,L' Ermita Velles Vinyes Priorat,2005.0,1
87,Álvaro Palacios,L' Ermita Velles Vinyes Priorat,1995.0,1
...,...,...,...,...
1,Aalto,PS (Pagos Seleccionados) Riberadel Duero,2019.0,1
3,Aalto,PS (Pagos Seleccionados) Riberadel Duero,2009.0,1
4,Aalto,PS (Pagos Seleccionados) Riberadel Duero,2010.0,1
7,Aalto,Tinto,2018.0,1


In [14]:
oldest_wines = df[df['year'] < 1980][['winery', 'vintage', 'year']]
oldest_wines.sort_values(by='year', ascending=True)

Unnamed: 0,winery,vintage,year
287,Bodegas Faustino,I Gran Reserva,1964.0
592,Condede Los Andes,Rioja Gran Reserva,1964.0
1604,R. Lópezde Heredia,Viña Tondonia Reserva,1964.0
1871,Vega Sicilia,Unico,1965.0
1876,Vega Sicilia,Unico,1967.0
304,Bodegas Franco- Españolas,Royal,1970.0
1264,Marquésde Murrieta,Castillo Ygay Gran Reserva Especial Tinto,1970.0
1535,Viña Pomal,Reserva,1970.0
1874,Vega Sicilia,Unico,1972.0
1599,R. Lópezde Heredia,Viña Tondonia Reserva,1973.0


In [19]:
h_prices = df[(df['price'].notna())].sort_values(by='price', ascending=False).head(100)
h_prices.describe()

Unnamed: 0,year,rating,ratings_count,country_rank,region_rank,winery_rank,global_rank,alcohol,body,acidity,price
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,86.0,96.0,96.0,100.0
mean,2004.91,4.628,421.41,1.17,1.83,11.74,1.17,14.097674,4.489583,2.916667,868.7033
std,11.942704,0.142191,753.686309,1.119659,3.357654,10.064057,1.119659,0.614908,0.502516,0.277836,651.925496
min,1965.0,4.1,25.0,1.0,1.0,1.0,1.0,12.5,4.0,2.0,288.15
25%,1999.0,4.6,66.75,1.0,1.0,4.0,1.0,13.625,4.0,3.0,443.29
50%,2007.5,4.6,129.5,1.0,1.0,10.0,1.0,14.0,4.0,3.0,643.125
75%,2014.0,4.7,317.75,1.0,1.0,17.0,1.0,14.5,5.0,3.0,1040.6
max,2020.0,4.8,3442.0,12.0,32.0,60.0,12.0,15.4,5.0,3.0,3404.94


In [21]:
l_prices = df[(df['price'].notna())].sort_values(by='price', ascending=True).head(100)
l_prices.describe()

Unnamed: 0,year,rating,ratings_count,country_rank,region_rank,winery_rank,global_rank,alcohol,body,acidity,price
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,77.0,96.0,96.0,100.0
mean,2018.05,3.915,859.17,6.75,6.96,13.57,7.14,14.051948,4.333333,2.9375,8.9082
std,1.59782,0.111351,2205.768956,3.726441,5.082988,10.958374,3.428991,0.571151,0.536395,0.243332,1.229041
min,2013.0,3.8,26.0,1.0,1.0,1.0,2.0,12.0,3.0,2.0,4.9
25%,2017.0,3.8,115.0,5.0,3.0,5.0,5.0,14.0,4.0,3.0,8.1875
50%,2018.0,3.9,272.0,6.0,6.0,10.0,6.0,14.0,4.0,3.0,8.95
75%,2019.0,3.9,646.25,7.0,9.0,19.25,8.0,14.5,5.0,3.0,9.95
max,2020.0,4.3,19810.0,21.0,23.0,46.0,20.0,15.5,5.0,3.0,10.75


In [None]:
# Top-100 wines with the highest prices have a median rating of 4.6.
# Top-100 wines with the lowest prices have a median rating of 3.9.

# Top-100 wines with the highest prices are older: 2007 is the meadian year. 
# Top-100 wines with the lowest prices are younger: 2018 is the meadian year. 
# (Although the overall year/rating correlation is weak (-0.3).)

In [23]:
h_alcohol = df[(df['alcohol'].notna())].sort_values(by='alcohol', ascending=False).head(100)
h_alcohol.describe()

Unnamed: 0,year,rating,ratings_count,country_rank,region_rank,winery_rank,global_rank,alcohol,body,acidity,price
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,93.0,93.0,100.0
mean,2015.66,4.206,529.09,3.25,4.92,18.53,3.42,15.282,4.473118,2.989247,68.0752
std,2.843458,0.238607,821.073664,2.979306,4.854822,16.649358,2.978645,0.325167,0.501983,0.103695,164.549668
min,2003.0,3.8,26.0,1.0,1.0,1.0,1.0,15.0,4.0,2.0,9.5
25%,2015.0,4.0,96.25,1.0,1.75,5.0,1.0,15.0,4.0,3.0,22.95
50%,2016.0,4.2,198.5,2.0,3.0,13.5,2.0,15.05,4.0,3.0,35.72
75%,2017.0,4.4,708.25,4.25,7.25,29.25,5.0,15.5,5.0,3.0,68.49
max,2020.0,4.8,5294.0,15.0,33.0,67.0,15.0,16.5,5.0,3.0,1638.95


In [24]:
l_alcohol = df[(df['alcohol'].notna())].sort_values(by='alcohol', ascending=True).head(100)
l_alcohol.describe()

Unnamed: 0,year,rating,ratings_count,country_rank,region_rank,winery_rank,global_rank,alcohol,body,acidity,price
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,93.0,93.0,100.0
mean,2012.27,4.122,901.65,4.27,5.15,15.19,4.49,10.32,4.021505,2.946237,101.4598
std,8.539693,0.25409,1804.565514,3.923138,5.015884,11.500281,3.844174,5.03248,0.488531,0.226773,224.042479
min,1964.0,3.8,26.0,1.0,1.0,1.0,1.0,0.0,3.0,2.0,5.39
25%,2010.0,3.9,112.0,1.0,2.0,6.0,1.0,12.0,4.0,3.0,19.2475
50%,2015.0,4.0,234.5,4.0,4.0,13.0,4.0,12.5,4.0,3.0,39.99
75%,2017.0,4.3,761.75,5.0,6.25,21.25,6.0,13.0,4.0,3.0,59.6125
max,2020.0,4.8,10682.0,19.0,27.0,49.0,18.0,13.0,5.0,3.0,1715.18


In [25]:
h_year = df[(df['year'].notna())].sort_values(by='year', ascending=False).head(100)
h_year.describe()

Unnamed: 0,year,rating,ratings_count,country_rank,region_rank,winery_rank,global_rank,alcohol,body,acidity,price
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,84.0,94.0,94.0,100.0
mean,2019.48,4.025,416.54,5.8,7.61,21.61,6.02,14.055952,4.457447,2.989362,27.6782
std,0.502117,0.206645,667.963019,3.805897,5.848068,14.544877,3.665234,1.661713,0.580413,0.103142,58.053512
min,2019.0,3.8,28.0,1.0,1.0,1.0,1.0,0.0,3.0,2.0,5.95
25%,2019.0,3.9,70.75,3.0,3.0,10.0,4.0,14.0,4.0,3.0,9.7425
50%,2019.0,4.0,156.5,5.0,6.0,21.0,5.5,14.0,4.5,3.0,15.1
75%,2020.0,4.1,538.75,7.0,11.0,31.0,8.0,14.5,5.0,3.0,24.6375
max,2020.0,4.8,4922.0,21.0,29.0,63.0,20.0,15.5,5.0,3.0,547.33


In [26]:
l_year = df[(df['year'].notna())].sort_values(by='year', ascending=True).head(100)
l_year.describe()

Unnamed: 0,year,rating,ratings_count,country_rank,region_rank,winery_rank,global_rank,alcohol,body,acidity,price
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,49.0,96.0,96.0,100.0
mean,1993.28,4.386,235.51,3.11,5.44,18.28,3.15,13.6,4.291667,2.96875,395.5232
std,11.056403,0.240378,435.070771,3.752158,7.381823,14.416657,3.758828,0.694022,0.4794,0.174906,535.368848
min,1964.0,3.9,25.0,1.0,1.0,1.0,1.0,12.0,3.0,2.0,14.5
25%,1989.0,4.2,48.5,1.0,1.0,6.75,1.0,13.5,4.0,3.0,66.725
50%,1997.0,4.4,109.0,1.0,2.0,15.5,1.0,13.5,4.0,3.0,197.44
75%,2001.0,4.6,187.75,3.0,7.0,28.0,3.0,14.0,5.0,3.0,443.29
max,2004.0,4.8,2655.0,17.0,34.0,60.0,17.0,15.4,5.0,3.0,3404.94


In [None]:
# Top-100 oldest wines have a median price of 197.44.
# Top-100 youngest wines have a median price of 15.10.
# (Although the overall year/price correlation is weak (-0.3).)

In [32]:
h_rating = df[(df['rating'].notna())].sort_values(by='rating', ascending=False).head(100)
h_rating.describe()

Unnamed: 0,year,rating,ratings_count,country_rank,region_rank,winery_rank,global_rank,alcohol,body,acidity,price
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,87.0,98.0,98.0,100.0
mean,2008.73,4.691,461.88,1.01,1.24,8.58,1.01,14.04023,4.510204,2.908163,562.9597
std,10.212742,0.076667,834.758675,0.1,0.740188,7.803366,0.1,1.644615,0.522581,0.29028,589.477136
min,1972.0,4.6,25.0,1.0,1.0,1.0,1.0,0.0,3.0,2.0,24.95
25%,2005.0,4.6,56.75,1.0,1.0,2.0,1.0,14.0,4.0,3.0,175.5
50%,2012.0,4.7,112.5,1.0,1.0,6.5,1.0,14.5,5.0,3.0,440.145
75%,2016.0,4.7,300.75,1.0,1.0,12.25,1.0,14.5,5.0,3.0,696.1725
max,2020.0,4.9,4335.0,2.0,5.0,33.0,2.0,16.0,5.0,3.0,3404.94


In [33]:
l_rating = df[(df['rating'].notna())].sort_values(by='rating', ascending=True).head(100)
l_rating.describe()

Unnamed: 0,year,rating,ratings_count,country_rank,region_rank,winery_rank,global_rank,alcohol,body,acidity,price
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,93.0,99.0,99.0,100.0
mean,2016.94,3.799,1051.06,7.49,10.11,22.24,8.19,14.11828,4.181818,2.939394,15.4433
std,1.722225,0.01,1443.017264,1.74943,5.004836,14.067026,1.481843,0.543728,0.522233,0.239821,6.539975
min,2011.0,3.7,31.0,7.0,3.0,1.0,7.0,12.5,3.0,2.0,4.9
25%,2016.0,3.8,371.5,7.0,7.0,12.0,8.0,14.0,4.0,3.0,11.74
50%,2017.0,3.8,624.0,7.0,9.0,19.0,8.0,14.0,4.0,3.0,14.305
75%,2018.0,3.8,1109.5,7.25,13.0,28.25,8.0,14.5,4.0,3.0,16.95
max,2020.0,3.8,10396.0,21.0,24.0,59.0,20.0,15.0,5.0,3.0,50.31


In [None]:
# Top-10 wines with the highest ratings have a median price of 440.14.
# Top-10 wines with the lowest ratings have a median price of 14.30.