# Load Libraries and Set Up Plotting Environment

In [1]:
# data loading and manipulation
import os
import requests

import re
import pandas as pd
import numpy as np

In [2]:
# plotting
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.ticker import FuncFormatter
from matplotlib.colors import ListedColormap
%matplotlib inline

# figure aesthetics: seaborn 'whitegrid' style with fonts scaled by 1.5
sns.set(font_scale=1.5, style='whitegrid')

# Load in and Clean Data

## Attempt 1: Global Wine Score API

In [3]:
global_wine_score_path = '../data/global_wine_score.csv'
global_wine_score_df = None

# Use the locally saved CSV when it exists; otherwise page through the API
# and save the combined result to that same path for later runs.
if os.path.isfile(global_wine_score_path):
    print('Data loaded from file {}'.format(global_wine_score_path))
    global_wine_score_df = pd.read_csv(global_wine_score_path)
else:
    wine_list_json = []

    api_url = 'https://api.globalwinescore.com/globalwinescores/latest/'
    limit = 10000
    offset = 0
    has_next = True

    # Read the token once, before any request is sent; a missing
    # GLOBAL_WINE_SCORE_API environment variable raises KeyError here.
    auth_headers = {'Authorization': 'Token {}'.format(os.environ['GLOBAL_WINE_SCORE_API'])}

    while has_next:
        # fetch one page of data from the global winescore api
        print('Fetching records {} to {} from {}?limit={}&offset={}'.format(offset, offset + limit, api_url, limit, offset))
        response = requests.get(api_url,
                                params={'limit': limit, 'offset': offset},
                                headers=auth_headers,
                                timeout=60)  # never hang forever on a stalled connection

        # Fail loudly on anything but HTTP 200. Silently skipping a failed
        # page left offset/has_next untouched, so the loop repeated the very
        # same request forever.
        response.raise_for_status()
        if response.status_code != 200:
            raise RuntimeError('Unexpected HTTP status {} from {} at offset {}'.format(
                response.status_code, api_url, offset))

        # parse json from response and add its records to the running list
        response_json = response.json()
        wine_list_json.extend(response_json['results'])

        # keep paging for as long as the API reports a next page
        has_next = response_json['next'] is not None
        if has_next:
            offset += limit

    # one record (dict) per row, keys become columns
    global_wine_score_df = pd.DataFrame(wine_list_json)
    global_wine_score_df.to_csv(global_wine_score_path, index=False)

# show loaded dataframe
global_wine_score_df.head()

Data loaded from file ../data/global_wine_score.csv


Unnamed: 0,appellation,appellation_slug,classification,color,confidence_index,country,date,is_primeurs,journalist_count,lwin,lwin_11,regions,score,vintage,wine,wine_id,wine_slug,wine_type
0,Bonnes Mares Grand Cru,bonnes-mares-grand-cru,,Red,A,France,2019-05-31,False,5,1056789.0,10567890000.0,['Bourgogne'],96.3,2016,"Domaine Georges & Christophe Roumier, Bonnes M...",58794,domaine-georges-christophe-roumier-bonnes-mare...,
1,Puente Alto,puente-alto,,Red,A,Chile,2019-05-31,False,3,1083246.0,10832460000.0,['Chile'],95.63,2016,"Vina Almaviva, Puente Alto",140620,vina-almaviva-puente-alto,
2,Cote Rotie,cote-rotie,,Red,B+,France,2019-05-31,False,4,1111426.0,11114260000.0,['Rhone'],95.45,2016,"Delas Freres, La Landonne, Cote Rotie",49900,delas-freres-la-landonne-cote-rotie,
3,Hermitage,hermitage,,Red,A,France,2019-05-31,False,6,1111497.0,11114970000.0,['Rhone'],95.15,2016,"Delas Freres, Les Bessards, Hermitage",49902,delas-freres-les-bessards-hermitage,
4,Hermitage,hermitage,,Red,B,France,2019-05-31,False,4,,,['Rhone'],93.14,2015,"E. Guigal, Hermitage",68469,e-guigal-hermitage,


## Attempt 2: [Kaggle](https://www.kaggle.com/zynicide/wine-reviews#winemag-data-130k-v2.csv) Wine Review Data

In [4]:
def extract_vintage(wine_title):
    """
    Extracts vintage of a wine from a given wine title.

    A candidate vintage is a standalone four-digit number that is 1900 or
    later. Four-digit numbers below 1900 (e.g. a number that is part of a
    producer name) are ignored rather than disqualifying the title.

    Inputs:
        wine_title (string): title for wine possibly containing vintage

    Output:
        (int or None): year as int, or None when the title is not a string,
            carries a standalone "NV" (non-vintage) marker, or does not
            contain exactly one candidate vintage
    """
    # non-string input (e.g. NaN for a missing value) cannot hold a vintage
    if not isinstance(wine_title, str):
        return None

    # non-vintage wines are marked with a standalone "NV" token, which may
    # also sit at the very start or end of the title
    if re.search(r'\bNV\b', wine_title):
        return None

    # keep only plausible years BEFORE checking for uniqueness
    candidate_years = [
        int(number)
        for number in re.findall(r'\b\d{4}\b', wine_title)
        if int(number) >= 1900
    ]

    # an ambiguous title (several candidate years) yields no vintage
    if len(candidate_years) == 1:
        return candidate_years[0]
    return None

In [5]:
# relative directory that holds the raw data files
raw_data_path = '../data/'
# name of the wine review CSV file read from raw_data_path
wine_130k_file = 'winemag-data-130k-v2.csv'

In [6]:
# Build the cleaned review frame in one pipeline so the cell never mutates
# a frame in place and always produces the same result when re-run.
wine_130k_df = (
    # convert csv to pandas df, using the first CSV column as the index
    pd.read_csv(os.path.join(raw_data_path, wine_130k_file), index_col=0)
    # remove unneeded columns for taster
    .drop(columns=['taster_name', 'taster_twitter_handle'])
    # add a vintage column parsed from each title
    .assign(vintage=lambda reviews: reviews['title'].apply(extract_vintage))
    # drop rows missing a value in any of these columns; the other columns
    # (description, designation, region_1, region_2) may still hold NaN
    .dropna(subset=['country', 'points', 'price', 'province', 'title', 'variety', 'winery', 'vintage'])
    # renumber rows 0..n-1 after the drop
    .reset_index(drop=True)
)

print('Number of records: {}'.format(len(wine_130k_df)))
wine_130k_df.head()

Number of records: 116322


Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,title,variety,winery,vintage
0,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011.0
1,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013.0
2,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013.0
3,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012.0
4,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem,2011.0


In [7]:
# number of rows in the cleaned dataframe
wine_130k_df.shape[0]

116322

In [8]:
# summary statistics for the numeric columns
wine_130k_df.describe()

Unnamed: 0,points,price,vintage
count,116322.0,116322.0,116322.0
mean,88.457024,35.504496,2010.733137
std,3.046195,41.256275,3.656193
min,80.0,4.0,1934.0
25%,86.0,17.0,2009.0
50%,88.0,25.0,2011.0
75%,91.0,42.0,2013.0
max,100.0,3300.0,2017.0


In [9]:
# number of distinct region_1 values; a missing value (NaN) counts as one
wine_130k_df['region_1'].nunique(dropna=False)

1195

In [10]:
# distinct country values, in order of first appearance
wine_130k_df['country'].unique()

array(['Portugal', 'US', 'Spain', 'Italy', 'France', 'Germany',
       'Argentina', 'Chile', 'Australia', 'Austria', 'South Africa',
       'New Zealand', 'Israel', 'Hungary', 'Greece', 'Romania', 'Mexico',
       'Canada', 'Turkey', 'Czech Republic', 'Slovenia', 'Croatia',
       'Georgia', 'Uruguay', 'England', 'Lebanon', 'Serbia', 'Brazil',
       'Moldova', 'Morocco', 'Peru', 'India', 'Bulgaria', 'Cyprus',
       'Armenia', 'Switzerland', 'Bosnia and Herzegovina', 'Slovakia',
       'Macedonia', 'Ukraine', 'Luxembourg', 'China'], dtype=object)

In [11]:
# number of distinct country values
wine_130k_df['country'].nunique(dropna=False)

42

In [12]:
# Write every distinct wine title to a text file, one title per line.
# The encoding is set explicitly: titles may contain accented characters
# (the variety names in this data set do), and the default encoding of
# open() depends on the platform.
with open('../data/wine_titles.txt', 'w', encoding='utf-8') as f:
    for wine_title in wine_130k_df.title.unique():
        f.write("%s\n" % wine_title)

In [13]:
# distinct grape variety values, in order of first appearance
wine_130k_df['variety'].unique()

array(['Portuguese Red', 'Pinot Gris', 'Riesling', 'Pinot Noir',
       'Tempranillo-Merlot', 'Frappato', 'Gewürztraminer',
       'Cabernet Sauvignon', 'Chardonnay', 'Malbec', 'Tempranillo Blend',
       'Meritage', 'Red Blend', 'White Blend', 'Merlot', "Nero d'Avola",
       'Chenin Blanc', 'Sauvignon Blanc', 'Viognier-Chardonnay',
       'Primitivo', 'Catarratto', 'Gamay', 'Inzolia', 'Petit Verdot',
       'Monica', 'Bordeaux-style White Blend', 'Grillo', 'Sangiovese',
       'Cabernet Franc', 'Bordeaux-style Red Blend', 'Aglianico',
       'Petite Sirah', 'Carmenère', 'Albariño', 'Petit Manseng', 'Rosé',
       'Zinfandel', 'Vernaccia', 'Rosato', 'Grüner Veltliner', 'Viognier',
       'Vermentino', 'Grenache Blanc', 'Syrah', 'Nebbiolo',
       'Shiraz-Cabernet Sauvignon', 'Pinot Blanc', 'Alsace white blend',
       'Barbera', 'Rhône-style Red Blend', 'Portuguese White', 'Graciano',
       'Tannat-Cabernet', 'Sauvignon', 'Torrontés', 'Prugnolo Gentile',
       'G-S-M', 'Verdejo', 'F

In [14]:
# number of distinct grape variety values
wine_130k_df['variety'].nunique(dropna=False)

681