# Prepare wines from a dirty list

Load a CSV of wines (aka winelist) and clean the data so that it is ready to be run through the matching algorithm.


In [56]:
import pandas as pd

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load dirty winelist file and perform cleaning


In [57]:
# Read the client's winelist export and report how many rows it contains.
wines = pd.read_csv('v1-cleaned.csv')
print(f'Total rows: {wines.shape[0]}', end='\n\n')

# Preview the first rows as the cell output.
wines.head()


Total rows: 1159



Unnamed: 0,type,full_details,size,price,internal_info
0,,Prosecco,,,
1,SPARKLING,VEDOVA Extra Dry Superiore Millesimato 2022,0.75,€ 25.00,https://www.shop-cantinevedova.com/it/rive-mil...
2,SPARKLING,,0.75,,
3,SPARKLING,Franciacorta,0.75,,
4,SPARKLING,BELLAVISTA Grand Cuvèe ALMA Brut,0.75,€ 56.00,https://www.bellavistawine.it/Public/images/16...


We need the following fields:

- `name`
- `winery_name`
- `type`
- `area`
- `size`
- `vintage`
- `price` (parse to int)
- `info`
- `internal_notes`

---


### Note: This part is always CUSTOM to the client


Drop rows without full_details or without price.

I need to extract some data from `full_details`, such as:
- name
- winery_name
- vintage

Observations for `full_details` field:
- winery and wine name are not separated, they are together
- vintage: consider the first YYYY
- extract what is between parentheses and use it as info

Other fields:
- size OK
- type OK
- area missing
- price OK but clean the euro symbol
- info is empty
- internal_notes is a link to technical wine details


In [58]:
# Remove rows with no full_details or no price (both fields are required).
# The explicit .copy() makes `wines` an independent frame instead of a filtered
# slice, so the column assignments in later cells do not raise
# SettingWithCopyWarning.
wines = wines.dropna(subset=['full_details', 'price']).copy()
print(f'Total rows: {wines.shape[0]}')

Total rows: 966


In [59]:
import re

# A standalone run of exactly four digits is treated as the vintage year.
VINTAGE_RE = re.compile(r'\b\d{4}\b')
# Non-greedy, so each "(...)" group in a string is captured on its own.
PARENTHESIS_RE = re.compile(r'\((.*?)\)')


def split_full_details(full_details):
    """Split one raw `full_details` string into name, vintage and info.

    Args:
        full_details: raw description of a wine,
            e.g. 'BELLAVISTA Saten 2015'.

    Returns:
        A `(name, vintage, info)` tuple where
        - `name` is the text left after removing the vintage and every
          parenthesised group, with whitespace collapsed and trimmed;
        - `vintage` is the first standalone 4-digit number (as a string),
          or None when there is none;
        - `info` is the content of all parenthesised groups joined with
          ' | ', or None when there are no parentheses.
    """
    remaining = full_details

    vintage = None
    vintage_match = VINTAGE_RE.search(remaining)
    if vintage_match:
        vintage = vintage_match.group()
        # Cut out only the matched span: a plain str.replace would also delete
        # the same digits wherever else they occur (inside a parenthesised
        # note or a longer number).
        remaining = (remaining[:vintage_match.start()]
                     + remaining[vintage_match.end():])

    # There can be several parenthesised groups; keep them all, '|'-separated.
    notes = PARENTHESIS_RE.findall(remaining)
    info = ' | '.join(notes) if notes else None
    remaining = PARENTHESIS_RE.sub('', remaining)

    # Removing the vintage / parentheses leaves stray spaces behind
    # ('... Millesimato ' or double spaces); normalise them so that names and
    # search terms are clean.
    name = ' '.join(remaining.split())
    return name, vintage, info


# extract data from full_details
details = [split_full_details(text) for text in wines['full_details']]

# Assign whole columns at once (instead of cell-by-cell with .at) so that every
# column always exists, even when no row has a vintage or a parenthesised note.
# Wine and winery name are not separated in the source, so both columns get the
# same value (the remaining part of full_details).
wines = wines.assign(
    vintage=[vintage for _, vintage, _ in details],
    name=[name for name, _, _ in details],
    winery_name=[name for name, _, _ in details],
    info=[info for _, _, info in details],
)

wines.head()


Unnamed: 0,type,full_details,size,price,internal_info,vintage,name,winery_name,info
1,SPARKLING,VEDOVA Extra Dry Superiore Millesimato 2022,0.75,€ 25.00,https://www.shop-cantinevedova.com/it/rive-mil...,2022.0,VEDOVA Extra Dry Superiore Millesimato,VEDOVA Extra Dry Superiore Millesimato,
4,SPARKLING,BELLAVISTA Grand Cuvèe ALMA Brut,0.75,€ 56.00,https://www.bellavistawine.it/Public/images/16...,,BELLAVISTA Grand Cuvèe ALMA Brut,BELLAVISTA Grand Cuvèe ALMA Brut,
5,SPARKLING,BELLAVISTA “RISERVA MORETTI” Extra Brut 2013 (...,0.75,€ 129.00,https://www.bellavistawine.it/Public/images/13...,2013.0,BELLAVISTA “RISERVA MORETTI” Extra Brut,BELLAVISTA “RISERVA MORETTI” Extra Brut,2 - 2004
6,SPARKLING,BELLAVISTA Saten 2015,0.75,€ 79.00,https://www.bellavistawine.it/Public/images/11...,2015.0,BELLAVISTA Saten,BELLAVISTA Saten,
7,SPARKLING,BERLUCCHI “61 NATURE” Brut 2015,0.75,€ 49.00,https://www.berlucchi.it/wp-content/uploads/20...,2015.0,BERLUCCHI “61 NATURE” Brut,BERLUCCHI “61 NATURE” Brut,


In [60]:
# The source list has no area data: create the column filled with empty strings.
wines = wines.assign(area='')

In [61]:
# List the distinct values of the size column (inspection only, nothing is modified).
distinct_sizes = wines['size'].unique()
print(distinct_sizes)

[0.75  1.5   0.5   0.375]


In [62]:
# clean prices: remove the euro symbol, use a dot as decimal separator and trim
# the whitespace left behind by the symbol ('€ 25.00' -> '25.00', not ' 25.00').
# regex=False makes both replacements literal regardless of the pandas version.
# NOTE(review): the values are still strings after this cell; no numeric
# conversion happens here - confirm what the matching step expects.
wines['price'] = (
    wines['price']
    .str.replace('€', '', regex=False)
    .str.replace(',', '.', regex=False)
    .str.strip()
)

wines.head()


Unnamed: 0,type,full_details,size,price,internal_info,vintage,name,winery_name,info,area
1,SPARKLING,VEDOVA Extra Dry Superiore Millesimato 2022,0.75,25.0,https://www.shop-cantinevedova.com/it/rive-mil...,2022.0,VEDOVA Extra Dry Superiore Millesimato,VEDOVA Extra Dry Superiore Millesimato,,
4,SPARKLING,BELLAVISTA Grand Cuvèe ALMA Brut,0.75,56.0,https://www.bellavistawine.it/Public/images/16...,,BELLAVISTA Grand Cuvèe ALMA Brut,BELLAVISTA Grand Cuvèe ALMA Brut,,
5,SPARKLING,BELLAVISTA “RISERVA MORETTI” Extra Brut 2013 (...,0.75,129.0,https://www.bellavistawine.it/Public/images/13...,2013.0,BELLAVISTA “RISERVA MORETTI” Extra Brut,BELLAVISTA “RISERVA MORETTI” Extra Brut,2 - 2004,
6,SPARKLING,BELLAVISTA Saten 2015,0.75,79.0,https://www.bellavistawine.it/Public/images/11...,2015.0,BELLAVISTA Saten,BELLAVISTA Saten,,
7,SPARKLING,BERLUCCHI “61 NATURE” Brut 2015,0.75,49.0,https://www.berlucchi.it/wp-content/uploads/20...,2015.0,BERLUCCHI “61 NATURE” Brut,BERLUCCHI “61 NATURE” Brut,,


### This part is still CUSTOM but more or less it is needed for all onboardings

In [63]:
# drop duplicates: keep only the first occurrence of each fully identical row
wines = wines[~wines.duplicated()]

print(f'Total rows: {wines.shape[0]}')

Total rows: 966


In [64]:
# Lookup table from the numeric size found in the winelist to the size enum name.
sizes = {
    0.1875: 'GLASS',
    0.375: 'HALF_BOTTLE',
    0.5: 'HALF_LITER',
    0.75: 'BOTTLE',
    1: 'LITER',
    1.5: 'MAGNUM',
    3: 'JEROBOAM',
    4.5: 'REHOBOAM',
    5: 'BORDEAUX_JEROBOAM',
    6: 'MATHUSALEM',
    9: 'SALMANAZAR',
    12: 'BALTHAZAR',
    15: 'NEBUCHADNEZZAR',
    18: 'MELCHIOR',
    20: 'SOLOMON',
    25: 'SOVEREIGN',
    27: 'GOLIATH',
    30: 'MELCHIZEDEK'
}

# Cast to float first so the values compare equal to the table keys, then
# translate each one; a size that is not in the table becomes NaN.
wines['size'] = (
    wines['size']
    .astype(float)
    .map(sizes)
)

### Save new version of the winelist

In [65]:
# full_details is not part of the exported winelist, so it is dropped here.
# errors='ignore' keeps the cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
wines = wines.drop(columns=['full_details'], errors='ignore')

# to_csv opens the file in write mode, so a previous version is overwritten
# without having to truncate the file first.
wines.to_csv('v2-cleaned.csv', index=False)

### Print search terms

**NOTE: Before proceeding with the matching algorithm, run the viviner and insert new wines.**

In [66]:
# Wine names first, then winery names. unique() keeps the order of first
# appearance and guarantees that a term shared by both columns is written only
# once (in this winelist both columns hold the same values).
search_terms = pd.concat([wines['name'], wines['winery_name']]).unique()

# Explicit UTF-8: the names contain non-ASCII characters (e.g. “ ” and è) that
# the platform default encoding is not guaranteed to support.
with open('search-terms.txt', 'w', encoding='utf-8') as f:
    for term in search_terms:
        f.write(f'{term}\n')