# Prepare wines from a dirty list

Load a CSV of wines (aka winelist) and clean the data to get it ready to run it through the matching algorithm.


In [1]:
import pandas as pd

%load_ext autoreload
%autoreload 2


### Load dirty winelist file and perform cleaning


In [2]:
wines = pd.read_csv('v1-cleaned.csv')
print(f'Total rows: {wines.shape[0]}')
print()

wines.head()


Total rows: 364



Unnamed: 0,type,full_details,area,price
0,SPARKLING,BARONE PIZZINI – FRANCIACORTA BRUT SATEN D.O.C...,2-2,€ 65.00
1,SPARKLING,BARONE PIZZINI – FRANCIACORTA DOSAGGIO ZERO D....,2-2,€ 50.00
2,SPARKLING,BARONE PIZZINI – FRANCIACORTA EXTRA BRUT GOLF ...,2-2,€ 50.00
3,SPARKLING,BARONE PIZZINI – FRANCIACORTA ROSE' D.O.C.G. E...,2-2,€ 70.00
4,SPARKLING,BELLAVISTA – FRANCIACORTA BRUT D.O.C.G. CUVÉE ...,2-5,€ 60.00


We need the following fields:

- `name`
- `winery_name`
- `type`
- `area`
- `size`
- `vintage`
- `price`
- `info`
- `internal_notes`

---


### Note: This part is always CUSTOM to the client


I need to extract the some data from `full_details`, such as:
- name
- winery_name
- vintage
- size

Observations for `full_details` field:
- size is withtin parenthesis and is the last interesting parameter of the line
- winery and name are separated by `–` or `-`
- vintage is not always present

Other fields:
- type OK
- area OK
- price OK but clean the euro symbol
- info is empty
- internal_notes is empty


In [3]:
import re

# extract data from full_details
for idx, wine in wines.iterrows():
    # get winery name
    full_details = wine['full_details'].replace('–', ' - ')
    full_details = re.sub(r'\(\D+\)', '', full_details)
    full_details_tokens = full_details.split('-', maxsplit=1)

    wines.at[idx, 'winery_name'] = full_details_tokens[0].strip()

    if len(full_details_tokens) < 2:
        print(f'No extensive string found for {full_details}')
        continue

    # get year, if present
    year = ''
    match = re.search(r'\b\d{4}\b', full_details_tokens[1])
    if match:
        year = match.group()
        wines.at[idx, 'vintage'] = match.group()

    # get size of the bottle
    size = re.search(r'\((.*?)\)', full_details_tokens[1]).group(1)
    wines.at[idx, 'size'] = size.replace('l', '').replace('L', '').replace(',','.').strip()

    # get wine name
    wines.at[idx, 'name'] = full_details_tokens[1].replace(
        year, '').replace('-', '').split('(' + size)[0].strip()

wines.head()


Unnamed: 0,type,full_details,area,price,winery_name,vintage,size,name
0,SPARKLING,BARONE PIZZINI – FRANCIACORTA BRUT SATEN D.O.C...,2-2,€ 65.00,BARONE PIZZINI,2018.0,0.75,FRANCIACORTA BRUT SATEN D.O.C.G. BIO
1,SPARKLING,BARONE PIZZINI – FRANCIACORTA DOSAGGIO ZERO D....,2-2,€ 50.00,BARONE PIZZINI,,0.75,FRANCIACORTA DOSAGGIO ZERO D.O.C.G. ANIMANTE
2,SPARKLING,BARONE PIZZINI – FRANCIACORTA EXTRA BRUT GOLF ...,2-2,€ 50.00,BARONE PIZZINI,1927.0,0.75,FRANCIACORTA EXTRA BRUT GOLF
3,SPARKLING,BARONE PIZZINI – FRANCIACORTA ROSE' D.O.C.G. E...,2-2,€ 70.00,BARONE PIZZINI,2018.0,0.75,FRANCIACORTA ROSE' D.O.C.G. EXTRA BRUTMILLESIM...
4,SPARKLING,BELLAVISTA – FRANCIACORTA BRUT D.O.C.G. CUVÉE ...,2-5,€ 60.00,BELLAVISTA,,0.75,FRANCIACORTA BRUT D.O.C.G. CUVÉE ALMA


In [4]:
# clean sizes
print(wines['size'].unique())

wines['size'] = wines['size'].replace('O.75', '0.75')
wines['size'] = wines['size'].replace('A.O.C. 0.75', '0.75')

print(wines['size'].unique())

['0.75' '1.5' 'A.O.C. 0.75' 'O.75']
['0.75' '1.5']


In [5]:
# clean prices
wines['price'] = wines['price'].str.replace('€', '').str.replace(',', '.')

wines.head()


Unnamed: 0,type,full_details,area,price,winery_name,vintage,size,name
0,SPARKLING,BARONE PIZZINI – FRANCIACORTA BRUT SATEN D.O.C...,2-2,65.0,BARONE PIZZINI,2018.0,0.75,FRANCIACORTA BRUT SATEN D.O.C.G. BIO
1,SPARKLING,BARONE PIZZINI – FRANCIACORTA DOSAGGIO ZERO D....,2-2,50.0,BARONE PIZZINI,,0.75,FRANCIACORTA DOSAGGIO ZERO D.O.C.G. ANIMANTE
2,SPARKLING,BARONE PIZZINI – FRANCIACORTA EXTRA BRUT GOLF ...,2-2,50.0,BARONE PIZZINI,1927.0,0.75,FRANCIACORTA EXTRA BRUT GOLF
3,SPARKLING,BARONE PIZZINI – FRANCIACORTA ROSE' D.O.C.G. E...,2-2,70.0,BARONE PIZZINI,2018.0,0.75,FRANCIACORTA ROSE' D.O.C.G. EXTRA BRUTMILLESIM...
4,SPARKLING,BELLAVISTA – FRANCIACORTA BRUT D.O.C.G. CUVÉE ...,2-5,60.0,BELLAVISTA,,0.75,FRANCIACORTA BRUT D.O.C.G. CUVÉE ALMA


In [6]:
wines['info'] = ''
wines['internal_notes'] = ''

wines.head()

Unnamed: 0,type,full_details,area,price,winery_name,vintage,size,name,details,internal_notes
0,SPARKLING,BARONE PIZZINI – FRANCIACORTA BRUT SATEN D.O.C...,2-2,65.0,BARONE PIZZINI,2018.0,0.75,FRANCIACORTA BRUT SATEN D.O.C.G. BIO,,
1,SPARKLING,BARONE PIZZINI – FRANCIACORTA DOSAGGIO ZERO D....,2-2,50.0,BARONE PIZZINI,,0.75,FRANCIACORTA DOSAGGIO ZERO D.O.C.G. ANIMANTE,,
2,SPARKLING,BARONE PIZZINI – FRANCIACORTA EXTRA BRUT GOLF ...,2-2,50.0,BARONE PIZZINI,1927.0,0.75,FRANCIACORTA EXTRA BRUT GOLF,,
3,SPARKLING,BARONE PIZZINI – FRANCIACORTA ROSE' D.O.C.G. E...,2-2,70.0,BARONE PIZZINI,2018.0,0.75,FRANCIACORTA ROSE' D.O.C.G. EXTRA BRUTMILLESIM...,,
4,SPARKLING,BELLAVISTA – FRANCIACORTA BRUT D.O.C.G. CUVÉE ...,2-5,60.0,BELLAVISTA,,0.75,FRANCIACORTA BRUT D.O.C.G. CUVÉE ALMA,,


### This part is still CUSTOM but more or less it is needed for all onboardings

In [7]:
# drop duplicates
wines = wines.drop_duplicates()

print(f'Total rows: {wines.shape[0]}')

Total rows: 364


In [8]:
# Define the sizes-to-enum map
sizes = {
    0.1875: 'GLASS',
    0.375: 'HALF_BOTTLE',
    0.5: 'HALF_LITER',
    0.75: 'BOTTLE',
    1: 'LITER',
    1.5: 'MAGNUM',
    3: 'JEROBOAM',
    4.5: 'REHOBOAM',
    5: 'BORDEAUX_JEROBOAM',
    6: 'MATHUSALEM',
    9: 'SALMANAZAR',
    12: 'BALTHAZAR',
    15: 'NEBUCHADNEZZAR',
    18: 'MELCHIOR',
    20: 'SOLOMON',
    25: 'SOVEREIGN',
    27: 'GOLIATH',
    30: 'MELCHIZEDEK'
}

wines['size'] = wines['size'].astype(float).map(sizes)

### Save new version of the winelist

In [9]:
open('v2-cleaned.csv', 'w').close()
wines.drop(columns=['full_details'], inplace=True)
wines.to_csv('v2-cleaned.csv', index=False)

### Print search terms

**NOTE: Before proceeding with the matching algorithm, run the viviner and insert new wines.**

In [10]:
with open('search-terms.txt', 'w') as f:
    for term in wines['name'].unique():
        f.write(f'{term}\n')
    for term in wines['winery_name'].unique():
        f.write(f'{term}\n')