# Prepare wines from a dirty list

Load a CSV of wines (aka winelist) and clean the data to get it ready to run it through the matching algorithm.


In [70]:
import pandas as pd

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load dirty winelist file and perform cleaning


In [71]:
wines = pd.read_csv('v1-cleaned.csv')
print(f'Total rows: {wines.shape[0]}')
print()

wines.head()


Total rows: 2313



Unnamed: 0,external_id,full_details,type,quantity,internal_notes,price
0,v.10389.0.c,"Benoît Beaufort, Blanc de Blancs Brut Grand Cr...",SPARKLING,1,40,180.0
1,v.61401.0.b,"Bernard Lonclas, Blanc de Blancs Brut, 750 ml",SPARKLING,4,24,80.0
2,v.31055.0.b,"Bouquin-Dupont, Blanc de Blancs Brut Grand Cru...",SPARKLING,2,26,90.0
3,v.48127.0.c,"Bruno Paillard, Blanc de Blancs Brut Grand Cru...",SPARKLING,1,60,300.0
4,v.82256.1999.b,"Claude Cazals, Blanc de Blancs Brut Grand Cru,...",SPARKLING,2,20,140.0


We need the following fields:

- `external_id`
- `name`
- `winery_name`
- `type`
- `storage_area`
- `size`
- `vintage`
- `price`
- `info`
- `quantity`
- `internal_notes`

---


### Note: This part is always CUSTOM to the client


Drop rows without full_details or without price.

I need to extract some data from `full_details`, such as:
- name
- winery_name
- vintage
- size
- details

Observations for `full_details` field:
- degorgement in ()
- winery and wine name: _WINERY_, _WINE_, _other info_

Observations for `external_id` field:
- _type_._wine_id_._vintage_._size_
- extract vintage (0 for none)
- extract size -> compare it with _other info_ in `full_details`

Other fields:
- external_id OK
- type OK
- area missing
- price in euros
- quantity OK
- internal_notes contains the purchase price. Add a label


In [72]:
import re

# extract data from full_details
for idx, wine in wines.iterrows():
    # extract degorgement year in wine name (between parentheses) and remove it from full_details
    parenthesis = re.findall(r'\((.*?)\)', wine['full_details'])
    if len(parenthesis) > 0:
        wines.at[idx, 'info'] = ' | '.join(parenthesis)
        # for each group, remove it from full_details
        for group in parenthesis:
            wine['full_details'] = wine['full_details'].replace(
                f'({group})', '')
    else :
        wines.at[idx, 'info'] = ''
            
    # extract wine, winery name, and size from full_details
    full_details_tokens = wine['full_details'].split(',')
    wines.at[idx, 'winery_name'] = full_details_tokens[0]
    wines.at[idx, 'name'] = full_details_tokens[1]
    wines.at[idx, 'text_size'] = full_details_tokens[len(full_details_tokens) - 1]
    
    # extract vintage and size from external_id
    external_id_tokens = wine['external_id'].split('.')
    wines.at[idx, 'vintage'] = external_id_tokens[2]
    wines.at[idx, 'size'] = external_id_tokens[3]

wines.head()


Unnamed: 0,external_id,full_details,type,quantity,internal_notes,price,info,winery_name,name,text_size,vintage,size
0,v.10389.0.c,"Benoît Beaufort, Blanc de Blancs Brut Grand Cr...",SPARKLING,1,40,180.0,,Benoît Beaufort,Blanc de Blancs Brut Grand Cru,1500 ml,0,c
1,v.61401.0.b,"Bernard Lonclas, Blanc de Blancs Brut, 750 ml",SPARKLING,4,24,80.0,,Bernard Lonclas,Blanc de Blancs Brut,750 ml,0,b
2,v.31055.0.b,"Bouquin-Dupont, Blanc de Blancs Brut Grand Cru...",SPARKLING,2,26,90.0,,Bouquin-Dupont,Blanc de Blancs Brut Grand Cru,750 ml,0,b
3,v.48127.0.c,"Bruno Paillard, Blanc de Blancs Brut Grand Cru...",SPARKLING,1,60,300.0,,Bruno Paillard,Blanc de Blancs Brut Grand Cru,1500 ml,0,c
4,v.82256.1999.b,"Claude Cazals, Blanc de Blancs Brut Grand Cru,...",SPARKLING,2,20,140.0,,Claude Cazals,Blanc de Blancs Brut Grand Cru,750 ml,1999,b


In [73]:
# add empty area
wines['storage_area'] = ''

# set 0 vintage year to None
wines['vintage'] = wines['vintage'].apply(lambda x: None if int(x) == 0 else x)

# add label to purchase price in internal_notes
wines['internal_notes'] = wines['internal_notes'].apply(lambda x: f'Purchase Price: {x}')

In [74]:
# clean sizes
print(wines['size'].unique())
print(wines['text_size'].unique())

['c' 'b' 'd' 'a' 'm' 'o']
[' 1500 ml' ' 750 ml' ' 3000 ml' ' 375 ml' ' 500 ml' ' 1000 ml']


In [75]:
wines.head()


Unnamed: 0,external_id,full_details,type,quantity,internal_notes,price,info,winery_name,name,text_size,vintage,size,storage_area
0,v.10389.0.c,"Benoît Beaufort, Blanc de Blancs Brut Grand Cr...",SPARKLING,1,Purchase Price: 40,180.0,,Benoît Beaufort,Blanc de Blancs Brut Grand Cru,1500 ml,,c,
1,v.61401.0.b,"Bernard Lonclas, Blanc de Blancs Brut, 750 ml",SPARKLING,4,Purchase Price: 24,80.0,,Bernard Lonclas,Blanc de Blancs Brut,750 ml,,b,
2,v.31055.0.b,"Bouquin-Dupont, Blanc de Blancs Brut Grand Cru...",SPARKLING,2,Purchase Price: 26,90.0,,Bouquin-Dupont,Blanc de Blancs Brut Grand Cru,750 ml,,b,
3,v.48127.0.c,"Bruno Paillard, Blanc de Blancs Brut Grand Cru...",SPARKLING,1,Purchase Price: 60,300.0,,Bruno Paillard,Blanc de Blancs Brut Grand Cru,1500 ml,,c,
4,v.82256.1999.b,"Claude Cazals, Blanc de Blancs Brut Grand Cru,...",SPARKLING,2,Purchase Price: 20,140.0,,Claude Cazals,Blanc de Blancs Brut Grand Cru,750 ml,1999.0,b,


### This part is still CUSTOM but more or less it is needed for all onboardings - This is specific for EnoWeb

In [76]:
# drop duplicates
wines = wines.drop_duplicates()

print(f'Total rows: {wines.shape[0]}')

Total rows: 2313


In [77]:
# Define the sizes-to-enum map
sizes = {
    'a': 'HALF_BOTTLE',
    'm': 'HALF_LITER',
    'b': 'BOTTLE',
    'o': 'LITER',
    'c': 'MAGNUM',
    'd': 'JEROBOAM',
    4.5: 'REHOBOAM',
    5: 'BORDEAUX_JEROBOAM',
    6: 'MATHUSALEM',
    9: 'SALMANAZAR',
    12: 'BALTHAZAR',
    15: 'NEBUCHADNEZZAR',
    18: 'MELCHIOR',
    20: 'SOLOMON',
    25: 'SOVEREIGN',
    27: 'GOLIATH',
    30: 'MELCHIZEDEK'
}

wines['size'] = wines['size'].map(sizes)

### Save new version of the winelist

In [78]:
open('v2-cleaned.csv', 'w').close()
wines.drop(columns=['full_details'], inplace=True)
wines.drop(columns=['text_size'], inplace=True)
wines.to_csv('v2-cleaned.csv', index=False)

### Print search terms

**NOTE: Before proceeding with the matching algorithm, run the viviner and insert new wines.**

In [79]:
with open('search-terms.txt', 'w') as f:
    for term in wines['name'].unique():
        f.write(f'{term}\n')
    for term in wines['winery_name'].unique():
        f.write(f'{term}\n')