# Prepare wines from a dirty list

Load a CSV of wines (aka winelist) and clean the data to get it ready to run it through the matching algorithm.


In [16]:
import pandas as pd

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load dirty winelist file and perform cleaning


In [17]:
wines = pd.read_csv('v1-cleaned.csv')
print(f'Total rows: {wines.shape[0]}')
print()

wines.head()


Total rows: 210



Unnamed: 0,external_id,type,name_one,name_two,Stato,Zona,winery_name,vintage,size,price,cost,quantity
0,87675650000.0,,ANJOU,EFFUSION,FRANCIA,LOIRA,DOMAINE PATRICK BAUDOUIN,2021.0,075,55,25.5,3.0
1,87964370000.0,,ANJOU,LE CORNILLARD,FRANCIA,LOIRA,DOMAINE PATRICK BAUDOUIN,2020.0,075,90,39.5,3.0
2,76778690000000.0,,ARNEIS,ROERO ARNEIS,PIEMONTE,ROERO,GIACOMO FENOCCHIO,2022.0,075,28,11.0,16.0
3,,RED,BARBARESCO,ASILI,PIEMONTE,BARBARESCO,AZ FALLETTO DI BRUNO GIACOSA,2015.0,075,200,135.0,1.0
4,8033255000000.0,RED,BARBARESCO,ASILI,PIEMONTE,BARBARESCO,CERETTO,2018.0,MG,190,98.0,2.0


We need the following fields:

- `external_id`
- `name`
- `winery_name`
- `type`
- `storage_area`
- `size`
- `vintage`
- `price`
- `info`
- `quantity`
- `internal_notes`

---


### Note: This part is always CUSTOM to the client


- Merge name_one and name_two.

Other fields:
- external_id OK
- type OK
- name: merge name_one + name_two
- winery_name OK
- storage_area missing
- size to parse
- vintage drop if not a number
- price in euros
- info missing
- quantity OK
- internal_notes contains the purchase price. Add a label


In [18]:
import re

# merge name_one and name_two into name
wines['name'] = wines['name_one'].fillna('') + ' ' + wines['name_two'].fillna('')
wines['name'] = wines['name'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

wines.head()


Unnamed: 0,external_id,type,name_one,name_two,Stato,Zona,winery_name,vintage,size,price,cost,quantity,name
0,87675650000.0,,ANJOU,EFFUSION,FRANCIA,LOIRA,DOMAINE PATRICK BAUDOUIN,2021.0,075,55,25.5,3.0,ANJOU EFFUSION
1,87964370000.0,,ANJOU,LE CORNILLARD,FRANCIA,LOIRA,DOMAINE PATRICK BAUDOUIN,2020.0,075,90,39.5,3.0,ANJOU LE CORNILLARD
2,76778690000000.0,,ARNEIS,ROERO ARNEIS,PIEMONTE,ROERO,GIACOMO FENOCCHIO,2022.0,075,28,11.0,16.0,ARNEIS ROERO ARNEIS
3,,RED,BARBARESCO,ASILI,PIEMONTE,BARBARESCO,AZ FALLETTO DI BRUNO GIACOSA,2015.0,075,200,135.0,1.0,BARBARESCO ASILI
4,8033255000000.0,RED,BARBARESCO,ASILI,PIEMONTE,BARBARESCO,CERETTO,2018.0,MG,190,98.0,2.0,BARBARESCO ASILI


In [19]:
import math

# add empty area
wines['storage_area'] = ''

# add empty info
wines['info'] = ''

# set 0 vintage year to None and NaN vintage year to None
wines['vintage'] = wines['vintage'].apply(lambda x: None if pd.isnull(x) or pd.isna(x) else int(x))

# add label to purchase price in internal_notes
wines['internal_notes'] = wines['cost'].apply(lambda x: f'Purchase Price: {x}')

wines.head(68)

Unnamed: 0,external_id,type,name_one,name_two,Stato,Zona,winery_name,vintage,size,price,cost,quantity,name,storage_area,info,internal_notes
0,8.767565e+10,,ANJOU,EFFUSION,FRANCIA,LOIRA,DOMAINE PATRICK BAUDOUIN,2021.0,075,55,25.5,3.0,ANJOU EFFUSION,,,Purchase Price: 25.5
1,8.796437e+10,,ANJOU,LE CORNILLARD,FRANCIA,LOIRA,DOMAINE PATRICK BAUDOUIN,2020.0,075,90,39.5,3.0,ANJOU LE CORNILLARD,,,Purchase Price: 39.5
2,7.677869e+13,,ARNEIS,ROERO ARNEIS,PIEMONTE,ROERO,GIACOMO FENOCCHIO,2022.0,075,28,11.0,16.0,ARNEIS ROERO ARNEIS,,,Purchase Price: 11.0
3,,RED,BARBARESCO,ASILI,PIEMONTE,BARBARESCO,AZ FALLETTO DI BRUNO GIACOSA,2015.0,075,200,135.0,1.0,BARBARESCO ASILI,,,Purchase Price: 135.0
4,8.033255e+12,RED,BARBARESCO,ASILI,PIEMONTE,BARBARESCO,CERETTO,2018.0,MG,190,98.0,2.0,BARBARESCO ASILI,,,Purchase Price: 98.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,8.025494e+12,RED,CABERNET FRANC,CABERNET FRANC,FRIULI VENEZIA GIULIA,COLLIO,RUSSIZ SUPERIORE,2021.0,075,33,12.0,16.0,CABERNET FRANC CABERNET FRANC,,,Purchase Price: 12.0
64,8.765466e+13,SPARKLING,CHAMPAGNE,BLANC DE BLANCS,FRANCIA,CHAMPAGNE,MAURICE GRUMIER,,075,90,37.0,6.0,CHAMPAGNE BLANC DE BLANCS,,,Purchase Price: 37.0
65,5.678976e+09,SPARKLING,CHAMPAGNE,BLANC DE BLANCS EB GRAN CRU,FRANCIA,CHAMPAGNE,VAUVERSIN,,075,70,32.0,12.0,CHAMPAGNE BLANC DE BLANCS EB GRAN CRU,,,Purchase Price: 32.0
66,3.760105e+12,SPARKLING,CHAMPAGNE,BLANC DE BLANCS GRAN CRU,FRANCIA,CHAMPAGNE,LEGRAS & HAAS,,075,85,39.0,1.0,CHAMPAGNE BLANC DE BLANCS GRAN CRU,,,Purchase Price: 39.0


In [20]:
# clean sizes
print(wines['size'].unique())

# replace 0,75 with 0.75
wines['size'] = wines['size'].apply(lambda x: re.sub(r'0,75', '0.75', x))

# replace MG with 1.5
wines['size'] = wines['size'].apply(lambda x: re.sub(r'MG', '1.5', x))

print(wines['size'].unique())

['0,75' 'MG']
['0.75' '1.5']


In [21]:
wines.head()


Unnamed: 0,external_id,type,name_one,name_two,Stato,Zona,winery_name,vintage,size,price,cost,quantity,name,storage_area,info,internal_notes
0,87675650000.0,,ANJOU,EFFUSION,FRANCIA,LOIRA,DOMAINE PATRICK BAUDOUIN,2021.0,0.75,55,25.5,3.0,ANJOU EFFUSION,,,Purchase Price: 25.5
1,87964370000.0,,ANJOU,LE CORNILLARD,FRANCIA,LOIRA,DOMAINE PATRICK BAUDOUIN,2020.0,0.75,90,39.5,3.0,ANJOU LE CORNILLARD,,,Purchase Price: 39.5
2,76778690000000.0,,ARNEIS,ROERO ARNEIS,PIEMONTE,ROERO,GIACOMO FENOCCHIO,2022.0,0.75,28,11.0,16.0,ARNEIS ROERO ARNEIS,,,Purchase Price: 11.0
3,,RED,BARBARESCO,ASILI,PIEMONTE,BARBARESCO,AZ FALLETTO DI BRUNO GIACOSA,2015.0,0.75,200,135.0,1.0,BARBARESCO ASILI,,,Purchase Price: 135.0
4,8033255000000.0,RED,BARBARESCO,ASILI,PIEMONTE,BARBARESCO,CERETTO,2018.0,1.5,190,98.0,2.0,BARBARESCO ASILI,,,Purchase Price: 98.0


### This part is still CUSTOM but more or less it is needed for all onboardings - This is specific for EnoWeb

In [22]:
# drop duplicates
wines = wines.drop_duplicates()

print(f'Total rows: {wines.shape[0]}')

Total rows: 210


In [15]:
# Define the sizes-to-enum map
sizes = {
    '0.375': 'HALF_BOTTLE',
    '0.5': 'HALF_LITER',
    '0.75': 'BOTTLE',
    '1': 'LITER',
    '1.5': 'MAGNUM',
    '3': 'JEROBOAM',
    '4.5': 'REHOBOAM',
    '5': 'BORDEAUX_JEROBOAM',
    '6': 'MATHUSALEM',
    '9': 'SALMANAZAR',
    '12': 'BALTHAZAR',
    '15': 'NEBUCHADNEZZAR',
    '18': 'MELCHIOR',
    '20': 'SOLOMON',
    '25': 'SOVEREIGN',
    '27': 'GOLIATH',
    '30': 'MELCHIZEDEK'
}

wines['size'] = wines['size'].map(sizes)

wines.head()

Unnamed: 0,external_id,type,name_one,name_two,Stato,Zona,winery_name,vintage,size,price,cost,quantity,name,storage_area,info,internal_notes
0,87675650000.0,,ANJOU,EFFUSION,FRANCIA,LOIRA,DOMAINE PATRICK BAUDOUIN,2021.0,,55,25.5,3.0,ANJOU EFFUSION,,,Purchase Price: 25.5
1,87964370000.0,,ANJOU,LE CORNILLARD,FRANCIA,LOIRA,DOMAINE PATRICK BAUDOUIN,2020.0,,90,39.5,3.0,ANJOU LE CORNILLARD,,,Purchase Price: 39.5
2,76778690000000.0,,ARNEIS,ROERO ARNEIS,PIEMONTE,ROERO,GIACOMO FENOCCHIO,2022.0,,28,11.0,16.0,ARNEIS ROERO ARNEIS,,,Purchase Price: 11.0
3,,RED,BARBARESCO,ASILI,PIEMONTE,BARBARESCO,AZ FALLETTO DI BRUNO GIACOSA,2015.0,,200,135.0,1.0,BARBARESCO ASILI,,,Purchase Price: 135.0
4,8033255000000.0,RED,BARBARESCO,ASILI,PIEMONTE,BARBARESCO,CERETTO,2018.0,,190,98.0,2.0,BARBARESCO ASILI,,,Purchase Price: 98.0


### Save new version of the winelist

In [51]:
# drop columns that are not the required ones
# - `external_id`
# - `name`
# - `winery_name`
# - `type`
# - `storage_area`
# - `size`
# - `vintage`
# - `price`
# - `info`
# - `quantity`
# - `internal_notes`

wines = wines[['external_id', 'name', 'winery_name', 'type', 'storage_area', 'size', 'vintage', 'price', 'info', 'quantity', 'internal_notes']]

In [52]:
open('v2-cleaned.csv', 'w').close()

wines.to_csv('v2-cleaned.csv', index=False)

### Print search terms

**NOTE: Before proceeding with the matching algorithm, run the viviner and insert new wines.**

In [53]:
with open('search-terms.txt', 'w') as f:
    for term in wines['name'].unique():
        f.write(f'{term}\n')
    for term in wines['winery_name'].unique():
        f.write(f'{term}\n')