# Prepare wines from a dirty list

Load a CSV of wines (aka winelist) and clean the data to get it ready to run it through the matching algorithm.


In [63]:
import pandas as pd

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load dirty winelist file and perform cleaning


In [64]:
original_wines = pd.read_csv('v1-cleaned.csv')
print(f'Total rows: {original_wines.shape[0]}')
print()

original_wines.head()


Total rows: 472



Unnamed: 0,external_id,size,name,winery_name,vintage,supplier,purchase_price,price,retail_price,glass_price,quantity
0,1.0,BOTTLE,Le Petit,Weingut Manincor,2018.0,WD,37.40 €,80.00 €,75.29 €,€ 13.33,6
1,2.0,BOTTLE,"Gewürztraminer Passito ""Cresta""",Weingut Hans Rottensteiner,2017.0,WD,20.90 €,57.00 €,42.07 €,€ 9.50,9
2,3.0,BOTTLE,Rosenmuskateller Rosis,Kellerei Bozen,2020.0,FW,18.00 €,53.00 €,36.23 €,€ 8.83,9
3,4.0,BOTTLE,Sweet Claire,Weingut Lieselehof,2017.0,Liesele,39.90 €,77.00 €,80.32 €,€ 12.83,5
4,5.0,BOTTLE,Sweet Claire Quintessenz,Weingut Lieselehof,2015.0,Liesele,94.73 €,199.00 €,190.69 €,€ 33.17,2


We need the following fields:

- `external_id`
- `name`
- `winery_name`
- `type`
- `storage_area`
- `size`
- `vintage`
- `price`
- `info`
- `quantity`
- `internal_notes`

---


### Note: This part is always CUSTOM to the client


- external_id OK
- type missing
- name OK
- winery_name OK
- storage_area missing
- size OK
- vintage drop if not a number or is 0
- price -> remove currency symbol and transform to cents
- quantity OK
- info missing
- internal_notes -> create from purchase_price, supplier, retail_price, glass_price 

In [65]:
import math

# create a copy from original wines to avoid modifying data when rerunning this cell
wines = original_wines.copy()

# add empty type
wines['type'] = ''

# add empty storage_area
wines['storage_area'] = ''

# set 0 vintage year to None and NaN vintage year to None
wines['vintage'] = wines['vintage'].apply(lambda x: None if pd.isnull(x) or pd.isna(x) else int(x))

# set price
wines['price'] = wines['price'].apply(lambda x: x.replace('€', '') if pd.notnull(x) else None)
wines['price'] = wines['price'].apply(lambda x: float(x) * 100 if pd.notnull(x) else None)

# add empty info
wines['info'] = ''

# create internal_notes, aggregating purchase_price, supplier, retail_price, glass_price
wines['internal_notes'] = wines.apply(lambda x: f'purchase_price: {x["purchase_price"]}, supplier: {x["supplier"]}, retail_price: {x["retail_price"]}, glass_price: {x["glass_price"]}', axis=1)

wines.head()

Unnamed: 0,external_id,size,name,winery_name,vintage,supplier,purchase_price,price,retail_price,glass_price,quantity,type,storage_area,info,internal_notes
0,1.0,BOTTLE,Le Petit,Weingut Manincor,2018.0,WD,37.40 €,8000.0,75.29 €,€ 13.33,6,,,,"purchase_price: 37.40 € , supplier: WD, retai..."
1,2.0,BOTTLE,"Gewürztraminer Passito ""Cresta""",Weingut Hans Rottensteiner,2017.0,WD,20.90 €,5700.0,42.07 €,€ 9.50,9,,,,"purchase_price: 20.90 € , supplier: WD, retai..."
2,3.0,BOTTLE,Rosenmuskateller Rosis,Kellerei Bozen,2020.0,FW,18.00 €,5300.0,36.23 €,€ 8.83,9,,,,"purchase_price: 18.00 € , supplier: FW, retai..."
3,4.0,BOTTLE,Sweet Claire,Weingut Lieselehof,2017.0,Liesele,39.90 €,7700.0,80.32 €,€ 12.83,5,,,,"purchase_price: 39.90 € , supplier: Liesele, ..."
4,5.0,BOTTLE,Sweet Claire Quintessenz,Weingut Lieselehof,2015.0,Liesele,94.73 €,19900.0,190.69 €,€ 33.17,2,,,,"purchase_price: 94.73 € , supplier: Liesele, ..."


In [66]:
# clean sizes

# set size to BOTTLE if empty
wines['size'] = wines['size'].apply(lambda x: 'BOTTLE' if pd.isnull(x) or pd.isna(x) else x)

print(wines['size'].unique())

['BOTTLE' 'MAGNUM']


In [67]:
wines.head()

Unnamed: 0,external_id,size,name,winery_name,vintage,supplier,purchase_price,price,retail_price,glass_price,quantity,type,storage_area,info,internal_notes
0,1.0,BOTTLE,Le Petit,Weingut Manincor,2018.0,WD,37.40 €,8000.0,75.29 €,€ 13.33,6,,,,"purchase_price: 37.40 € , supplier: WD, retai..."
1,2.0,BOTTLE,"Gewürztraminer Passito ""Cresta""",Weingut Hans Rottensteiner,2017.0,WD,20.90 €,5700.0,42.07 €,€ 9.50,9,,,,"purchase_price: 20.90 € , supplier: WD, retai..."
2,3.0,BOTTLE,Rosenmuskateller Rosis,Kellerei Bozen,2020.0,FW,18.00 €,5300.0,36.23 €,€ 8.83,9,,,,"purchase_price: 18.00 € , supplier: FW, retai..."
3,4.0,BOTTLE,Sweet Claire,Weingut Lieselehof,2017.0,Liesele,39.90 €,7700.0,80.32 €,€ 12.83,5,,,,"purchase_price: 39.90 € , supplier: Liesele, ..."
4,5.0,BOTTLE,Sweet Claire Quintessenz,Weingut Lieselehof,2015.0,Liesele,94.73 €,19900.0,190.69 €,€ 33.17,2,,,,"purchase_price: 94.73 € , supplier: Liesele, ..."


### This part is still CUSTOM but more or less it is needed for all onboardings - This is specific for EnoWeb

In [68]:
# drop duplicates
wines = wines.drop_duplicates()

print(f'Total rows: {wines.shape[0]}')

Total rows: 472


In [69]:
# Define the sizes-to-enum map
sizes = {
    '0.375': 'HALF_BOTTLE',
    '0.5': 'HALF_LITER',
    '0.75': 'BOTTLE',
    '1': 'LITER',
    '1.5': 'MAGNUM',
    '3': 'JEROBOAM',
    '4.5': 'REHOBOAM',
    '5': 'BORDEAUX_JEROBOAM',
    '6': 'MATHUSALEM',
    '9': 'SALMANAZAR',
    '12': 'BALTHAZAR',
    '15': 'NEBUCHADNEZZAR',
    '18': 'MELCHIOR',
    '20': 'SOLOMON',
    '25': 'SOVEREIGN',
    '27': 'GOLIATH',
    '30': 'MELCHIZEDEK'
}

# skip this cuz we already mapped
# wines['size'] = wines['size'].map(sizes)

wines.head()

Unnamed: 0,external_id,size,name,winery_name,vintage,supplier,purchase_price,price,retail_price,glass_price,quantity,type,storage_area,info,internal_notes
0,1.0,BOTTLE,Le Petit,Weingut Manincor,2018.0,WD,37.40 €,8000.0,75.29 €,€ 13.33,6,,,,"purchase_price: 37.40 € , supplier: WD, retai..."
1,2.0,BOTTLE,"Gewürztraminer Passito ""Cresta""",Weingut Hans Rottensteiner,2017.0,WD,20.90 €,5700.0,42.07 €,€ 9.50,9,,,,"purchase_price: 20.90 € , supplier: WD, retai..."
2,3.0,BOTTLE,Rosenmuskateller Rosis,Kellerei Bozen,2020.0,FW,18.00 €,5300.0,36.23 €,€ 8.83,9,,,,"purchase_price: 18.00 € , supplier: FW, retai..."
3,4.0,BOTTLE,Sweet Claire,Weingut Lieselehof,2017.0,Liesele,39.90 €,7700.0,80.32 €,€ 12.83,5,,,,"purchase_price: 39.90 € , supplier: Liesele, ..."
4,5.0,BOTTLE,Sweet Claire Quintessenz,Weingut Lieselehof,2015.0,Liesele,94.73 €,19900.0,190.69 €,€ 33.17,2,,,,"purchase_price: 94.73 € , supplier: Liesele, ..."


### Save new version of the winelist

In [70]:
# drop columns that are not the required ones
# - `external_id`
# - `name`
# - `winery_name`
# - `type`
# - `storage_area`
# - `size`
# - `vintage`
# - `price`
# - `info`
# - `quantity`
# - `internal_notes`

wines = wines[['external_id', 'name', 'winery_name', 'type', 'storage_area', 'size', 'vintage', 'price', 'info', 'quantity', 'internal_notes']]

In [71]:
open('v2-cleaned.csv', 'w').close()

wines.to_csv('v2-cleaned.csv', index=False)

### Print search terms

**NOTE: Before proceeding with the matching algorithm, run the viviner and insert new wines.**

In [23]:
with open('search-terms.txt', 'w') as f:
    for term in wines['name'].unique():
        f.write(f'{term}\n')
    for term in wines['winery_name'].unique():
        f.write(f'{term}\n')