# Prepare wines from a dirty list

Load a CSV of wines (aka winelist) and clean the data to get it ready to run it through the matching algorithm.


In [14]:
import pandas as pd

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load dirty winelist file and perform cleaning


In [15]:
wines = pd.read_csv('v1-cleaned.csv')
print(f'Total rows: {wines.shape[0]}')
print()

wines.head()


Total rows: 349



Unnamed: 0,name,size,vintage,price
0,Prosecco Brut DOCG Adami,,,2500.0
1,Cartizze DRY docg Adami,,,5500.0
2,Altemasi Trento Millesimato Cavit,,2018.0,4500.0
3,"Trento DOC Brut 51,151 Moser",,,5900.0
4,Ferrari Perlè F.lli Lunelli,,2017.0,7000.0


We need the following fields:

- `external_id`
- `name`
- `winery_name`
- `type`
- `storage_area`
- `size`
- `vintage`
- `price`
- `info`
- `quantity`
- `internal_notes`

---


### Note: This part is always CUSTOM to the client


- external_id missing
- type missing
- name and winery_name together
- storage_area missing
- size OK for other formats. Set to BOTTLE for empty values
- vintage drop if not a number
- price in cents
- quantity set to 50
- info missing
- internal_notes missing

In [16]:
import math

# copy name to winery_name
wines['winery_name'] = wines['name']

# add empty external_id
wines['external_id'] = ''

# add empty type
wines['type'] = ''

# add empty storage_area
wines['storage_area'] = ''

# add empty info
wines['info'] = ''

# add empty notes
wines['internal_notes'] = ''

# set 0 vintage year to None and NaN vintage year to None
wines['vintage'] = wines['vintage'].apply(lambda x: None if pd.isnull(x) or pd.isna(x) else int(x))

# initial quantity to 50
wines['quantity'] = 50

wines.head()

Unnamed: 0,name,size,vintage,price,winery_name,external_id,type,storage_area,info,internal_notes,quantity
0,Prosecco Brut DOCG Adami,,,2500.0,Prosecco Brut DOCG Adami,,,,,,50
1,Cartizze DRY docg Adami,,,5500.0,Cartizze DRY docg Adami,,,,,,50
2,Altemasi Trento Millesimato Cavit,,2018.0,4500.0,Altemasi Trento Millesimato Cavit,,,,,,50
3,"Trento DOC Brut 51,151 Moser",,,5900.0,"Trento DOC Brut 51,151 Moser",,,,,,50
4,Ferrari Perlè F.lli Lunelli,,2017.0,7000.0,Ferrari Perlè F.lli Lunelli,,,,,,50


In [17]:
# clean sizes

# set size to BOTTLE if empty
wines['size'] = wines['size'].apply(lambda x: 'BOTTLE' if pd.isnull(x) or pd.isna(x) else x)

print(wines['size'].unique())

['BOTTLE' 'HALF_BOTTLE' 'MAGNUM' 'MATHUSALEM' 'JEROBOAM' 'HALF_LITER'
 'BALTHAZAR']


In [18]:
wines.head()

Unnamed: 0,name,size,vintage,price,winery_name,external_id,type,storage_area,info,internal_notes,quantity
0,Prosecco Brut DOCG Adami,BOTTLE,,2500.0,Prosecco Brut DOCG Adami,,,,,,50
1,Cartizze DRY docg Adami,BOTTLE,,5500.0,Cartizze DRY docg Adami,,,,,,50
2,Altemasi Trento Millesimato Cavit,BOTTLE,2018.0,4500.0,Altemasi Trento Millesimato Cavit,,,,,,50
3,"Trento DOC Brut 51,151 Moser",BOTTLE,,5900.0,"Trento DOC Brut 51,151 Moser",,,,,,50
4,Ferrari Perlè F.lli Lunelli,BOTTLE,2017.0,7000.0,Ferrari Perlè F.lli Lunelli,,,,,,50


### This part is still CUSTOM but more or less it is needed for all onboardings - This is specific for EnoWeb

In [19]:
# drop duplicates
wines = wines.drop_duplicates()

print(f'Total rows: {wines.shape[0]}')

Total rows: 349


In [20]:
# Define the sizes-to-enum map
sizes = {
    '0.375': 'HALF_BOTTLE',
    '0.5': 'HALF_LITER',
    '0.75': 'BOTTLE',
    '1': 'LITER',
    '1.5': 'MAGNUM',
    '3': 'JEROBOAM',
    '4.5': 'REHOBOAM',
    '5': 'BORDEAUX_JEROBOAM',
    '6': 'MATHUSALEM',
    '9': 'SALMANAZAR',
    '12': 'BALTHAZAR',
    '15': 'NEBUCHADNEZZAR',
    '18': 'MELCHIOR',
    '20': 'SOLOMON',
    '25': 'SOVEREIGN',
    '27': 'GOLIATH',
    '30': 'MELCHIZEDEK'
}

# skip this cuz we already mapped
# wines['size'] = wines['size'].map(sizes)

wines.head()

Unnamed: 0,name,size,vintage,price,winery_name,external_id,type,storage_area,info,internal_notes,quantity
0,Prosecco Brut DOCG Adami,BOTTLE,,2500.0,Prosecco Brut DOCG Adami,,,,,,50
1,Cartizze DRY docg Adami,BOTTLE,,5500.0,Cartizze DRY docg Adami,,,,,,50
2,Altemasi Trento Millesimato Cavit,BOTTLE,2018.0,4500.0,Altemasi Trento Millesimato Cavit,,,,,,50
3,"Trento DOC Brut 51,151 Moser",BOTTLE,,5900.0,"Trento DOC Brut 51,151 Moser",,,,,,50
4,Ferrari Perlè F.lli Lunelli,BOTTLE,2017.0,7000.0,Ferrari Perlè F.lli Lunelli,,,,,,50


### Save new version of the winelist

In [21]:
# drop columns that are not the required ones
# - `external_id`
# - `name`
# - `winery_name`
# - `type`
# - `storage_area`
# - `size`
# - `vintage`
# - `price`
# - `info`
# - `quantity`
# - `internal_notes`

wines = wines[['external_id', 'name', 'winery_name', 'type', 'storage_area', 'size', 'vintage', 'price', 'info', 'quantity', 'internal_notes']]

In [22]:
open('v2-cleaned.csv', 'w').close()

wines.to_csv('v2-cleaned.csv', index=False)

### Print search terms

**NOTE: Before proceeding with the matching algorithm, run the viviner and insert new wines.**

In [23]:
with open('search-terms.txt', 'w') as f:
    for term in wines['name'].unique():
        f.write(f'{term}\n')
    for term in wines['winery_name'].unique():
        f.write(f'{term}\n')