# Prepare wines from a dirty list

Load a CSV of wines (the "winelist") and clean the data so that it is ready to be run through the matching algorithm.


In [55]:
import pandas as pd

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load dirty winelist file and perform cleaning


In [56]:
# Read the input winelist CSV into a DataFrame.
source_csv = 'v1-cleaned.csv'
wines = pd.read_csv(source_csv)

# Report the number of loaded rows, followed by one blank separator line.
row_count = len(wines.index)
print(f'Total rows: {row_count}', end='\n\n')

# Preview the first rows (rich display as the cell's last expression).
wines.head(n=5)


Total rows: 133



Unnamed: 0,full_details,size
0,Domaine parent corton les renardes grand cru 1...,BOTTLE
1,Laboure roi clos de vougeot grand cru 2003 +1,BOTTLE
2,Laboure roi echezeaux grand cru 2003 +1,BOTTLE
3,Laboure roi pommard les rugiens 1er cru 2003 +2,BOTTLE
4,Jaboulet vercherre charmes chambertin 1982+1,BOTTLE


We need the following fields:

- `external_id`
- `name`
- `winery_name`
- `type`
- `storage_area`
- `size`
- `vintage`
- `price`
- `info`
- `quantity`
- `internal_notes`

---


### Note: This part is always CUSTOM to the client


Drop rows without `full_details` or without a price (note: this particular file has no price column at all — see "Other fields" below).

I need to extract some data from `full_details`, such as:
- name
- winery_name
- vintage
- size

Observations for `full_details` field:
- format: `<winery and wine name> <year> +<quantity>`, e.g. `Laboure roi echezeaux grand cru 2003 +1` (winery and wine name are not separated)

Other fields:
- external_id missing
- type missing
- storage_area missing
- price missing


In [57]:
import re

# First run of four digits is taken as the vintage year.
VINTAGE_PATTERN = re.compile(r'\d{4}')
# Quantity is the number that follows a '+' sign (optional spaces in between).
QUANTITY_PATTERN = re.compile(r'\+\s*(\d+)')


def _split_full_details(full_details):
    """Split one `full_details` string into (vintage, quantity, label).

    Args:
        full_details: raw text such as 'Laboure roi echezeaux grand cru 2003 +1'.

    Returns:
        A tuple `(vintage, quantity, label)` where
        - `vintage` is the matched 4-digit string, or None when absent,
        - `quantity` is the int after the '+' sign, or 0 when absent,
        - `label` is the remaining text with the vintage and quantity tokens
          removed and whitespace collapsed/trimmed.
    """
    remaining = full_details
    vintage = None
    quantity = 0

    vintage_match = VINTAGE_PATTERN.search(remaining)
    if vintage_match:
        vintage = vintage_match.group(0)
        # Cut out exactly the matched span so the later steps see the update.
        remaining = remaining[:vintage_match.start()] + ' ' + remaining[vintage_match.end():]

    quantity_match = QUANTITY_PATTERN.search(remaining)
    if quantity_match:
        quantity = int(quantity_match.group(1))
        remaining = remaining[:quantity_match.start()] + ' ' + remaining[quantity_match.end():]

    # Removing tokens leaves double/trailing spaces behind; normalise them.
    label = ' '.join(remaining.split())
    return vintage, quantity, label


# Rows without `full_details` cannot be parsed (re.search would raise on NaN),
# so they are dropped up front.
wines = wines.dropna(subset=['full_details']).copy()

# Parse every row once. The previous loop read each value back from the row
# yielded by `iterrows()`, which is a copy for a mixed-dtype frame: the
# quantity step and the name columns therefore used the *original* text and
# undid the vintage removal. Working on a local string avoids that.
parsed = pd.DataFrame(
    [_split_full_details(str(text)) for text in wines['full_details']],
    columns=['vintage', 'quantity', 'label'],
    index=wines.index,
)

wines['full_details'] = parsed['label']
wines['vintage'] = parsed['vintage']
wines['quantity'] = parsed['quantity']

# Wine and winery name are not distinguishable in full_details, so both
# columns receive the same cleaned label.
wines['winery_name'] = parsed['label']
wines['name'] = parsed['label']

wines.head()


Unnamed: 0,full_details,size,vintage,quantity,winery_name,name
0,Domaine parent corton les renardes grand cru,BOTTLE,1998,2,Domaine parent corton les renardes grand cru,Domaine parent corton les renardes grand cru
1,Laboure roi clos de vougeot grand cru,BOTTLE,2003,1,Laboure roi clos de vougeot grand cru,Laboure roi clos de vougeot grand cru
2,Laboure roi echezeaux grand cru,BOTTLE,2003,1,Laboure roi echezeaux grand cru,Laboure roi echezeaux grand cru
3,Laboure roi pommard les rugiens 1er cru,BOTTLE,2003,2,Laboure roi pommard les rugiens 1er cru,Laboure roi pommard les rugiens 1er cru
4,Jaboulet vercherre charmes chambertin,BOTTLE,1982,1,Jaboulet vercherre charmes chambertin,Jaboulet vercherre charmes chambertin


In [58]:

# Fields required by the target schema that this client's file does not
# provide. Each one is added as a constant placeholder column, in this order:
# empty strings for the text fields and 0 for the price.
placeholder_defaults = {
    'external_id': '',
    'type': '',
    'storage_area': '',
    'info': '',
    'price': 0,
    'internal_notes': '',
}

for column_name, default_value in placeholder_defaults.items():
    wines[column_name] = default_value


In [59]:
# List the distinct values of `size` to check which ones need cleaning.
distinct_sizes = wines['size'].unique()
print(distinct_sizes)

['BOTTLE' 'MAGNUM']


In [60]:
# Preview the first five rows with all required columns in place.
wines.head(n=5)


Unnamed: 0,full_details,size,vintage,quantity,winery_name,name,external_id,type,storage_area,info,price,internal_notes
0,Domaine parent corton les renardes grand cru,BOTTLE,1998,2,Domaine parent corton les renardes grand cru,Domaine parent corton les renardes grand cru,,,,,0,
1,Laboure roi clos de vougeot grand cru,BOTTLE,2003,1,Laboure roi clos de vougeot grand cru,Laboure roi clos de vougeot grand cru,,,,,0,
2,Laboure roi echezeaux grand cru,BOTTLE,2003,1,Laboure roi echezeaux grand cru,Laboure roi echezeaux grand cru,,,,,0,
3,Laboure roi pommard les rugiens 1er cru,BOTTLE,2003,2,Laboure roi pommard les rugiens 1er cru,Laboure roi pommard les rugiens 1er cru,,,,,0,
4,Jaboulet vercherre charmes chambertin,BOTTLE,1982,1,Jaboulet vercherre charmes chambertin,Jaboulet vercherre charmes chambertin,,,,,0,


### This part is still CUSTOM, but some version of it is needed for every onboarding

In [61]:
# Drop duplicate wines.
# The leftover 'Unnamed: N' column(s) are just the row index written by an
# earlier CSV export; they are unique per row, so comparing whole rows could
# never find a duplicate. Compare on the real data columns only.
dedupe_columns = [
    column for column in wines.columns
    if not str(column).startswith('Unnamed:')
]
# Fall back to all columns (subset=None) if nothing but index columns exist.
wines = wines.drop_duplicates(subset=dedupe_columns or None)

print(f'Total rows: {wines.shape[0]}')

Total rows: 133


### Save new version of the winelist

In [62]:
# Write the cleaned winelist without the raw `full_details` column.
# The drop is done on a temporary copy rather than in place, so `wines` is
# left untouched and this cell can be re-run without a KeyError.
# `to_csv` opens the file in write mode and overwrites any existing content,
# so no separate truncation step is needed.
wines.drop(columns=['full_details']).to_csv('v2-cleaned.csv', index=False)

### Print search terms

**NOTE: Before proceeding with the matching algorithm, run the viviner and insert new wines.**

In [63]:
# Collect search terms from both name columns: wine names first, then winery
# names. Missing values and blank strings are skipped, and each term is kept
# only once (first occurrence wins) -- `name` and `winery_name` hold the same
# text for this client, so writing both columns verbatim duplicated every line.
search_terms = (
    pd.concat([wines['name'], wines['winery_name']], ignore_index=True)
    .dropna()
    .astype(str)
    .str.strip()
)
search_terms = search_terms[search_terms != ''].drop_duplicates()

# Explicit UTF-8 so accented wine names are written the same on every platform.
with open('search-terms.txt', 'w', encoding='utf-8') as f:
    f.writelines(f'{term}\n' for term in search_terms)