# Prepare wines from a dirty list

Load a CSV of wines (aka winelist) and clean the data to get it ready to run it through the matching algorithm.


In [473]:
import pandas as pd

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load dirty winelist file and perform cleaning


In [474]:
wines_original = pd.read_csv('v1-cleaned.csv')
print(f'Total rows: {wines_original.shape[0]}')
print()

wines_original.head()


Total rows: 928



Unnamed: 0,external_id,name_one,name_two,size,winery_name,country,vintage,currency,price,quantity,Unnamed: 10
0,1,Demi: 375 ml,,,,,,,,,
1,2,Champagne,Rosé,375 ml,Ruinart,FRA,,€,"€ 85,00",23.0,
2,3,Champagne,Blanc de Blanc,375 ml,Ruinart,FRA,,€,"€ 80,00",19.0,
3,4,Venegazzù della Casa,,375 ml,Loredan e Gasperin,ITA,17.0,€,"€ 25,00",,
4,5,Santa Maddalena,,375 ml,Cantina di Bolzano,ITA,21.0,€,"€ 15,00",,


### Note: This part is always CUSTOM to the client


- Merge `name_one` and `name_two`.

Other fields:
- `external_id`: OK
- `name`: merge name_one + name_two
- `winery_name`: substitute nan with ''
- `type`: missing
- `storage_area`: missing
- `size`: drop "al bicchiere" rows, to parse
- `vintage`: to parse
- `price`: to parse
- `info`: missing
- `quantity`: to parse
- `internal_notes`: missing
- `country`: extra field, is already in DB



In [475]:
wines = wines_original.copy()
rows_to_drop = pd.DataFrame(columns=list(wines.columns)+['reason'])

def add_to_drop(condition, reason=''):
    to_drop = wines_original[condition]
    # Only keep rows that are not already in dropped_rows
    to_drop = to_drop[~to_drop.index.isin(rows_to_drop.index)]
    to_drop['reason'] = reason
    return pd.concat([rows_to_drop, to_drop])

import re

# external id ok

# merge name_one and name_two into name
wines['name'] = wines['name_one'].fillna('') + ' ' + wines['name_two'].fillna('')
wines['name'] = wines['name'].apply(lambda x: re.sub(r'\s+', ' ', x).strip().lower())

wines = wines.drop(columns=['name_one', 'name_two'])

# winery_name: substitute nan with empty string
wines_original['winery_name'].unique()[:3]

array([nan, 'Ruinart', 'Loredan e Gasperin'], dtype=object)

In [476]:
wines['winery_name'] = wines_original['winery_name'].fillna('')
wines['winery_name'].unique()[:3]

array(['', 'Ruinart', 'Loredan e Gasperin'], dtype=object)

In [477]:
# type: missing
wines['type'] = ''

# storage_area: missing
wines['storage_area'] = ''

In [478]:
# `size`: drop "al bicchiere" rows
wines_original['size'].unique()

array([nan, '375 ml', 'Magnum', 'Jéroboam', '750 ml', '500 ml',
       'al bicchiere'], dtype=object)

In [479]:
sizes = {
    '375 ml': 'HALF_BOTTLE',
    '500 ml': 'HALF_LITER',
    '750 ml': 'BOTTLE',
    'Magnum': 'MAGNUM',
    'Jéroboam': 'JEROBOAM',
    'al bicchiere': 'al bicchiere'
}

rows_to_drop = add_to_drop(wines_original['size'] == 'al bicchiere', 'size is "al bicchiere"')

wines['size'] = wines_original['size'].apply(lambda x: '750 ml' if pd.isna(x) else x)
wines['size'] = wines['size'].map(sizes)
wines['size'].unique()

array(['BOTTLE', 'HALF_BOTTLE', 'MAGNUM', 'JEROBOAM', 'HALF_LITER',
       'al bicchiere'], dtype=object)

In [480]:
# vintage: to parse
# set 0 vintage year to None and NaN vintage year to None
wines['vintage'] = wines_original['vintage'].apply(lambda x: None if pd.isnull(x) or pd.isna(x) else re.sub('\D', '', x))
wines['vintage'].unique()

array([None, '17', '21', '2122', '1920', '20', '19', '04', '8', '14',
       '10', '2008', '12', '13', '2005', '16', '18', '6', '19792019',
       '22', '23', '5', '68', '09', '11', '15', '06', '171920', '1975',
       '7', '2', '1719', '1720', '1821', '2016', '73', '2021', '9', '3',
       '97', '1', '95', '96', '98', '90', '99'], dtype=object)

In [481]:
# correct vintage and split vintages with multiple years
# later I am gonna use them to create multiple rows
wines['vintage'] = wines['vintage'].apply(lambda x: ('2021', '2022') if x == '2122' else x)
wines['vintage'] = wines['vintage'].apply(lambda x: ('19', '20') if x == '1920' else x)
wines['vintage'] = wines['vintage'].apply(lambda x: ('1979', '2019') if x == '19792019' else x)
wines['vintage'] = wines['vintage'].apply(lambda x: ('20', '21') if x == '2021' else x)
wines['vintage'] = wines['vintage'].apply(lambda x: ('17', '19', '20') if x == '171920' else x)
wines['vintage'] = wines['vintage'].apply(lambda x: ('17', '19') if x == '1719' else x)
wines['vintage'] = wines['vintage'].apply(lambda x: ('17', '20') if x == '1720' else x)
wines['vintage'] = wines['vintage'].apply(lambda x: ('18', '21') if x == '1821' else x)
wines['vintage'] = wines['vintage'].apply(lambda x: ('13', '19') if x == '1319' else x)

wines['vintage'].unique()

array([None, '17', '21', ('2021', '2022'), ('19', '20'), '20', '19', '04',
       '8', '14', '10', '2008', '12', '13', '2005', '16', '18', '6',
       ('1979', '2019'), '22', '23', '5', '68', '09', '11', '15', '06',
       ('17', '19', '20'), '1975', '7', '2', ('17', '19'), ('17', '20'),
       ('18', '21'), '2016', '73', ('20', '21'), '9', '3', '97', '1',
       '95', '96', '98', '90', '99'], dtype=object)

In [482]:
wines.loc[wines['vintage']==('1979', '2019')]

Unnamed: 0,external_id,size,winery_name,country,vintage,currency,price,quantity,Unnamed: 10,name,type,storage_area
89,90,BOTTLE,Loredan Gasparini,ITA,"(1979, 2019)",€,"€ 45,00",,,spumante extra brut,,


In [483]:
# price: to parse

wines_original['price'].unique()

array([nan, '€ 85,00', '€ 80,00', '€ 25,00', '€ 15,00', '€ 24,00',
       '€ 18,00', '€ 23,00', '€ 130,00', '€ 27,00', '€ 42,00', '€ 16,00',
       '€ 95,00', '€ 110,00', '€ 230,00', '€ 160,00', '€ 850,00',
       '€ 400,00', '€ 380,00', '€ 90,00', '€ 450,00', '€ 900,00',
       '€ 210,00', '€ 220,00', '€ 420,00', '€ 140,00', '€ 320,00',
       '€ 240,00', '€ 190,00', '€ 150,00', '€ 50,00', '€ 105,00',
       '€ 54,00', '€ 100,00', '€ 60,00', '€ 65,00', '€ 250,00', '€ 38,00',
       '€ 45,00', '€ 48,00', '€ 30,00', '€ 28,00', '€ 70,00', '€ 33,00',
       '€ 64,00', '€ 120,00', '€ 39,00', '€ 40,00', '€ 29,00', '€ 26,00',
       '€ 36,00', '€ 98,00', '€ 19,00', '€ 56,00', '€ 21,00', '€ 55,00',
       '€ 63,00', '€ 74,00', '€ 75,00', '€ 35,00', '€ 34,00', '€ 52,00',
       '€ 104,00', '€ 82,00', '€ 78,00', '€ 20,00', '€ 72,00', '€ 73,00',
       '€ 66,00', '€ 88,00', '€ 270,00', '€ 340,00', '€ 407,00',
       '€ 410,00', '€ 860,00', '€ 890,00', '€ 180,00', '€ 260,00',
       '€ 58,00', '€

In [484]:
rows_to_drop = add_to_drop(wines_original['price'] == 'PREZZI AL BICCHIERE', reason='price is "PREZZI AL BICCHIERE"')

In [485]:
wines['price'] = wines_original['price'].apply(lambda x: x.replace('.', '').replace(',', '.').replace('€', '').strip() if not pd.isna(x) else x)
wines['price'] = wines['price'].apply(lambda x: float(x) if not pd.isna(x) and '.' in x else x)
wines['price'].unique()

array([nan, 85.0, 80.0, 25.0, 15.0, 24.0, 18.0, 23.0, 130.0, 27.0, 42.0,
       16.0, 95.0, 110.0, 230.0, 160.0, 850.0, 400.0, 380.0, 90.0, 450.0,
       900.0, 210.0, 220.0, 420.0, 140.0, 320.0, 240.0, 190.0, 150.0,
       50.0, 105.0, 54.0, 100.0, 60.0, 65.0, 250.0, 38.0, 45.0, 48.0,
       30.0, 28.0, 70.0, 33.0, 64.0, 120.0, 39.0, 40.0, 29.0, 26.0, 36.0,
       98.0, 19.0, 56.0, 21.0, 55.0, 63.0, 74.0, 75.0, 35.0, 34.0, 52.0,
       104.0, 82.0, 78.0, 20.0, 72.0, 73.0, 66.0, 88.0, 270.0, 340.0,
       407.0, 410.0, 860.0, 890.0, 180.0, 260.0, 58.0, 37.0, 68.0, 185.0,
       145.0, 32.0, 170.0, 840.0, 125.0, 330.0, 132.0, 86.0, 200.0, 360.0,
       57.0, 123.0, 22.0, 62.0, 99.0, 46.0, 155.0, 280.0, 300.0, 650.0,
       2000.0, 580.0, 31.0, 1.25, 51.0, 126.0, 1250.0, 1200.0, 94.0,
       480.0, 520.0, 10.0, 1000.0, 8.0, 7.0, 6.0, 'PREZZI AL BICCHIERE',
       5.0, 12.0, 14.0, 9.0], dtype=object)

In [486]:
# add empty info
wines['info'] = ''

In [487]:
# parse quantity
wines_original['quantity'].unique()

array([nan, '23', '19', '10', '4', '2', '1 ‘04, 2 ‘08', '3', '13', '83',
       '14', '1', '12', '2 166 ed, 2 Grande cuvee pre 08',
       '10 244, 3 242, 2 243', '5', '6', '9', '27', '13 ‘14', '173',
       '7 n. 023, 5 n. 024', '8', '16', '1 ‘08, 1 ‘10', '15', '7', '82',
       '9 ‘21, 11 ‘20, 27 ’22', '42', '24', '44', '27 ‘22, 1 ‘21',
       '2 ‘21 5 ‘22', '17', '3 ‘18, 3 ‘19, 13 ‘21 ', '3 ‘21, 8 ‘22',
       '2 ‘18, 3 ‘19, 4 ‘20', '20', '9 ‘20, 19 ‘22', '1 ‘21, 14 ‘22',
       '5 ‘20, 5 ‘22', '19 ‘22, 2 ‘20 ', '2 ‘21, 32 ‘22', '11', '32',
       '5 ‘21, 5 ‘22', '2 ‘20, 6 ‘22', '2 ‘19, 4 ‘21, 11 ‘22',
       '3 ‘22, 18 ‘23', '1 ‘18, 8 ‘21', '1 ‘21, 6 ‘22', '5 ‘19, 6 ‘21',
       '1 ‘20, 1 ‘21', 'Togliere', '2 ‘19, 9 ‘21 ', '4 ‘18, 3 ‘20',
       '12 ‘19, 26 ‘20', '2 ‘19, 3 ‘20', ' 2 ‘20 64 ‘21', '18 ‘20, 6 ‘21',
       '1 ‘19, 1, ‘18', '4 ‘18, 8 ‘19', '4 ‘21, 2 ‘22', '3 ‘04, 19 ‘05',
       '3’ 20, 1 ‘17', '3 ‘21, 5 ‘22', '4 ‘21, 6 ‘22', '1 ‘13, 1 ‘16',
       '2 ‘10, 1 ‘16', '1 ‘1

In [488]:
wines['quantity'] = wines_original['quantity'].apply(lambda x: '0' if pd.isna(x) else x)

rows_to_drop = add_to_drop(wines_original['quantity'].apply(lambda x: len(str(x))) > 4, reason='quantity is too long')
rows_to_drop = add_to_drop(wines['quantity'].apply(lambda x: ',' in x), reason='quantity has a decimal')

# convet to int
wines['quantity'] = wines['quantity'].drop(rows_to_drop.index).apply(lambda x: int(x))
wines['quantity'].unique()


array([  0.,  23.,  19.,  10.,   4.,   2.,  nan,   3.,  13.,  83.,  14.,
         1.,  12.,   5.,   6.,   9.,  27., 173.,   8.,  16.,  15.,   7.,
        82.,  42.,  24.,  44.,  17.,  20.,  11.,  32.,  22.,  25.,  18.,
        37.,  21.,  31.,  36.,  41.])

In [489]:
# internal_notes: missing
wines['internal_notes'] = ''

### Save new version of the winelist

In [490]:
# drop columns that are not the required ones
# - `external_id`
# - `name`
# - `winery_name`
# - `type`
# - `storage_area`
# - `size`
# - `vintage`
# - `price`
# - `info`
# - `quantity`
# - `internal_notes`

wines_out = wines[['external_id', 'name', 'winery_name', 'type', 'storage_area', 'size', 'vintage', 'price', 'info', 'quantity', 'internal_notes']]

wines_out_cleaned = wines_out.drop(rows_to_drop.index).explode('vintage')
open('v2-cleaned.csv', 'w').close()
wines_out_cleaned.to_csv('v2-cleaned.csv', index=False)

open('v2-dropped.csv', 'w').close()
wines_out_dropped = wines_out.loc[rows_to_drop.index].explode('vintage')
wines_out_dropped['reason'] = rows_to_drop['reason']
wines_out_dropped.to_csv('v2-dropped.csv', index=False)

# wines_dropped = wines.loc[rows_to_drop.index]
# wines
# rows_to_drop.to_csv('v2-dropped.csv', index=False)

### Print search terms

**NOTE: Before proceeding with the matching algorithm, run the viviner and insert new wines.**

In [491]:
with open('search-terms.txt', 'w') as f:
    for term in wines['name'].unique():
        f.write(f'{term}\n')
    for term in wines['winery_name'].unique():
        f.write(f'{term}\n')

In [496]:
wines_out.loc[[15,16,17]]

Unnamed: 0,external_id,name,winery_name,type,storage_area,size,vintage,price,info,quantity,internal_notes
15,16,champagne & ...,,,,BOTTLE,,,,0.0,
16,17,champagne cuvée brut,Laurent Perrier,,,BOTTLE,,80.0,,2.0,
17,18,champagne millesimato,Laurent Perrier,,,BOTTLE,4.0,95.0,,,
