In [78]:
import sys
sys.path.append('../../')
import os

import pandas as pd
from utils import fill_empty, VColumns
from dotenv import load_dotenv
load_dotenv()

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load input file

In [79]:
# Read the raw input CSV; the path is relative to the current working directory.
df1 = pd.read_csv("v1-start.csv")

# Start elaborating columns

Create a new dataframe with empty values, so that we can fill it in from the input dataframe without overwriting the input's original columns.

In [80]:
# Preview the first rows of the raw input before transforming it.
df1.head()

Unnamed: 0,external_id,full_details,type,purchase_price_eur,sales_price_eur
0,v.69999.2020.c,"Abbazia di Novacella, Cor unum et anima una, 2...",WHITE,,114.0
1,v.13842.2022.b,"Abbazia di Novacella, Grüner Veltliner Praepos...",WHITE,,35.0
2,v.13841.2023.b,"Abbazia di Novacella, Kerner Praepositus, 2023...",WHITE,,35.0
3,v.6142.2022.b,"Abbazia di Novacella, Riesling Praepositus, 20...",WHITE,,37.0
4,v.100767.2019.b,"Acham-Magin, Riesling Ungeheuer, 2019, 750 ml",WHITE,,91.0


In [81]:
# Create an empty dataframe (no rows yet) whose columns are the ones returned
# by VColumns.v2(); the cells below fill it in column by column.
df = pd.DataFrame(columns=VColumns.v2())
df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible


In [82]:
# Carry the identifying columns over from the input frame unchanged.
for source_column in ('external_id', 'type'):
    df[source_column] = df1[source_column]

# Every row gets the same constant for the remaining bookkeeping columns.
constant_columns = {
    "quantity": 200,
    "storage_area": None,
    "info": None,
    "internal_notes": None,
    "visible": True,
}
for target_column, constant in constant_columns.items():
    df[target_column] = constant

df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
0,v.69999.2020.c,WHITE,,,,,,,,200,,,True
1,v.13842.2022.b,WHITE,,,,,,,,200,,,True
2,v.13841.2023.b,WHITE,,,,,,,,200,,,True
3,v.6142.2022.b,WHITE,,,,,,,,200,,,True
4,v.100767.2019.b,WHITE,,,,,,,,200,,,True


In [83]:
def convert_price_to_cents(x):
    """Convert a price expressed in euros to an integer number of cents.

    Parameters
    ----------
    x : Any
        Price as a number or a string. A '€' sign, surrounding whitespace and
        commas are removed before parsing, so a comma is treated as a
        thousands separator (``"1,234.50"`` -> ``123450``).

    Returns
    -------
    int
        The price in cents, rounded to the nearest cent. ``0`` is returned for
        missing values, blank strings and anything that cannot be parsed.
    """
    if pd.isna(x) or str(x).strip() == '':
        return 0
    try:
        euros = float(str(x).replace('€', '').replace(',', '').strip())
        # Round instead of truncating: with binary floats 19.99 * 100 evaluates
        # to 1998.9999999999998, which a bare int() would cut down to 1998.
        return int(round(euros * 100))
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers non-finite input such as "inf".
        return 0

# Convert both euro price columns to integer cents.
# convert_price_to_cents already maps missing values to 0, so the raw input
# frame is left untouched instead of having its NaNs overwritten with ''.
df['price'] = df1['sales_price_eur'].apply(convert_price_to_cents)
df['purchase_price'] = df1['purchase_price_eur'].apply(convert_price_to_cents)

df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
0,v.69999.2020.c,WHITE,,,,,,11400,0,200,,,True
1,v.13842.2022.b,WHITE,,,,,,3500,0,200,,,True
2,v.13841.2023.b,WHITE,,,,,,3500,0,200,,,True
3,v.6142.2022.b,WHITE,,,,,,3700,0,200,,,True
4,v.100767.2019.b,WHITE,,,,,,9100,0,200,,,True


### This part is specific to EnoWeb

In [84]:
# Lookup table from a size code to the bottle-size enum name.
# NOTE(review): the numeric keys (4.5, 9, 12, ...) can never match the string
# tokens produced by splitting external_id on '.'; confirm how those larger
# formats are encoded in the source data.
sizes_map = dict([
    ('a', 'HALF_BOTTLE'),
    ('m', 'HALF_LITER'),
    ('b', 'BOTTLE'),
    ('o', 'LITER'),
    ('c', 'MAGNUM'),
    ('u', 'MAGNUM'),
    ('d', 'JEROBOAM'),
    (4.5, 'REHOBOAM'),
    ('q', 'BORDEAUX_JEROBOAM'),
    ('f', 'MATHUSALEM'),
    (9, 'SALMANAZAR'),
    (12, 'BALTHAZAR'),
    (15, 'NEBUCHADNEZZAR'),
    (18, 'MELCHIOR'),
    (20, 'SOLOMON'),
    (25, 'SOVEREIGN'),
    (27, 'GOLIATH'),
    (30, 'MELCHIZEDEK'),
])

In [85]:
# Enoweb encodes the remaining details in two columns. Layout inferred from
# the sample rows (confirm against the full export):
#   full_details: "<winery>, <wine name>, <vintage>, <volume>"
#   external_id:  "v.<id>.<vintage>.<size code>"
detail_tokens = df1['full_details'].str.split(',')
id_tokens = df1['external_id'].str.split('.')

# Validate up front so a bad row is reported by index instead of surfacing as
# a bare IndexError halfway through, with df only partially filled.
# Missing values have a NaN token count, which compares False and is flagged too.
well_formed = (detail_tokens.str.len() >= 2) & (id_tokens.str.len() >= 4)
if not well_formed.all():
    raise ValueError(
        "Rows with malformed full_details/external_id: "
        f"{df1.index[~well_formed].tolist()}"
    )

size_codes = id_tokens.str[3]
unknown_codes = sorted(set(size_codes) - set(sizes_map))
if unknown_codes:
    raise KeyError(f"Size codes missing from sizes_map: {unknown_codes}")

# Winery and wine name: strip the blank left behind by the ", " separator.
df['winery_name'] = detail_tokens.str[0].str.strip()
df['name'] = detail_tokens.str[1].str.strip()

# Vintage (kept as the string token) and size enum come from external_id.
df['vintage'] = id_tokens.str[2]
df['size'] = size_codes.map(sizes_map)


In [86]:
# NOTE(review): fill_empty is imported from the project's utils module and is
# not shown here; presumably it fills the still-empty cells of the
# VColumns.v2() columns. Confirm there what the positional False flag controls.
df = fill_empty(df, VColumns.v2(), False)
df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
0,v.69999.2020.c,WHITE,Cor unum et anima una,Abbazia di Novacella,,MAGNUM,2020,11400,0,200,,,True
1,v.13842.2022.b,WHITE,Grüner Veltliner Praepositus,Abbazia di Novacella,,BOTTLE,2022,3500,0,200,,,True
2,v.13841.2023.b,WHITE,Kerner Praepositus,Abbazia di Novacella,,BOTTLE,2023,3500,0,200,,,True
3,v.6142.2022.b,WHITE,Riesling Praepositus,Abbazia di Novacella,,BOTTLE,2022,3700,0,200,,,True
4,v.100767.2019.b,WHITE,Riesling Ungeheuer,Acham-Magin,,BOTTLE,2019,9100,0,200,,,True


# Merge files and write output

If there are multiple files, merge them into one and create a single output file.

In [87]:
# NOTE(review): this head() is not the last expression of the cell, so its
# result is discarded and nothing is displayed for it.
df.head()

# Concatenate the cleaned frame(s) into one frame with a fresh 0..n-1 index
# (only a single frame is passed here) and write it out without the index column.
df_out = pd.concat([df], ignore_index=True)
df_out.to_csv("v2-cleaned.csv", index=False)

# NOTE(review): bool() evaluates to False, which is the only thing this cell
# displays; it looks like a leftover and can probably be removed.
bool()

False