In [34]:
import sys
sys.path.append('../../')
import os

import pandas as pd
from utils import fill_empty, VColumns
from dotenv import load_dotenv
load_dotenv()

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load input file

In [35]:
df1 = pd.read_csv("v1-start.csv")

# Start elaborating columns

Create a new dataframe with empty values, so that we can start filling in values from the input dataframe, without overriding columns.

In [36]:
# drop lines with empty winery
df1 = df1[df1['winery'].notna()]

df1.head()

Unnamed: 0,winery,name,Unnamed: 2,vintage,internal_notes_1,Unnamed: 5,internal_notes_2,info,price
2,Edouard Duval,BRUT D´EUALIE - MAGNUM - 48 mesi,,s.a,"1,50 x 3 / S",12.00%,Apr-24,,€ 89.90
3,Edouard Duval,BRUT D´EULALIE - DOPPIOMAG. - 48 mesi,,s.a,"3,00 x 1 / S",12.00%,Sep-24,,€ 210.00
4,Edouard Duval,BRUT D´EULALIE - MATHUSALEM - 48 mesi,,s.a,"6,00 x 1 / S",12.00%,Jul-23,,€ 530.00
5,Edouard Duval,NOIR D`EULALIE BRUT NATURE - 72 mesi,,s.a.,"0,75 x 6 / S",12.00%,Mar-23,,"€ 49,90"
6,Edouard Duval,NOIR D`EULALIE EXTRA BRUT - 72 mesi,,s.a.,"0,75 x 6 / S",12.00%,May-23,,€ 47.90


In [37]:
# create new empty dataframe
df = pd.DataFrame(columns=VColumns.v2())
df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible


In [38]:
# copy values from original columns to new columns
df['external_id'] = None
df['type'] = None
df['name'] = df1['name']
df['winery_name'] = df1['winery']
df["vintage"] = df1["vintage"].apply(lambda x: 0 if str(x).lower().strip() in ['s.a.', 's.a'] else x)
df["quantity"] = 50
df["info"] = df1["info"]
df["internal_notes"] = df1["internal_notes_1"] + " | " + df1["internal_notes_2"]
df["visible"] = True

df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
2,,,BRUT D´EUALIE - MAGNUM - 48 mesi,Edouard Duval,,,0,,,50,,"1,50 x 3 / S | Apr-24",True
3,,,BRUT D´EULALIE - DOPPIOMAG. - 48 mesi,Edouard Duval,,,0,,,50,,"3,00 x 1 / S | Sep-24",True
4,,,BRUT D´EULALIE - MATHUSALEM - 48 mesi,Edouard Duval,,,0,,,50,,"6,00 x 1 / S | Jul-23",True
5,,,NOIR D`EULALIE BRUT NATURE - 72 mesi,Edouard Duval,,,0,,,50,,"0,75 x 6 / S | Mar-23",True
6,,,NOIR D`EULALIE EXTRA BRUT - 72 mesi,Edouard Duval,,,0,,,50,,"0,75 x 6 / S | May-23",True


In [39]:
# Extract size from internal_notes_1 by taking first token before space
df1["size"] = df1["internal_notes_1"].fillna("").apply(lambda x: x.split(" ")[0] if x else "missing")

print("\nUnique sizes:")
print(df1["size"].unique())

mapping = {
    "0,75": "BOTTLE",
    "0,75x": "BOTTLE",
    "0,62": "BOTTLE",
    "1,50": "MAGNUM",
    "1,5l": "MAGNUM", 
    "1,5": "MAGNUM",
    "3,00": "MATHUSALEM",
    "3,0": "MATHUSALEM",
    "6,00": "IMPERIAL",
    "6,0": "IMPERIAL",
    "6": "IMPERIAL",
    "0,375": "HALF_BOTTLE",
    "0,375x": "HALF_BOTTLE",
    "0,50": "HALF_BOTTLE",
}

df["size"] = df1["size"].map(mapping)

# print unique sizes
print(df["size"].unique())


Unique sizes:
['1,50' '3,00' '6,00' '0,75' '1,5l' '1,5' '0,375' '0,50' '6' '3,0' '6,0'
 '0,375x' '0,75x' '0,62']
['MAGNUM' 'MATHUSALEM' 'IMPERIAL' 'BOTTLE' 'HALF_BOTTLE']


In [40]:
df1['price'] = df1['price'].fillna(0)
df['price'] = df1['price'].apply(lambda x: int(float(str(x).replace('€', '').replace('*', '').replace(',', ''))*100)) # convert to cents

df['purchase_price'] = df['price'].apply(lambda x: int(x/1.5)) # convert to cents and divide by 1.5

df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
2,,,BRUT D´EUALIE - MAGNUM - 48 mesi,Edouard Duval,,MAGNUM,0,8990,5993,50,,"1,50 x 3 / S | Apr-24",True
3,,,BRUT D´EULALIE - DOPPIOMAG. - 48 mesi,Edouard Duval,,MATHUSALEM,0,21000,14000,50,,"3,00 x 1 / S | Sep-24",True
4,,,BRUT D´EULALIE - MATHUSALEM - 48 mesi,Edouard Duval,,IMPERIAL,0,53000,35333,50,,"6,00 x 1 / S | Jul-23",True
5,,,NOIR D`EULALIE BRUT NATURE - 72 mesi,Edouard Duval,,BOTTLE,0,499000,332666,50,,"0,75 x 6 / S | Mar-23",True
6,,,NOIR D`EULALIE EXTRA BRUT - 72 mesi,Edouard Duval,,BOTTLE,0,4790,3193,50,,"0,75 x 6 / S | May-23",True


In [41]:
df = fill_empty(df, VColumns.v2(), False)
df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
2,,,BRUT D´EUALIE - MAGNUM - 48 mesi,Edouard Duval,,MAGNUM,0,8990,5993,50,,"1,50 x 3 / S | Apr-24",True
3,,,BRUT D´EULALIE - DOPPIOMAG. - 48 mesi,Edouard Duval,,MATHUSALEM,0,21000,14000,50,,"3,00 x 1 / S | Sep-24",True
4,,,BRUT D´EULALIE - MATHUSALEM - 48 mesi,Edouard Duval,,IMPERIAL,0,53000,35333,50,,"6,00 x 1 / S | Jul-23",True
5,,,NOIR D`EULALIE BRUT NATURE - 72 mesi,Edouard Duval,,BOTTLE,0,499000,332666,50,,"0,75 x 6 / S | Mar-23",True
6,,,NOIR D`EULALIE EXTRA BRUT - 72 mesi,Edouard Duval,,BOTTLE,0,4790,3193,50,,"0,75 x 6 / S | May-23",True


# Merge files and write output

If there are multiple files, merge them into one and create single output file.

In [42]:
df.head()

df_out = pd.concat([df], ignore_index=True)
df_out.to_csv("v2-cleaned.csv", index=False)

bool()

False