In [105]:
import sys
# Extend the module search path two directories up; presumably that is where
# the local `utils` module imported below lives — confirm against the repo layout.
sys.path.append('../../')
import os

import pandas as pd
from utils import fill_empty, VColumns
from dotenv import load_dotenv
# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load input file

In [106]:
# Read the raw list; the following cells expect an 'input' column holding the
# free-text line to parse.
df1 = pd.read_csv("v1-start.csv")

In [107]:
# Drop rows that do not contain the euro symbol.
# na=False makes missing inputs count as "no euro symbol" so they are dropped
# instead of producing NaN entries that break the boolean mask.
# .copy() detaches the filtered frame so the assignment below writes to an
# independent frame rather than to a slice of the original.
df1 = df1[df1['input'].str.contains('€', regex=False, na=False)].copy()

# Replace tabs with spaces (literal replacement, no regex needed)
df1['input'] = df1['input'].str.replace('\t', ' ', regex=False)

In [108]:
import re

# Expected line layout: [vintage] name size price [€]
#   vintage: optional, either a 4-digit year or a "YY/YY" pair
#   name:    free text (shortest match that lets the rest of the pattern fit)
#   size:    decimal-comma number, e.g. "0,75"
#   price:   decimal-comma amount with two decimals, e.g. "58,00"
pattern = r'(?:(?P<vintage>\d{4}|\d{2}/\d{2})\s+)?(?P<name>.+?)\s+(?P<size>\d+,\d+)\s+(?P<price>\d+,\d{2})\s*€?'

# Parse every line in one vectorised pass. str.extract uses the first match in
# each string and returns one column per named group; rows (or optional groups)
# that do not match come back as missing values. Unlike a row-by-row loop, the
# four columns are always created, even when no row matches.
parsed = df1['input'].str.extract(pattern)
df1 = df1.assign(
    name=parsed['name'],
    size=parsed['size'],
    price=parsed['price'],
    vintage=parsed['vintage'],
)

# 'name' is a mandatory group, so a missing name means the whole line failed to
# parse. Report it here: such rows otherwise flow silently into the output.
unmatched_count = int(parsed['name'].isna().sum())
if unmatched_count:
    print(f"{unmatched_count} row(s) did not match the expected layout and were left unparsed")

In [109]:
# Function to split vintage ranges and create new rows
def split_vintage_range(row):
    """Expand a "YY/YY" vintage into one row per distinct year.

    Args:
        row: A mapping-like record (e.g. a pandas Series) with a 'vintage'
            entry.

    Returns:
        A list of rows. If 'vintage' is not a string containing '/', the list
        holds the original row unchanged. Otherwise it holds one copy of the
        row per distinct year, with 'vintage' rewritten as '20' + the two-digit
        year. A pair with equal halves such as "18/18" therefore yields a
        single row with vintage "2018".

    Raises:
        ValueError: If the vintage contains more than one '/'.
    """
    vintage = row['vintage']
    if not (isinstance(vintage, str) and '/' in vintage):
        return [row]

    first_year, second_year = vintage.split('/')

    expanded = []
    # dict.fromkeys removes a repeated year while keeping the original order.
    for short_year in dict.fromkeys((first_year, second_year)):
        new_row = row.copy()
        # NOTE(review): the century is hard-coded to 20xx, so "98/99" would
        # become 2098/2099 — confirm no pre-2000 vintages use this notation.
        new_row['vintage'] = '20' + short_year
        expanded.append(new_row)
    return expanded

# Run every row through split_vintage_range; each call yields a list of rows
row_groups = (split_vintage_range(source_row) for _, source_row in df1.iterrows())

# Flatten the per-row lists into one frame and renumber it 0..n-1
df_expanded = pd.DataFrame(
    [piece for group in row_groups for piece in group]
).reset_index(drop=True)

df1 = df_expanded

# Start building the output columns

Create a new, empty dataframe with the target columns, so that we can fill in values from the input dataframe without overwriting its columns.

In [110]:
# Preview the parsed rows (name / size / price / vintage) before building the output frame.
df1.head()

Unnamed: 0,input,name,size,price,vintage
0,"Brut Arunda 0,75 58,00 €",Brut Arunda,75,5800,
1,"Brut Arunda 1,5 125,00 €",Brut Arunda,15,12500,
2,"Extra Brut Cuvée Marianna Arunda 0,75 71,00 €",Extra Brut Cuvée Marianna Arunda,75,7100,
3,"Extra Brut Cuvée Marianna Arunda 1,5 145,00 €",Extra Brut Cuvée Marianna Arunda,15,14500,
4,"Brut Rosé Arunda 0,75 64,00 €",Brut Rosé Arunda,75,6400,


In [111]:
# Create a new empty dataframe whose columns are those returned by VColumns.v2().
# It has no rows yet; rows appear once values from df1 are assigned in a later cell.
df = pd.DataFrame(columns=VColumns.v2())
df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible


In [113]:

# copy values from original columns to new columns
# NOTE: `df` starts with zero rows, so statement order matters here. The first
# Series assignment ('name') is what gives `df` its rows; scalar assignments
# made before it have nothing to fill, those made after it fill every row.
df['external_id'] = None
df['type'] = None
df['name'] = df1['name']
df['winery_name'] = None
df["vintage"] = df1["vintage"]
# Every row gets the same fixed quantity.
# NOTE(review): 50 looks like a placeholder stock level — confirm with the data owner.
df["quantity"] = 50
df["internal_notes"] = None
df["visible"] = True

df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
0,,,Brut Arunda,,,,,,,50,,,True
1,,,Brut Arunda,,,,,,,50,,,True
2,,,Extra Brut Cuvée Marianna Arunda,,,,,,,50,,,True
3,,,Extra Brut Cuvée Marianna Arunda,,,,,,,50,,,True
4,,,Brut Rosé Arunda,,,,,,,50,,,True


In [128]:
print(df1['size'].unique())

# size
# Bottle volume in litres (as written in the source list) -> size code.
map_size = {
    '0,75': 'BOTTLE',
    '0,375': 'HALF_BOTTLE',
    '0,5': 'HALF_LITER',
    '1,5': 'MAGNUM',
    '3,0': 'JEROBOAM',
    '5,0': 'BORDEAUX_JEROBOAM'
}

# Match sizes numerically rather than by exact text, so equivalent spellings
# such as "0,750" or "1,50" still resolve to the right code.
size_by_litres = {
    float(label.replace(',', '.')): code for label, code in map_size.items()
}
size_litres = pd.to_numeric(
    df1['size'].astype(str).str.replace(',', '.', regex=False),
    errors='coerce',  # missing / non-numeric sizes become NaN
)
mapped_size = size_litres.map(size_by_litres)

# Report sizes that are present but unknown to map_size; without this they
# would be relabelled BOTTLE by the default below with no trace.
unknown_sizes = df1.loc[df1['size'].notna() & mapped_size.isna(), 'size'].unique()
if len(unknown_sizes):
    print(f"Unrecognised sizes defaulted to BOTTLE: {list(unknown_sizes)}")

# Rows without a usable size fall back to the standard bottle.
df['size'] = mapped_size.fillna("BOTTLE")

print(df['size'].unique())


['0,75' '1,5' nan '3,0' '0,375' '5,0' '0,5']
['BOTTLE' 'MAGNUM' 'JEROBOAM' 'HALF_BOTTLE' 'BORDEAUX_JEROBOAM'
 'HALF_LITER']


In [131]:
# price

# fill empty prices with 0 (rows whose price could not be parsed)
df1['price_eur'] = df1['price'].fillna(0)
df1['purchase_price_eur'] = 0

# convert prices to cents
# Amounts use a decimal comma ("19,99"). round() is required before the value
# becomes an integer: binary floats make 19.99 * 100 evaluate to
# 1998.9999999999998, which plain int() truncation would turn into 1998.
df['price'] = df1['price_eur'].apply(
    lambda x: round(float(str(x).replace('€', '').replace(',', '.')) * 100)
)

df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
0,,,Brut Arunda,,,BOTTLE,,5800,,50,,,True
1,,,Brut Arunda,,,MAGNUM,,12500,,50,,,True
2,,,Extra Brut Cuvée Marianna Arunda,,,BOTTLE,,7100,,50,,,True
3,,,Extra Brut Cuvée Marianna Arunda,,,MAGNUM,,14500,,50,,,True
4,,,Brut Rosé Arunda,,,BOTTLE,,6400,,50,,,True


In [132]:
# Pass the frame through the project helper fill_empty with the v2 column list.
# In the recorded output, 'vintage' and 'purchase_price' come back as 0 afterwards.
# NOTE(review): the meaning of the positional False is not visible here — check utils.fill_empty.
df = fill_empty(df, VColumns.v2(), False)
df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
0,,,Brut Arunda,,,BOTTLE,0,5800,0,50,,,True
1,,,Brut Arunda,,,MAGNUM,0,12500,0,50,,,True
2,,,Extra Brut Cuvée Marianna Arunda,,,BOTTLE,0,7100,0,50,,,True
3,,,Extra Brut Cuvée Marianna Arunda,,,MAGNUM,0,14500,0,50,,,True
4,,,Brut Rosé Arunda,,,BOTTLE,0,6400,0,50,,,True


# Merge files and write output

If there are multiple files, merge them into one and create a single output file.

In [133]:
# Combine the per-file frames into one (currently a single frame; add further
# frames to the list when more input files are processed) and renumber the rows.
df_out = pd.concat([df], ignore_index=True)

# Write the cleaned list without the index column.
df_out.to_csv("v2-cleaned.csv", index=False)

False