In [1]:
import sys
sys.path.append('../../')
import os

import pandas as pd
from utils import fill_empty, VColumns
from dotenv import load_dotenv
load_dotenv()

%load_ext autoreload
%autoreload 2

# Load input file

In [2]:
df1 = pd.read_csv("v1-start.csv")

In [9]:
# Function to split vintage ranges and create new rows
def split_vintage_range(row):
    if isinstance(row['vintage'], str) and '/' in row['vintage']:
        # Get the two vintages
        vintage1, vintage2 = row['vintage'].split('/')

        if vintage1 == vintage2:
            return [row]
        
        # Create two rows with different vintages
        row1 = row.copy()
        row2 = row.copy()
        
        row1['vintage'] = vintage1
        row2['vintage'] = vintage2
        
        return [row1, row2]
    return [row]

# Apply the splitting function and explode the dataframe
df_expanded = pd.DataFrame([
    new_row for _, row in df1.iterrows() 
    for new_row in split_vintage_range(row)
])

# Reset index after expansion
df_expanded = df_expanded.reset_index(drop=True)

df1 = df_expanded

# Start elaborating columns

Create a new dataframe with empty values, so that we can start filling in values from the input dataframe, without overriding columns.

In [10]:
df1.head(8)

Unnamed: 0,name,winery,vintage,size,sales_price_eur,purchase_price,storage_area
0,Piris Prosecco (Extra Dry),Sacchetto,/,0.75,€ 39.00,€ 4.22,Hauptlager
1,Spumante „Bella Glamour Zero Zero“,Iris Vigneti,/,0.75,€ 31.00,€ 5.50,Hauptlager
2,Sekt Brut „Praeclarus“,Kellerei St. Pauls,/,0.75,€ 46.00,€ 14.90,Hauptlager
3,Sekt Brut Rosé,Arunda - Reiterer,/,0.75,€ 53.00,€ 20.50,Hauptlager
4,Sekt Extra Brut Riserva,Arunda - Reiterer,/,0.75,€ 59.00,€ 22.50,Hauptlager
5,Brut Riserva Methius,Dorigati,2015,0.75,€ 69.00,€ 27.80,Hauptlager
6,Brut Riserva Methius,Dorigati,2016,0.75,€ 69.00,€ 27.80,Hauptlager
7,Prosecco Brut Rosé Bio,Corvezzo,/,0.75,€ 38.00,€ 5.95,Hauptlager


In [11]:
# create new empty dataframe
df = pd.DataFrame(columns=VColumns.v2())
df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible


In [14]:
import re

# copy values from original columns to new columns
df['external_id'] = None
df['type'] = None
df['name'] = df1['name']
df['winery_name'] = df1['winery']
# First extract vintage from name
df["vintage"] = df1["vintage"].apply(lambda x: int(re.search(r'\d{4}', x).group()) if re.search(r'\d{4}', x) else 0 if x == '/' else None)
df["name"] = df1["name"]
df["quantity"] = 0
df["storage_area"] = df1["storage_area"]
df["visible"] = True

df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
1,,SPARKLING,Champagne Venus Brut Grand Cru,AGRAPART,,,2017.0,,,0,,Chardonnay 100%,True
2,,SPARKLING,Champagne Avizoise Extra Brut Granc Cru,AGRAPART,,,,,,0,,Chardonnay 100%,True
3,,SPARKLING,Champagne Terroirs Extra Brut Grand Cru,AGRAPART,,,,,,0,,Chardonnay 100%,True
4,,SPARKLING,Champagne 7 Crus Extra Brut,AGRAPART,,,,,,0,,"Chardonnay 90%, Pinot Noir 10%",True
5,,SPARKLING,Champagne Mineral Extra Brut Blanc de Blancs,AGRAPART,,,,,,0,,Chardonnay 100%,True


In [16]:
# size
df['size'] = df1["size"].fillna("BOTTLE")

# convert prices to cents
df['price'] = df1['eur_price']
# fill empty prices with 0
df1['eur_price'] = df1['eur_price'].fillna(0)

# convert prices to cents
df['price'] = df1['eur_price'].apply(lambda x: int(float(x)*100)) # convert to cents

df.head()

ValueError: cannot convert float NaN to integer

In [57]:
df = fill_empty(df, VColumns.v2(), False)
df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible,takeaway_price
0,10110.0,SPARKLING,"Prosecco di Valdobbiadene, Extra Brut",Bortolin,,BOTTLE,0,3500,800,50,,Holzer,True,1300
1,10116.0,SPARKLING,Arunda Brut Rosè Exellor,Arunda,,BOTTLE,0,5400,2500,50,,Karadar,True,3950
2,10117.0,SPARKLING,Hausmannhof Reserve 2013,Haderburg,,BOTTLE,2013,6900,3500,50,,direkt,True,5550
3,10118.0,SPARKLING,Solera Extra Brut,Marco Buvoli,,BOTTLE,0,7500,2900,50,,,True,4600
4,10119.0,SPARKLING,Haderburg Spumante Brut,Haderburg,,BOTTLE,0,3900,1600,50,,direkt,True,2860


# Merge files and write output

If there are multiple files, merge them into one and create single output file.

In [58]:
df.head()

df_out = pd.concat([df], ignore_index=True)
df_out.to_csv("v2-cleaned.csv", index=False)

bool()

False