In [197]:
import sys
sys.path.append('../../')
import os

import pandas as pd
from utils import fill_empty, VColumns
from dotenv import load_dotenv
load_dotenv()

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load input file

In [198]:
# Read the v1 input CSV (path is relative to the notebook's working directory)
# and preview its first rows.
df1 = pd.read_csv("v1-start.csv")
df1.head()

Unnamed: 0,storage_area,type,size,name,information,dosage,winery_name,vintage,eur_purchase_price,quantity,other_vintages,visible,unclear,eur_sales_price,glass_price,internal_notes
0,4,,,RED MOON,,,,,,5.0,,1,,,,
1,5,,,Cuvée Marianna,,Extra Brut,Arunda Vivaldi,,22.5,3.0,,1,,73.0,12.0,
2,6,,,Arunda Riserva Millesimato,,Extra Brut,Arunda Vivaldi,2016.0,25.0,2.0,2017-1,1,,81.0,13.0,
3,7,,,Arunda Rosé,,Brut,Arunda Vivaldi,,20.5,,,1,,67.0,11.0,
4,8,,,Comitissa Riserva,,Pas Dosé,Lorenz Martini,2018.0,22.5,2.0,2019-2,1,,73.0,12.0,


### Split vintages into multiple wines

In [199]:
import re

# Entries in `other_vintages` look like "<4-digit year>-<1-2 digit number>", e.g. "2017-1".
regex = r"\d{4}-\d{1,2}"

# For every wine that lists other vintages, add one hidden copy of the row per
# listed vintage. The copies are collected first and concatenated once:
# `DataFrame.append` was removed in pandas 2.0, and growing a frame while
# iterating over it is quadratic.
# NOTE: re-running this cell without reloading `df1` adds the copies again.
extra_rows = []
for _, row in df1.iterrows():
    if pd.isna(row['other_vintages']):
        continue
    # `re.findall` scans the whole cell text. Iterating over the string itself
    # would yield single characters, none of which can match the pattern.
    for entry in re.findall(regex, str(row['other_vintages'])):
        # NOTE(review): assumes an entry encodes "<vintage>-<quantity>" — confirm
        # with whoever maintains the input sheet.
        year, count = entry.split("-")
        new_row = row.copy()
        # Stored as floats to match the numeric columns shown in the preview.
        new_row['vintage'] = float(year)
        new_row['quantity'] = float(count)
        new_row['visible'] = False
        extra_rows.append(new_row)

if extra_rows:
    # `ignore_index=True` gives the new rows fresh labels; duplicate labels would
    # break the index-aligned column assignments made later in the notebook.
    df1 = pd.concat([df1, pd.DataFrame(extra_rows)], ignore_index=True)

# Start elaborating columns

Create a new dataframe with empty values, so that we can start filling in values from the input dataframe, without overriding columns.

In [200]:
# Preview the input frame after the vintage-splitting step.
df1.head()

Unnamed: 0,storage_area,type,size,name,information,dosage,winery_name,vintage,eur_purchase_price,quantity,other_vintages,visible,unclear,eur_sales_price,glass_price,internal_notes
0,4,,,RED MOON,,,,,,5.0,,1,,,,
1,5,,,Cuvée Marianna,,Extra Brut,Arunda Vivaldi,,22.5,3.0,,1,,73.0,12.0,
2,6,,,Arunda Riserva Millesimato,,Extra Brut,Arunda Vivaldi,2016.0,25.0,2.0,2017-1,1,,81.0,13.0,
3,7,,,Arunda Rosé,,Brut,Arunda Vivaldi,,20.5,,,1,,67.0,11.0,
4,8,,,Comitissa Riserva,,Pas Dosé,Lorenz Martini,2018.0,22.5,2.0,2019-2,1,,73.0,12.0,


In [201]:
# Create a new, empty dataframe whose columns are the names returned by
# VColumns.v2(); the following cells fill it from `df1`.
df = pd.DataFrame(columns=VColumns.v2())
df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes


In [202]:
# Copy values from the input columns (`df1`) into the output columns (`df`).
df['external_id'] = df1['storage_area']
df['type'] = df1['type']
# Append the dosage to the wine name. Strip the result so wines without a
# dosage do not end up with a trailing space in the exported name.
df['name'] = (df1['name'] + " " + df1['dosage'].fillna("")).str.strip()
df['winery_name'] = df1['winery_name']
df["vintage"] = df1["vintage"]
df["quantity"] = df1["quantity"]

# Left-pad the storage area with zeros to at least 4 characters.
df["storage_area"] = df1["storage_area"].astype(str).str.zfill(4)

# Internal notes: a glass-price line (only when a glass price is present)
# followed by whatever notes the input already had.
glass_note = df1['glass_price'].map(
    lambda price: f"Glas Preis: {price}\n\n" if pd.notna(price) else ""
)
df["internal_notes"] = glass_note + df1['internal_notes'].fillna("")
df["visible"] = df1["visible"]

df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
0,4,,RED MOON,,,,,,,5.0,4,,1
1,5,,Cuvée Marianna Extra Brut,Arunda Vivaldi,,,,,,3.0,5,Glas Preis: 12.0\n\n,1
2,6,,Arunda Riserva Millesimato Extra Brut,Arunda Vivaldi,,,2016.0,,,2.0,6,Glas Preis: 13.0\n\n,1
3,7,,Arunda Rosé Brut,Arunda Vivaldi,,,,,,,7,Glas Preis: 11.0\n\n,1
4,8,,Comitissa Riserva Pas Dosé,Lorenz Martini,,,2018.0,,,2.0,8,Glas Preis: 12.0\n\n,1


In [203]:
# Rows without a size get the literal "BOTTLE".
df['size'] = df1["size"].fillna("BOTTLE")

# Convert EUR prices to integer cents; missing prices become 0.
# Round before casting: a bare int() truncates, and binary floats make e.g.
# 19.99 * 100 evaluate to 1998.9999999999998, which would be stored as 1998.
df['price'] = df1['eur_sales_price'].mul(100).round().fillna(0).astype(int)
df['purchase_price'] = df1['eur_purchase_price'].mul(100).round().fillna(0).astype(int)

df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
0,4,,RED MOON,,,BOTTLE,,0,0,5.0,4,,1
1,5,,Cuvée Marianna Extra Brut,Arunda Vivaldi,,BOTTLE,,7300,2250,3.0,5,Glas Preis: 12.0\n\n,1
2,6,,Arunda Riserva Millesimato Extra Brut,Arunda Vivaldi,,BOTTLE,2016.0,8100,2500,2.0,6,Glas Preis: 13.0\n\n,1
3,7,,Arunda Rosé Brut,Arunda Vivaldi,,BOTTLE,,6700,2050,,7,Glas Preis: 11.0\n\n,1
4,8,,Comitissa Riserva Pas Dosé,Lorenz Martini,,BOTTLE,2018.0,7300,2250,2.0,8,Glas Preis: 12.0\n\n,1


In [204]:
# Pass the frame and the v2 column list through the project helper `fill_empty`.
# NOTE(review): in the recorded output below, missing vintage/quantity values
# appear as 0 and the float columns as integers — presumably done by
# `fill_empty`; confirm against its definition in `utils`.
df = fill_empty(df, VColumns.v2())
df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
0,4,,RED MOON,,,BOTTLE,0,0,0,5,4,,1
1,5,,Cuvée Marianna Extra Brut,Arunda Vivaldi,,BOTTLE,0,7300,2250,3,5,Glas Preis: 12.0\n\n,1
2,6,,Arunda Riserva Millesimato Extra Brut,Arunda Vivaldi,,BOTTLE,2016,8100,2500,2,6,Glas Preis: 13.0\n\n,1
3,7,,Arunda Rosé Brut,Arunda Vivaldi,,BOTTLE,0,6700,2050,0,7,Glas Preis: 11.0\n\n,1
4,8,,Comitissa Riserva Pas Dosé,Lorenz Martini,,BOTTLE,2018,7300,2250,2,8,Glas Preis: 12.0\n\n,1


# Merge files and write output

If there are multiple files, merge them into one and create a single output file.

In [205]:
# Frames to merge into the output. There is a single input file for now; when
# more files are processed, add their cleaned frames to this list.
cleaned_frames = [df]

# Concatenate with a fresh 0..n-1 index and write the CSV without that index.
df_out = pd.concat(cleaned_frames, ignore_index=True)
df_out.to_csv("v2-cleaned.csv", index=False)