In [97]:
import sys
sys.path.append('../../')
import os

import pandas as pd
from utils import fill_empty, VColumns
from dotenv import load_dotenv
load_dotenv()

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load input file

In [98]:
df1 = pd.read_csv("v1-start.csv")

# Start elaborating columns

Create a new dataframe with empty values, so that we can start filling in values from the input dataframe, without overriding columns.

In [99]:
df1.head()

Unnamed: 0,external_id,type,full_name,Unnamed: 3,MAGNUM,Unnamed: 5,BOTTLE,HALF_LITER,HALF_BOTTLE
0,1,,Pinot bianco DeSilva DOC ´23,,,,29.0,,
1,2,,"SAUVIGNON BLANC De Silva, Peter Sölva ´22",,,,31.0,,
2,3,,"PINOT GRIGIO De Silva, Peter Sölva ´23",,,,29.0,,
3,4,,"GEWÜRZTRAMINER De Silva, Peter Sölva ´23",,,,31.0,,
4,5,,"Il Secondo, cuvèe bianco WB-CH-SV",,,,35.0,,


In [100]:
# create new empty dataframe
df = pd.DataFrame(columns=VColumns.v2())
df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible


In [101]:
# extract vintage from name (last 2 digits at end of string)
df1['vintage'] = df1['full_name'].str.extract(r'(\d{2}\s?)$')

# print lines where df['vintage'] is None, so that I can check if they are correct
print(df1[df1['vintage'].isna()]['full_name'])

df.head()

4                    Il Secondo, cuvèe bianco WB-CH-SV 
74    EXCELLOR Brut Rose Metodo Classico Sektkellere...
88                  Goldmuskateller, Weingut Dominikus 
Name: full_name, dtype: object


Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible


In [102]:
# copy values from original columns to new columns
df['external_id'] = df1['external_id']
df['type'] = df1['type']
df['name'] = df1.apply(lambda x: x['full_name'].replace(x['vintage'], '') if pd.notna(x['vintage']) else x['full_name'], axis=1)
df['winery_name'] = None
df["vintage"] = df1["vintage"].apply(lambda x: '20' + str(x) if pd.notna(x) else x)
df["quantity"] = 20
df["internal_notes"] = None
df["visible"] = True

df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
0,1,,Pinot bianco DeSilva DOC ´,,,,2023.0,,,20,,,True
1,2,,"SAUVIGNON BLANC De Silva, Peter Sölva ´",,,,2022.0,,,20,,,True
2,3,,"PINOT GRIGIO De Silva, Peter Sölva ´",,,,2023.0,,,20,,,True
3,4,,"GEWÜRZTRAMINER De Silva, Peter Sölva ´",,,,2023.0,,,20,,,True
4,5,,"Il Secondo, cuvèe bianco WB-CH-SV",,,,,,,20,,,True


In [103]:
# size

# fill empty prices with 0
df1['purchase_price_eur'] = 0

# Map sizes based on non-NA columns
def get_size_and_price(row):
    if pd.notna(row['MAGNUM']):
        return 'MAGNUM', row['MAGNUM']
    elif pd.notna(row['BOTTLE']):
        return 'BOTTLE', row['BOTTLE'] 
    elif pd.notna(row['HALF_LITER']):
        return 'HALF_LITER', row['HALF_LITER']
    elif pd.notna(row['HALF_BOTTLE']):
        return 'HALF_BOTTLE', row['HALF_BOTTLE']
    return None, 0

# Apply the mapping function
df1[['size', 'price_eur']] = df1.apply(get_size_and_price, axis=1, result_type='expand')
df['size'] = df1['size']
df['price'] = df1['price_eur'].apply(lambda x: int(float(str(x).replace('€', '').replace(',', '.'))*100)) # convert to cents

df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
0,1,,Pinot bianco DeSilva DOC ´,,,BOTTLE,2023.0,2900,,20,,,True
1,2,,"SAUVIGNON BLANC De Silva, Peter Sölva ´",,,BOTTLE,2022.0,3100,,20,,,True
2,3,,"PINOT GRIGIO De Silva, Peter Sölva ´",,,BOTTLE,2023.0,2900,,20,,,True
3,4,,"GEWÜRZTRAMINER De Silva, Peter Sölva ´",,,BOTTLE,2023.0,3100,,20,,,True
4,5,,"Il Secondo, cuvèe bianco WB-CH-SV",,,BOTTLE,,3500,,20,,,True


In [104]:
df = fill_empty(df, VColumns.v2(), False)
df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
0,1,,Pinot bianco DeSilva DOC ´,,,BOTTLE,2023,2900,0,20,,,True
1,2,,"SAUVIGNON BLANC De Silva, Peter Sölva ´",,,BOTTLE,2022,3100,0,20,,,True
2,3,,"PINOT GRIGIO De Silva, Peter Sölva ´",,,BOTTLE,2023,2900,0,20,,,True
3,4,,"GEWÜRZTRAMINER De Silva, Peter Sölva ´",,,BOTTLE,2023,3100,0,20,,,True
4,5,,"Il Secondo, cuvèe bianco WB-CH-SV",,,BOTTLE,0,3500,0,20,,,True


# Merge files and write output

If there are multiple files, merge them into one and create single output file.

In [105]:
df.head()

df_out = pd.concat([df], ignore_index=True)
df_out.to_csv("v2-cleaned.csv", index=False)

bool()

False