In [204]:
import sys
sys.path.append('../../')
import os

import pandas as pd
from utils import fill_empty, VColumns
from dotenv import load_dotenv
load_dotenv()

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load input file

In [205]:
df1 = pd.read_csv("v1-start.csv")

In [206]:
# Fill in prefix for rows where it's empty by using previous non-empty prefix
current_prefix = None
for idx, row in df1.iterrows():
    if pd.notna(row['prefix']) and row['prefix'] != '':
        current_prefix = row['prefix']
    else:
        df1.at[idx, 'prefix'] = current_prefix

# Drop rows where name is empty
df1 = df1.dropna(subset=['name'])
# Also drop rows where name is empty string
df1 = df1[df1['name'] != '']


In [207]:
# Rows to drop
# add rows where vintage is not 0 or a valid year (year can be a string or an int)
# Function to check if vintage is valid (0 or a valid year between 1900-2023)
def is_valid_vintage(vintage):
    try:
        # string to int first to handle both int and float inputs
        vintage = int(vintage)
        if vintage == 0:
            return True
        return 1900 <= vintage <= 2024
    except:
        return False

# Get rows where vintage is invalid
rows_to_drop = df1[~df1['vintage'].apply(is_valid_vintage)]

# store dropped rows in csv
rows_to_drop.to_csv("v2-dropped.csv", index=False)

# remove from df1
df1 = df1[df1['vintage'].apply(is_valid_vintage)]


# Start elaborating columns

Create a new dataframe with empty values, so that we can start filling in values from the input dataframe, without overriding columns.

In [208]:
df1.head()

Unnamed: 0,type,prefix,name,winery,internal_notes,size,vintage,ricavo,purchase_price,sales_price,qty
0,,Metodo Charmat,Ruio Brut,Malibran,,,0,28,5.85 €,35.00 €,276
1,,Metodo Charmat,Rose Brut,Malibran,,,0,33,5.85 €,40.00 €,58
2,,Metodo Charmat,Col Fondo Sottoriva,Malibran,,,0,28,5.85 €,35.00 €,14
3,,Metodo Charmat,Credamora,Malibran,,,2021,36,7 €,45.00 €,7
4,,Metodo Charmat,Col Fondo vintage,Malibran,,,2016,ö,0 €,NON IN LIST,3


In [209]:
# create new empty dataframe
df = pd.DataFrame(columns=VColumns.v2())
df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible


In [210]:
import re

# copy values from original columns to new columns
df['external_id'] = None
df['type'] = df1['type']
df['name'] = df1['prefix'] + ' ' + df1['name']
df['winery_name'] = df1['winery']
# First extract vintage from name
df["vintage"] = df1["vintage"]
df["quantity"] = df1["qty"]
df["internal_notes"] = df1["internal_notes"]
df["visible"] = True

df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
0,,,Metodo Charmat Ruio Brut,Malibran,,,0,,,276,,,True
1,,,Metodo Charmat Rose Brut,Malibran,,,0,,,58,,,True
2,,,Metodo Charmat Col Fondo Sottoriva,Malibran,,,0,,,14,,,True
3,,,Metodo Charmat Credamora,Malibran,,,2021,,,7,,,True
4,,,Metodo Charmat Col Fondo vintage,Malibran,,,2016,,,3,,,True


In [211]:
# rows with sales_price  that contains 'NON IN LIST' should set visible to false and price to 0
non_list_mask = df1['sales_price'].str.contains('NON', case=False, na=False)
df1.loc[non_list_mask, 'visible'] = False
df1.loc[non_list_mask, 'sales_price'] = 0

In [212]:
# size
df['size'] = df1["size"].fillna("BOTTLE")

# fill empty prices with 0
df1['price_eur'] = df1['sales_price'].fillna(0)
df1['purchase_price_eur'] = df1['purchase_price'].fillna(0)

# convert prices to cents
df['price'] = df1['price_eur'].apply(lambda x: int(float(str(x).replace('€', ''))*100)) # convert to cents
df['purchase_price'] = df1['purchase_price_eur'].apply(lambda x: int(float(str(x).replace('€', ''))*100)) # convert to cents

df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
0,,,Metodo Charmat Ruio Brut,Malibran,,BOTTLE,0,3500,585,276,,,True
1,,,Metodo Charmat Rose Brut,Malibran,,BOTTLE,0,4000,585,58,,,True
2,,,Metodo Charmat Col Fondo Sottoriva,Malibran,,BOTTLE,0,3500,585,14,,,True
3,,,Metodo Charmat Credamora,Malibran,,BOTTLE,2021,4500,700,7,,,True
4,,,Metodo Charmat Col Fondo vintage,Malibran,,BOTTLE,2016,0,0,3,,,True


In [213]:
df = fill_empty(df, VColumns.v2(), False)
df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
0,,,Metodo Charmat Ruio Brut,Malibran,,BOTTLE,0,3500,585,276,,,True
1,,,Metodo Charmat Rose Brut,Malibran,,BOTTLE,0,4000,585,58,,,True
2,,,Metodo Charmat Col Fondo Sottoriva,Malibran,,BOTTLE,0,3500,585,14,,,True
3,,,Metodo Charmat Credamora,Malibran,,BOTTLE,2021,4500,700,7,,,True
4,,,Metodo Charmat Col Fondo vintage,Malibran,,BOTTLE,2016,0,0,3,,,True


# Merge files and write output

If there are multiple files, merge them into one and create single output file.

In [214]:
df.head()

df_out = pd.concat([df], ignore_index=True)
df_out.to_csv("v2-cleaned.csv", index=False)

bool()

False