In [116]:
import sys
sys.path.append('../../')
import os

import pandas as pd
from utils import fill_empty, VColumns
from dotenv import load_dotenv
load_dotenv()

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load input file

In [117]:
# Read the source CSV into df1; the cells below copy and convert columns from it.
input_csv = "v1-start.csv"
df1 = pd.read_csv(input_csv)

# Start elaborating columns

Create a new, empty dataframe with the target columns, so that we can fill in values from the input dataframe without overwriting any of its columns.

In [118]:
# Build the output frame with the column layout returned by VColumns.v2()
# and no rows; the next cells fill it column by column.
target_columns = VColumns.v2()
df = pd.DataFrame(columns=target_columns)
df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible


In [None]:
def extract_vintage(name):
    """Return the vintage found in a wine name, or None when there is none.

    The vintage is taken to be the last *standalone* run of exactly four
    decimal digits in ``name`` (e.g. ``"Dom Perignon 2008"`` -> ``2008``).
    Digit runs are evaluated separately, so unrelated numbers in the name
    are never merged into a bogus year: ``"Cuvée 2019 No. 5"`` yields 2019
    and ``"Blanc de Blancs 2012 0.75l"`` yields 2012.

    Args:
        name: The raw name value; may be a string, NaN or None.

    Returns:
        The vintage as an int, or None if ``name`` is missing or contains
        no standalone 4-digit number.
    """
    if pd.isna(name):
        return None
    # Turn every non-digit into a space so each run of digits becomes its own
    # token. isdecimal() (rather than isdigit()) skips characters such as
    # superscript digits, which int() cannot parse.
    digit_runs = ''.join(ch if ch.isdecimal() else ' ' for ch in str(name)).split()
    four_digit_runs = [run for run in digit_runs if len(run) == 4]
    return int(four_digit_runs[-1]) if four_digit_runs else None


# copy values from original columns to new columns
df['external_id'] = df1['external_id']
df['type'] = df1['type']
# concatenate name and style
# NOTE(review): astype(str) turns a missing name into the literal text 'nan' —
# confirm whether rows without a name should be handled differently.
df['name'] = df1['name'].astype(str) + " " + df1['style'].fillna('').astype(str)
df['winery_name'] = df1['winery_name']

# Vintage: last standalone 4-digit number in the original name (None if absent)
df["vintage"] = df1["name"].apply(extract_vintage)
df["quantity"] = df1["qty"]
df["storage_area"] = None
df["info"] = df1["info"]
df["internal_notes"] = None
df["visible"] = True

df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
0,300001.0,,rose NV,Bollinger,,,,,,1.0,,,True
1,300002.0,,"Grande Cuvée (PN, CH, PM) NV",Krug,,,,,,2.0,,,True
2,,,~,,,,,,,,,,True
3,300500.0,,Special Cuvee NV,Bollinger,,,,,,24.0,,,True
4,300501.0,,Special Cuvee NV,Bollinger,,,,,,18.0,,,True


In [120]:
rows_before = len(df)
print("Before removing empty lines: ", rows_before)

# Drop empty lines: keep a row only when its name and winery name together
# are longer than 5 characters (a missing value counts as length 0).
name_length = df['name'].fillna('').str.len()
winery_length = df['winery_name'].fillna('').str.len()
df = df[name_length + winery_length > 5]

print("After removing empty lines: ", len(df))

Before removing empty lines:  3411
After removing empty lines:  3071


In [121]:
# Map sizes
print(df1['size'].unique())

# Numeric size as found in the input (presumably litres — TODO confirm)
# mapped to the size label written to the output.
map_sizes = {
    0.375: 'HALF_BOTTLE',
    0.75: 'BOTTLE',
    0.72: 'BOTTLE',
    0.7: 'BOTTLE',
    1.5: 'MAGNUM',
    3: 'JEROBOAM',
    5: 'BORDEAUX_JEROBOAM',
}


def size_label(raw_size):
    """Return the label for ``raw_size``; any value not in ``map_sizes``
    (including NaN) falls back to 'BOTTLE'."""
    return map_sizes.get(raw_size, 'BOTTLE')


df['size'] = df1['size'].apply(size_label)

print(df['size'].unique())


[0.375   nan 0.75  1.5   3.    5.    0.72  0.5   0.7  ]
['HALF_BOTTLE' 'BOTTLE' 'MAGNUM' 'JEROBOAM' 'BORDEAUX_JEROBOAM']


In [122]:
def convert_price_to_cents(x):
    """Convert a euro price to a whole number of cents.

    Args:
        x: The raw price; a number or a string that may contain a '€' sign
            and surrounding whitespace. May also be NaN/None or empty.

    Returns:
        The price in cents as an int, rounded to the nearest cent.
        Returns 0 when the value is missing, blank, or cannot be parsed
        as a finite number.
    """
    if pd.isna(x) or str(x).strip() == '':
        return 0
    try:
        euros = float(str(x).replace('€', '').strip())
        # round() instead of int(): in binary floating point 19.99 * 100 is
        # 1998.9999999999998, so truncating would silently lose a cent.
        return round(euros * 100)
    except (ValueError, TypeError, OverflowError):
        # ValueError: unparseable text or NaN; OverflowError: infinite value.
        return 0

# (output column, input column) pairs; both are converted the same way.
price_columns = (
    ('price', 'sales_price_eur'),
    ('purchase_price', 'purchase_price_eur'),
)

for target_column, source_column in price_columns:
    # Missing prices become '' in df1 itself, which the converter maps to 0.
    df1[source_column] = df1[source_column].fillna('')
    df[target_column] = df1[source_column].apply(convert_price_to_cents)

In [123]:
# Hand the frame to the project helper together with the v2 column list.
# NOTE(review): fill_empty is defined in utils and not shown here; judging by
# the output below it replaces missing values (e.g. vintage becomes 0) —
# confirm, including the meaning of the False flag.
v2_columns = VColumns.v2()
df = fill_empty(df, v2_columns, False)
df.head()

Unnamed: 0,external_id,type,name,winery_name,info,size,vintage,price,purchase_price,quantity,storage_area,internal_notes,visible
0,300001.0,,rose NV,Bollinger,,HALF_BOTTLE,0,5500,2990,1,,,True
1,300002.0,,"Grande Cuvée (PN, CH, PM) NV",Krug,,HALF_BOTTLE,0,19000,9990,2,,,True
3,300500.0,,Special Cuvee NV,Bollinger,,BOTTLE,0,9500,3525,24,,,True
4,300501.0,,Special Cuvee NV,Bollinger,,MAGNUM,0,19000,8250,18,,,True
5,300502.0,,Special Cuvee NV rote Holzbox,Bollinger,,JEROBOAM,0,39500,18750,1,,,True


# Merge files and write output

If there are multiple files, merge them into one and create a single output file.

In [124]:
# Not the last expression of the cell, so this preview is not displayed.
df.head()

# Every cleaned frame goes into this list; with one input file it has one entry.
frames = [df]
df_out = pd.concat(frames, ignore_index=True)
df_out.to_csv("v2-cleaned.csv", index=False)

# Last expression of the cell: evaluates to False, which is what the cell shows.
bool()

False