In [20]:
# Generate the demo CSV if it does not already exist
import os
import pathlib
from typing import Union

import pandas as pd
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Data directories are configurable through the environment; each value is
# None when the corresponding variable is not set.
DATA_DIR_RAW = os.getenv("DATA_DIR_RAW")
DATA_DIR_PROCESSED = os.getenv("DATA_DIR_PROCESSED")

# Honour DATA_DIR_RAW when it is set (and non-empty); otherwise keep the
# default 'data/raw' location.
raw_dir = DATA_DIR_RAW or 'data/raw'
csv_path = os.path.join(raw_dir, 'data-csv.csv')
os.makedirs(raw_dir, exist_ok=True)

# In-memory reference frame. Each column deliberately contains one missing
# value. It is also the content written to disk when the CSV is absent.
df = pd.DataFrame({
    'price': [100, 200, 150, None, 250],
    'date': ['2025-08-01', '2025-08-02', None, '2025-08-04', '2025-08-05'],
    'category': ['Electronics', 'Furniture', 'Toys', 'Clothing', None],
})

if not os.path.exists(csv_path):
    # Write a copy of the reference frame instead of re-typing the literal.
    df_demo = df.copy()
    df_demo.to_csv(csv_path, index=False)
    print(f"Demo CSV created at {csv_path}")
else:
    # An existing file is never overwritten.
    print(f"CSV already exists at {csv_path}")

CSV already exists at data/raw/data-csv.csv


In [21]:
# Honour DATA_DIR_PROCESSED (read from the environment earlier in the notebook)
# when it is set and non-empty; otherwise use the default 'data/processed'.
processed_dir = DATA_DIR_PROCESSED or 'data/processed'
parquet_path = os.path.join(processed_dir, 'data-parquet.parquet')
os.makedirs(processed_dir, exist_ok=True)

if not os.path.exists(parquet_path):
    # Derive the demo frame from the reference frame `df` rather than
    # repeating the literal. The date strings are converted to datetimes
    # before writing, with missing entries becoming NaT.
    df_demo = df.assign(date=pd.to_datetime(df['date']))
    df_demo.to_parquet(parquet_path, index=False)
    print(f"Demo Parquet created at {parquet_path}")
else:
    # An existing file is never overwritten.
    print(f"Parquet already exists at {parquet_path}")

Demo Parquet created at data/processed/data-parquet.parquet


In [22]:
def validate_loaded(original: pd.DataFrame, reloaded: pd.DataFrame, cols=('date','category','price')):
    """Run basic sanity checks on a frame that was read back from disk.

    Parameters
    ----------
    original : pd.DataFrame
        Frame whose shape serves as the reference.
    reloaded : pd.DataFrame
        Frame obtained by reading the stored file.
    cols : iterable of str
        Column names that must all be present in ``reloaded``.

    Returns
    -------
    dict
        Maps a check name to a boolean. ``'shape_equal'`` and
        ``'cols_present'`` are always included. ``'price_is_numeric'`` and
        ``'date_is_datetime'`` are included only when ``reloaded`` has the
        corresponding column.
    """
    available = reloaded.columns
    report = {
        'shape_equal': original.shape == reloaded.shape,
        'cols_present': not any(name not in available for name in cols),
    }
    # (column, result key, dtype predicate) — evaluated in this order so the
    # keys of the report keep a stable insertion order.
    dtype_probes = (
        ('price', 'price_is_numeric', pd.api.types.is_numeric_dtype),
        ('date', 'date_is_datetime', pd.api.types.is_datetime64_any_dtype),
    )
    for column, label, probe in dtype_probes:
        if column in available:
            report[label] = probe(reloaded[column])
    return report

# Read the CSV back (parsing the 'date' column as dates) and check it against
# the in-memory reference frame.
df_csv = pd.read_csv(csv_path, parse_dates=['date'])
print('CSV validation:', validate_loaded(original=df, reloaded=df_csv))

# Run the same checks on the Parquet copy.
df_parq = pd.read_parquet(parquet_path)
print('Parquet validation:', validate_loaded(original=df, reloaded=df_parq))

CSV validation: {'shape_equal': True, 'cols_present': True, 'price_is_numeric': True, 'date_is_datetime': True}
Parquet validation: {'shape_equal': True, 'cols_present': True, 'price_is_numeric': True, 'date_is_datetime': True}


In [None]:
def detect_format(path: Union[str, pathlib.Path]):
    """Infer the storage format from the end of ``path``.

    The comparison is case-insensitive and looks only at how the string form
    of ``path`` ends.

    Returns
    -------
    str
        ``'csv'`` for ``.csv``; ``'parquet'`` for ``.parquet``, ``.pq`` or
        ``.parq``.

    Raises
    ------
    ValueError
        If the path ends with none of the recognised extensions.
    """
    lowered = str(path).lower()
    # Checked in order; the first matching group of endings wins.
    known_endings = (
        ('csv', ('.csv',)),
        ('parquet', ('.parquet', '.pq', '.parq')),
    )
    for fmt, endings in known_endings:
        if lowered.endswith(endings):
            return fmt
    raise ValueError('Unsupported format for: ' + str(path))

def write_df(df, path):
    """Write ``df`` to ``path`` in the format implied by the file extension.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    path : str or pathlib.Path
        Destination file. The extension selects the writer via
        ``detect_format``.

    Returns
    -------
    pathlib.Path
        The destination as a ``Path`` object.

    Raises
    ------
    ValueError
        Propagated from ``detect_format`` when the extension is unsupported.

    Notes
    -----
    The CSV writer is called with ``index=False``; the Parquet writer is
    called with pandas' default index handling.
    """
    path = pathlib.Path(path)
    # Validate the extension first so an unsupported path never creates
    # directories as a side effect.
    fmt = detect_format(path)
    # Writing into a directory that does not exist yet would otherwise fail.
    path.parent.mkdir(parents=True, exist_ok=True)
    if fmt == 'csv':
        df.to_csv(path, index=False)
    elif fmt == 'parquet':
        df.to_parquet(path)
    return path

def read_df(path, date_cols=('date',)):
    """Read a DataFrame from ``path`` using the reader implied by its extension.

    Parameters
    ----------
    path : str or pathlib.Path
        File to load. The extension selects the reader via ``detect_format``.
    date_cols : str or iterable of str, optional
        CSV only: column names to parse as dates when they appear in the
        file's header. Defaults to ``('date',)``. Pass ``None`` or an empty
        iterable to skip date parsing.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        Propagated from ``detect_format`` when the extension is unsupported.
    """
    path = pathlib.Path(path)
    fmt = detect_format(path)
    if fmt == 'csv':
        # A bare string would otherwise be iterated character by character.
        if isinstance(date_cols, str):
            date_cols = (date_cols,)
        # Read only the header row to learn which requested columns exist;
        # asking read_csv to parse a column that is absent would fail.
        header = pd.read_csv(path, nrows=0).columns
        present = [name for name in (date_cols or ()) if name in header]
        if present:
            return pd.read_csv(path, parse_dates=present)
        return pd.read_csv(path)
    # detect_format returns only 'csv' or 'parquet' (anything else raises).
    return pd.read_parquet(path)