## Notes
---
### SCD Type 2 Triggers (new row):
- `supplier_name`
- `supplier_category`
- `city`
- `delivery_frequency`
- `email`

> Create new row, assign new `supplier_key`, mark `is_active` as `True`, mark old row's `is_active` as `False`

### SCD Type 1 Triggers (overwrite):
- `notes`
- `is_active`

> Overwrite these values in place on the existing row; no new row is created
---

# Extract

In [None]:
# load data/day1/supplier_details.csv and suppliers.csv into pandas
import pandas as pd

# Load the two day-1 CSV snapshots. The paths are relative, so they resolve
# against the working directory the notebook is run from.
details, suppliers = map(
    pd.read_csv,
    ("../data/day1/supplier_details.csv", "../data/day1/suppliers.csv"),
)

# just to make the day mapping a bit more dynamic
def get_days_mapping(day: str):
    """Return the source date string recorded for a day label.

    Only 'day2' and 'day3' are known. 'day1' is deliberately absent because
    its real source date is unknown, so looking it up (or any other
    unrecognised label) raises ``KeyError``.
    """
    source_date_by_day = {'day2': '2025-01-01', 'day3': '2025-01-02'}
    return source_date_by_day[day]

# Clean

In [None]:
def clean(df_supp, df_det):
    # strip whitespace from strings
    df_supp = df_supp.map(lambda x: x.strip() if isinstance (x, str) else x)
    df_det = df_det.map(lambda x: x.strip() if isinstance (x, str) else x)
    # lowercase all emails
    df_det["Email"] = df_det["Email"].str.lower()

    # suppliers.dtypes already shows 'is_active' as bool

    return df_supp, df_det

# Run the cleaning step and rebind both names to the cleaned frames.
suppliers, details = clean(suppliers, details)

# Quick visual sanity check: first three rows of each cleaned frame.
print("suppliers:", suppliers.head(3))
print("\n" + "-" * 90)
print("details:", details.head(3))

# Single quotes inside the replacement fields: reusing the outer double quote
# within an f-string is only valid syntax on Python 3.12+ (PEP 701). The
# printed text is unchanged.
print(f"{'|' * 35} [Data Types for Reference] {'|' * 35}")
print(f"{suppliers.dtypes} \n\n\n {details.dtypes}")


# Staging
---
### Notes:
- first day has only 8 entries anyway
- `source_date` for day 1 will be null since we don't *actually know* the source date.
  - day 2 will have a source date of '2025-01-01'
  - day 3 will have a source date of '2025-01-02'

In [None]:

def stage(df_supp, df_det):
    """Combine the supplier and detail frames into one staging frame.

    When both frames carry a ``SupplierID`` column, rows are matched on that
    key with an inner join, so a row-order difference between the two inputs
    cannot pair a supplier with another supplier's details, and the key
    appears only once in the result. When either frame lacks the key column,
    the frames are aligned on their row index instead (inner join on index),
    which is what the previous implementation always did.

    Args:
        df_supp: supplier frame (left side of the join).
        df_det: supplier-details frame (right side of the join).

    Returns:
        A new DataFrame with the source column names mapped to snake_case;
        columns not listed in the mapping keep their original names.
    """
    join_key = "SupplierID"

    if join_key in df_supp.columns and join_key in df_det.columns:
        # Key-based join: do not rely on both inputs sharing row order.
        df_staging = df_supp.merge(df_det, on=join_key, how="inner")
    else:
        # No shared key column available; fall back to index alignment.
        df_staging = pd.concat([df_supp, df_det], axis=1, join="inner")

    # Normalise the source headers to the snake_case staging column names.
    df_staging = df_staging.rename(columns={
        'SupplierID': 'supplier_id',
        'SupplierName': 'supplier_name',
        'SupplierCategory': 'supplier_category',
        'City': 'city',
        'IsActive': "is_active",
        'Email': 'email',
        'DeliveryFrequency': 'delivery_frequency',
        'Notes': 'notes'
    })

    return df_staging

# Build the staging frame from the cleaned supplier and detail frames.
df_staging = stage(df_supp=suppliers, df_det=details)
# Preview the first 8 rows (the notes above say day 1 has only 8 entries).
df_staging.head(8)