## Notes
---
### SCD Type 2 Triggers (new row):
- `supplier_name`
- `supplier_category`
- `city`
- `delivery_frequency`
- `email`

> Create new row, assign new `supplier_key`, mark `is_active` as `True`, mark old row's `is_active` as `False`

### SCD Type 1 Triggers (overwrite):
- `notes`
- `is_active`

> Overwrite these values in place on the current row; no new row is created.
---

# Extract

In [60]:
# Load the current day's supplier_details.csv and suppliers.csv into pandas
import pandas as pd
import os
from datetime import datetime

# Day being processed; change this to run the pipeline for a different day.
CURRENT_DAY = 'day2'

# Paths
DIM_SUPPLIERS_PATH = "dim_suppliers.csv"
DETAILS_PATH = f"../data/{CURRENT_DAY}/supplier_details.csv"
SUPPLIERS_PATH = f"../data/{CURRENT_DAY}/suppliers.csv"

# Triggers
TYPE2 = ["supplier_name","supplier_category","city","delivery_frequency","email"]
TYPE1 = ["notes","is_active"]

# Daily Dataframes
details = pd.read_csv(DETAILS_PATH)
suppliers = pd.read_csv(SUPPLIERS_PATH)

# Ensure that there is a dim_suppliers to work with.
# A missing file and a zero-byte file both mean "no dimension yet":
# pd.read_csv raises EmptyDataError on a file with no columns to parse, so
# both cases fall back to an empty frame instead of crashing the cell.
try:
    dim_suppliers = pd.read_csv(DIM_SUPPLIERS_PATH)
except (FileNotFoundError, pd.errors.EmptyDataError):
    dim_suppliers = pd.DataFrame() # start empty



# just to make the day mapping a bit more dynamic
def get_days_mapping(day: str):
    """Return the source date string recorded for the given day label.

    'day1' maps to an empty string because its real source date is not
    known. Any label that is not listed raises ``KeyError``.
    """
    source_dates = dict(
        day1='',
        day2='2025-01-01',
        day3='2025-01-02',
    )
    return source_dates[day]

# Clean

In [61]:
def clean(df_supp, df_det):
    # strip whitespace from strings
    df_supp = df_supp.map(lambda x: x.strip() if isinstance (x, str) else x)
    df_det = df_det.map(lambda x: x.strip() if isinstance (x, str) else x)
    # lowercase all emails
    df_det["Email"] = df_det["Email"].str.lower()

    # suppliers.dtypes already shows 'is_active' as bool

    return df_supp, df_det

# Clean both daily frames, then preview the first rows and the dtypes.
suppliers, details = clean(suppliers, details)
print("suppliers:",suppliers.head(3))
print("\n" + "-"*90)
print("details:",details.head(3))
# Single quotes inside the replacement fields: reusing the enclosing quote
# character inside an f-string is a SyntaxError before Python 3.12.
print(f"{'|'*35} [Data Types for Reference] {'|'*35}")
print(f"{suppliers.dtypes} \n\n\n {details.dtypes}")


suppliers:    SupplierID          SupplierName SupplierCategory         City  IsActive
0         101  BeanRoasters Company     Coffee Beans      Toronto      True
1         102    SweetTreats Bakery           Bakery  Mississauga      True
2         103       MapleMilk Dairy     Milk & Dairy      Toronto      True

------------------------------------------------------------------------------------------
details:    SupplierID                     Email DeliveryFrequency  \
0         101  contact@beanroasters.com            Weekly   
1         102       info@sweettreats.ca             Daily   
2         103        sales@maplemilk.ca            Weekly   

                                Notes  
0  Name updated and expanded offering  
1                      Same as before  
2          Changed delivery frequency  
||||||||||||||||||||||||||||||||||| [Data Types for Reference] |||||||||||||||||||||||||||||||||||
SupplierID           int64
SupplierName        object
SupplierCategory    object

# Staging & Metadata
---
### Notes:
- The first day has only 8 entries anyway.
- `source_date` for day 1 will be null since we don't *actually know* the source date.
  - day 2 will have a source date of '2025-01-01'
  - day 3 will have a source date of '2025-01-02'

In [62]:

def stage(df_supp, df_det, day='day1'):
    """Build the staging frame for one day of supplier data.

    Inner-joins the suppliers and details frames on ``SupplierID``, renames
    the source columns to snake_case, prepends a 1-based ``supplier_key``
    and appends two metadata columns: ``source_date`` (looked up from
    ``day`` via ``get_days_mapping``) and ``last_updated_at`` (the current
    local time as an ISO string with second precision).

    Returns the resulting DataFrame.
    """
    snake_case_names = {
        'SupplierID': 'supplier_id',
        'SupplierName': 'supplier_name',
        'SupplierCategory': 'supplier_category',
        'City': 'city',
        'IsActive': "is_active",
        'Email': 'email',
        'DeliveryFrequency': 'delivery_frequency',
        'Notes': 'notes'
    }

    # Join on the supplier id, then normalise the column names.
    joined = df_supp.merge(df_det, on="SupplierID", how="inner")
    staged = joined.rename(columns=snake_case_names)

    # Sequential key as the first column.
    row_count = len(staged)
    staged.insert(0, 'supplier_key', range(1, row_count + 1))

    # Defensive clean-up of a stray duplicate id column, if one is present.
    if 'supplier_id.1' in staged.columns:
        staged = staged.drop(columns=['supplier_id.1'])

    # -- metadata, each appended as the last column --
    source_date = get_days_mapping(day)
    staged.insert(staged.shape[1], "source_date", source_date, allow_duplicates=True)

    stamp = datetime.now().isoformat(timespec='seconds')
    staged.insert(staged.shape[1], "last_updated_at", stamp, allow_duplicates=True)

    return staged

# Load Current Dimension & Delta Detection/SCD Logic

In [63]:
def load():
    """Load the staged rows for ``CURRENT_DAY`` into the supplier dimension.

    Works on module-level state: ``suppliers``, ``details``, ``CURRENT_DAY``,
    ``dim_suppliers`` and ``DIM_SUPPLIERS_PATH``.

    * Initial load: when ``dim_suppliers`` is empty, the whole staging frame
      is written to ``DIM_SUPPLIERS_PATH`` and the function returns.
    * Later loads: suppliers whose ``supplier_id`` is not among the currently
      active dimension rows are appended to an in-memory copy of the
      dimension, with fresh ``supplier_key`` values.

    NOTE: the comparison of existing suppliers (Type 1 / Type 2 handling) is
    not implemented yet, and the day-2+ result is not written back to disk.

    Returns None.
    """
    # Get the staging & active df
    df_staging = stage(suppliers, details, CURRENT_DAY).reset_index(drop=True)
    df_active = dim_suppliers

    # -------- Day 1: Initial Load ------------
    # Nothing in the dimension yet, so there are no deltas: load everything.
    if df_active.empty:
        df_staging.to_csv(DIM_SUPPLIERS_PATH, index=False)
        print("day 1")
        return

    # -------- Day 2+: Loads  --------
    print("Finding deltas for day:", CURRENT_DAY[-1])

    # Rows that are currently active, re-indexed from 0 so they can later be
    # lined up against the staging rows.
    df_active_current = df_active[df_active['is_active'].eq(True)].reset_index(drop=True)

    # [Get New Suppliers]
    active_ids = set(df_active_current['supplier_id'])
    staging_ids = set(df_staging['supplier_id'])
    new_ids = staging_ids - active_ids
    new_suppliers = df_staging[df_staging['supplier_id'].isin(new_ids)].copy()

    # stage() numbers its rows from 1 every day, so the staged keys would
    # collide with keys already present in the dimension. Continue numbering
    # after the highest existing key instead.
    highest_key = df_active['supplier_key'].max()
    next_key = 1 if pd.isna(highest_key) else int(highest_key) + 1
    new_suppliers['supplier_key'] = range(next_key, next_key + len(new_suppliers))

    # The frame that will eventually be written to the csv. ignore_index keeps
    # the row labels unique after appending.
    dim_suppliers_result = pd.concat([df_active, new_suppliers], ignore_index=True)

    # TODO: for every existing supplier, compare the staging row to the
    # current active row, apply the Type 1 / Type 2 rules, then persist
    # dim_suppliers_result to DIM_SUPPLIERS_PATH.


# Run the load step for CURRENT_DAY.
load()


Finding deltas for day: 2
