In [0]:
from datetime import datetime
import pytz

def ensure_widget(name: str, default: str):
    """Register the text widget ``name`` with ``default`` unless it is already readable.

    ``dbutils.widgets.get`` is used purely as an existence probe. When the
    lookup succeeds (for example because a job injected the parameter) the
    current value is left untouched; when it raises, a text widget holding
    ``default`` is created instead.

    Args:
        name: Widget name to look up / create.
        default: Value given to the widget if it has to be created.
    """
    try:
        dbutils.widgets.get(name)
    except Exception:
        # Lookup failed -> treat the widget as missing and create it.
        dbutils.widgets.text(name, default)
    else:
        # Widget already present: keep whatever value it carries.
        return None

# "Today" is computed in the America/Indiana/Indianapolis timezone and is only
# used as the default for the process_date widget.
tz = pytz.timezone("America/Indiana/Indianapolis")
today_str = f"{datetime.now(tz):%Y-%m-%d}"

# Widget name -> default used when the widget does not exist yet.
# Widgets that already exist (e.g. job parameters) keep their value.
widget_defaults = {
    "env": "dev",
    "storage_root": "s3://pakeyj-data-sales",
    "process_date": today_str,
    "required_columns": "order_id,ingest_date,ingested_at",
    "raw_table": "raw_orders_daily",
    "bronze_table": "bronze_orders",
    "silver_table": "silver_orders",
    "data_label": "orders",
}
for _widget_name, _widget_default in widget_defaults.items():
    ensure_widget(_widget_name, _widget_default)

# Read the resolved widget values into notebook-level variables.
env = dbutils.widgets.get("env")
storage_root = dbutils.widgets.get("storage_root")
process_date = dbutils.widgets.get("process_date")
# Comma-separated widget value -> list of non-empty, whitespace-trimmed names.
required_columns = [
    column
    for column in map(str.strip, dbutils.widgets.get("required_columns").split(","))
    if column
]
raw_table = dbutils.widgets.get("raw_table")
bronze_table = dbutils.widgets.get("bronze_table")
silver_table = dbutils.widgets.get("silver_table")
data_label = dbutils.widgets.get("data_label")

# Echo the raw widget values (required_columns is shown unparsed).
for _widget_name in widget_defaults:
    print(f"{_widget_name}:", dbutils.widgets.get(_widget_name))

from src.common.config import load_config
import importlib
import src.common.config as config

def peek(df, name, n=5):
    """Print a quick diagnostic view of a DataFrame.

    Output order: the column list, a ``=== name ===`` banner, the row count,
    then the first ``n`` rows via ``df.show`` with truncation disabled.

    Args:
        df: Object exposing ``columns``, ``count()`` and ``show()``
            (a Spark DataFrame in this notebook).
        name: Label used in the banner line.
        n: Number of rows passed to ``df.show``.
    """
    print(df.columns)
    banner = "\n=== {} ===".format(name)
    print(banner)
    # count() is evaluated only after the banner has been printed.
    row_total = df.count()
    print("rows: {}".format(row_total))
    df.show(n, truncate=False)
    
# Re-execute src.common.config so edits to that module are picked up without
# restarting the kernel.
importlib.reload(config)

# importlib.reload does not rebind names created by
# `from src.common.config import load_config`; that name would keep pointing
# at the pre-reload function object. Rebind it from the reloaded module so the
# call below (and any later cell) uses the current code.
load_config = config.load_config

cfg = load_config(env=env, storage_root=storage_root, process_date=process_date)

# From here on use the values carried by the config object rather than the
# raw widget strings.
storage_root = cfg.storage_root
process_date = cfg.process_date

# cfg.paths is indexed by the table names taken from the widgets; a name that
# is not present raises KeyError here.
raw_path = cfg.paths[raw_table]
bronze_path = cfg.paths[bronze_table]
silver_path = cfg.paths[silver_table]
print(raw_path)
print(bronze_path)
print(silver_path)

In [0]:
import json

# Every widget registered by the setup cell, in the same order, so the dump
# below reflects the full parameter set (silver_table included).
widget_names = [
    "env",
    "storage_root",
    "process_date",
    "required_columns",
    "raw_table",
    "bronze_table",
    "silver_table",
    "data_label",
]

# Current raw (string) value of each widget, keyed by widget name.
widgets_dict = {name: dbutils.widgets.get(name) for name in widget_names}

print(json.dumps(widgets_dict, indent=2))


In [0]:
from pyspark.sql import functions as F

# s3://pakeyj-data-sales/raw/orders/ingest_date=2026-01-21
# s3://pakeyj-data-sales/dev/silver/orders

# foo = spark.read.json("s3://pakeyj-data-sales/raw/orders")
# foo = (spark.read.format("delta").load(bronze_path).filter(F.col("ingest_date") == process_date))
# Read the Delta table stored at "<silver_path>_history".
history_path = f"{silver_path}_history"
df = spark.read.format("delta").load(history_path)
# df = (spark.read.format("delta").load("s3://pakeyj-data-sales/prod/bronze/orders"))

# order_id values that appear on more than one row of df.
dupe_ids = (
    df.groupBy("order_id")
    .count()
    .where(F.col("count") > 1)
    .select("order_id")
)

# Restrict df to those order_ids, sort by order_id then ingested_at (both
# descending), and drop the run_id column from the result.
duplicate_rows = df.join(dupe_ids, "order_id", "inner")
foo = duplicate_rows.sort(
    F.desc("order_id"),
    # F.desc("updated_ts"),
    F.desc("ingested_at"),
).drop("run_id")

peek(foo, "raw dup histories", n=100)
