In [0]:
# Declare the notebook's text widgets, read them back into variables, and echo
# the raw widget values.
from datetime import datetime

# Default for process_date: today's date (datetime.now() without a tz, i.e. the
# runtime's local clock) formatted as YYYY-MM-DD.
today_str = datetime.now().strftime("%Y-%m-%d")

# Widget name -> default text, listed in creation order.
_widget_defaults = {
    "env": "aws",
    "storage_root": "s3://pakeyj-data-sales",
    "process_date": today_str,
    "required_columns": "order_id,ingest_date,ingested_at",
    "raw_table": "raw_orders_daily",
    "bronze_table": "bronze_orders",
    "data_label": "orders",
}
for _widget_name, _widget_default in _widget_defaults.items():
    dbutils.widgets.text(_widget_name, _widget_default)

# Read the current widget values into the variables used by later cells.
env, storage_root, process_date = (
    dbutils.widgets.get(key) for key in ("env", "storage_root", "process_date")
)
# The comma-separated widget text becomes a list of trimmed, non-empty names.
required_columns = [
    col
    for col in map(str.strip, dbutils.widgets.get("required_columns").split(","))
    if col
]
raw_table, bronze_table, data_label = (
    dbutils.widgets.get(key) for key in ("raw_table", "bronze_table", "data_label")
)

# Echo every widget's raw text (required_columns is shown unsplit).
for _widget_name in _widget_defaults:
    print(f"{_widget_name}:", dbutils.widgets.get(_widget_name))

In [0]:
import importlib

import src.common.config as config

# Reload the module first so edits to it are picked up without restarting the
# kernel, and only THEN bind load_config. A name obtained via `from ... import`
# before a reload keeps pointing at the old function object, which would make
# the reload a no-op for the call below.
config = importlib.reload(config)
from src.common.config import load_config

# Build the run configuration from the widget values.
cfg = load_config(env=env, storage_root=storage_root, process_date=process_date)

# storage_root is replaced by the value held on the config object; process_date
# keeps the widget value (the override below is intentionally left disabled).
storage_root = cfg.storage_root
# process_date = cfg.process_date

# Resolve the input/output locations by table name from cfg.paths.
raw_path = cfg.paths[raw_table]
bronze_path= cfg.paths[bronze_table]

In [0]:
def peek(df, name, n=5):
    """Print a quick summary of a DataFrame for interactive inspection.

    Output order: the column list, a ``=== name ===`` banner, the row count,
    and finally the first ``n`` rows without truncating cell contents.

    Args:
        df: Object exposing ``columns``, ``count()`` and ``show(n, truncate=...)``.
        name: Label used in the banner line.
        n: Number of rows passed to ``df.show``.
    """
    print(df.columns)

    banner = f"\n=== {name} ==="
    print(banner)

    # count() is called after the banner is printed, so the banner appears even
    # if counting is slow or fails.
    row_total = df.count()
    print(f"rows: {row_total}")

    df.show(n, truncate=False)

# foo=spark.read.json(raw_path)
# peek(foo, 'raw')

In [0]:
import json

# Widgets to include in the dump, in output order.
widget_names = [
    "env", "storage_root", "process_date", "required_columns",
    "raw_table", "bronze_table", "data_label",
]

# Snapshot each widget's current raw text, keyed by widget name.
widgets_dict = dict((key, dbutils.widgets.get(key)) for key in widget_names)

# Pretty-print the snapshot as indented JSON.
print(json.dumps(widgets_dict, indent=2))


In [0]:
from pyspark.sql import functions as F

def assert_not_empty(df,process_date,*,dataset_name = 'dataset'):
  """Fail fast when ``df`` contains no rows.

  Args:
    df: Object exposing ``take(n)``; only a single row is requested.
    process_date: Value reported in the error message as the ingest date.
    dataset_name: Label prefixed to the error message.

  Raises:
    ValueError: If ``df.take(1)`` returns nothing.
  """
  first_rows = df.take(1)
  if first_rows:
    return
  raise ValueError(f"{dataset_name}: found no records for ingest_date: {process_date}")

def require_columns(df, required_cols, *, dataset_name = 'dataset'):
  """Verify that every required column is present on ``df``.

  Args:
    df: Object exposing a ``columns`` collection of column names.
    required_cols: Iterable of column names that must be present.
    dataset_name: Label prefixed to the error message.

  Raises:
    ValueError: If any required column is absent. The message lists the missing
      columns (in ``required_cols`` order) followed by the columns found.
  """
  # Read the column list once rather than once per required column.
  present = df.columns
  missing = [c for c in required_cols if c not in present]
  if missing:
    # The two adjacent f-strings are concatenated, so the separator must be
    # written explicitly to keep the message readable.
    raise ValueError(
      f"{dataset_name}: missing required columns: {missing}. "
      f"Found: {present}"
    )

def convert_to_delta_daily(df, bronze_path, process_date, *, dataset_name = 'dataset'):
  """Write ``df`` to ``bronze_path`` in Delta format for a single ingest date.

  The write uses overwrite mode with a ``replaceWhere`` predicate of
  ``ingest_date = '<process_date>'`` and partitions the output by
  ``ingest_date``.

  Args:
    df: DataFrame to persist; expected to expose ``write``.
    bronze_path: Destination path handed to ``save``.
    process_date: Value interpolated verbatim into the ``replaceWhere`` predicate.
    dataset_name: Accepted for signature parity with the other helpers; not used
      by this function.
  """
  # NOTE(review): process_date is embedded in the predicate without escaping;
  # callers should pass a trusted, well-formed date string.
  date_predicate = f"ingest_date = '{process_date}'"

  writer = df.write.format("delta").mode("overwrite")
  writer = writer.option("replaceWhere", date_predicate)
  writer = writer.partitionBy("ingest_date")
  writer.save(bronze_path)

# Read the raw JSON dataset and validate its schema BEFORE any column is
# referenced: filtering on ingest_date first would turn a missing column into a
# Spark analysis error instead of the descriptive ValueError from
# require_columns. Filtering does not change the column set, so the check is
# equivalent when the columns are present.
raw_df = spark.read.json(raw_path)
require_columns(raw_df, required_columns, dataset_name=data_label)

# Keep only the rows for the requested process date and make sure there are any.
df = raw_df.filter(F.col("ingest_date") == process_date)
assert_not_empty(df, process_date, dataset_name=data_label)

# Persist that day's rows to the bronze Delta location.
convert_to_delta_daily(df, bronze_path, process_date, dataset_name=data_label)
