In [0]:
from pyspark.sql.functions import to_date
# Look up the most recent successful bronze load recorded in the audit table.
last_load_date_row = spark.sql("""
       SELECT MAX(last_load_date) AS last_load_date --suppose we are taking business_date
       FROM edl_hc_datamart.audit.audit_ingestion
       WHERE pipeline_name = 'bronze_ingest_pipeline' AND last_status = 'Success'
   """).collect()

# MAX() yields NULL when nothing matches; treat NULL (or an empty result) as "never loaded".
last_load_date = None
if last_load_date_row and last_load_date_row[0]['last_load_date']:
    last_load_date = last_load_date_row[0]['last_load_date']

# Use a sentinel date when there is no previous successful load.
formatted_date = last_load_date.strftime("%Y-%m-%d") if last_load_date else "1900-01-01"
display(formatted_date)

# Expose the date as a notebook widget and read the effective value back.
# NOTE(review): if the widget already exists with a user-entered value, the value read
# back may differ from formatted_date — confirm this is the intended override mechanism.
dbutils.widgets.text("load_date", formatted_date)
load_date = dbutils.widgets.get("load_date")
displayHTML(load_date)

In [0]:
#begin
from pyspark.sql import functions as F
from pyspark.sql import types as T
import json, ast

# --- ADD: imports for date handling ---
import re
from datetime import datetime

# Unity Catalog catalog that is created/selected below and prefixed onto every bronze table name.
CATALOG  = "edl_hc_datamart"
# Schema created inside CATALOG and made current via USE.
SCHEMA   = "bronze"
# CSV (with header row) read into meta_df; each row is handled as one source by the ingestion loop.
META_CSV = "/Volumes/edl_hc_datamart/config/metadata_config/ingestion_metadata.csv"

# --- ADD: date parsing and parameter retrieval ---
def parse_load_date(value: str) -> str:
    """Validate a ``yyyy-MM-dd`` date string and return it in normalized form.

    Args:
        value: Candidate date string.

    Returns:
        The date as a zero-padded ASCII ``YYYY-MM-DD`` string.

    Raises:
        ValueError: If ``value`` is empty, is not exactly ``yyyy-MM-dd`` written with
            ASCII digits, or does not name a real calendar date (e.g. ``2024-02-30``).
    """
    if not value:
        raise ValueError("load_date is empty.")
    # fullmatch + [0-9]: reject trailing newlines and non-ASCII digits, both of which
    # `^\d...$` with re.match would let through to strptime.
    if not re.fullmatch(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", value):
        raise ValueError(f"Invalid load_date '{value}'. Expected format yyyy-MM-dd.")
    # strptime rejects impossible dates; isoformat() re-emits the canonical spelling.
    return datetime.strptime(value, "%Y-%m-%d").date().isoformat()

# --- load_date value early for logging & reuse ---
# `load_date` is the widget value read in the first cell. Validate it here so a malformed
# value fails fast instead of being stamped onto every row as _meta_load_date.
LOAD_DATE = parse_load_date(load_date)
print(f"[BRONZE] Using load_date: {LOAD_DATE}")

def trim_columns(df):
    """Return ``df`` with leading/trailing whitespace stripped from every column name.

    Columns are renamed positionally with ``toDF`` rather than via ``F.col(name).alias(...)``,
    so names containing ``.`` or backticks are not parsed as column expressions.
    """
    return df.toDF(*[name.strip() for name in df.columns])

def parse_options(raw, fmt, landed_path):
    fmt = (fmt or "csv").strip().lower()
    if not raw or raw.strip() == "":
        raw = ""
    s = (raw.strip()
         .replace('“','"').replace('”','"')
         .replace("‘","'").replace("’","'"))
    # 1) JSON
    try:
        return json.loads(s)
    except: pass
    # 2) Python literal dict
    try:
        lit = ast.literal_eval(s)
        if isinstance(lit, dict):
            return lit
    except: pass
    # 3) key=value;key=value
    try:
        opts = {}
        for part in s.split(';'):
            if '=' in part:
                k, v = part.split('=', 1)
                k = k.strip(); v = v.strip()
                if v.lower() in ('true','false'):
                    opts[k] = (v.lower() == 'true')
                else:
                    opts[k] = v.strip('"').strip("'")
        if opts: return opts
    except: pass
    # 4) defaults
    if fmt == "csv":
        if landed_path and landed_path.lower().endswith(".tsv"):
            return {"header": True, "sep": "\t", "inferSchema": True}
        return {"header": True, "inferSchema": True}
    if fmt == "json":
        return {"multiline": True}
    if fmt == "tsv":
        return {"header": True, "sep": "\t", "inferSchema": True}
    return {}

def source_file_col(df, landed_path: str):
    """Return a Column holding the source file path for each row of ``df``.

    Prefers the file-source metadata field ``_metadata.file_path``. ``_metadata`` is a
    hidden column and is not listed in ``df.columns``, so availability is probed by
    resolving the column against ``df`` instead of checking the column list.
    Falls back to the literal ``landed_path`` when the field cannot be resolved.

    Args:
        df: DataFrame the returned column will be applied to.
        landed_path: Path used as the fallback literal value.
    """
    candidate = F.col("_metadata.file_path")
    try:
        # Touching .schema forces analysis of the projection without running a Spark job.
        df.select(candidate).schema
    except Exception as exc:  # best effort: any resolution failure means "not available"
        print(f"[INFO] _metadata.file_path not available ({type(exc).__name__}); using landed_path")
        return F.lit(landed_path)
    return candidate

def ingestion_ts_col(ing_str: str):
    """Build a timestamp Column from the metadata ``ingestion_ts`` string.

    Every ``T`` in the text is replaced by a space before parsing with
    ``try_to_timestamp``; when parsing yields NULL (including for a missing or
    empty string) the current timestamp is used instead.
    """
    raw_value = F.lit(ing_str or "")
    parsed = F.try_to_timestamp(F.regexp_replace(raw_value, r"T", " "))
    return F.coalesce(parsed, F.current_timestamp())

# -----------------------------------------
# UC catalog/schema: create if missing, then make them current
# -----------------------------------------
for statement in (
    f"CREATE CATALOG IF NOT EXISTS {CATALOG}",
    f"CREATE SCHEMA  IF NOT EXISTS {CATALOG}.{SCHEMA}",
    f"USE CATALOG {CATALOG}",
    f"USE {SCHEMA}",
):
    spark.sql(statement)

# ------------------------
# Read metadata (header row supplies column names; names are whitespace-trimmed)
# ------------------------
meta_reader = spark.read.option("header", True)
meta_df = trim_columns(meta_reader.csv(META_CSV))

# -----------------------------------------------------
# Helper: flatten the nested 'compensation' column
# -----------------------------------------------------
def _flatten_compensation(df):
    """Expand ``compensation.salary.*`` into ``salary_*`` columns and drop ``compensation``.

    A struct-typed column is read directly; a string-typed column is parsed as JSON
    with an explicit schema. Any other type is left untouched (a warning is printed).
    Returns ``df`` unchanged when it has no ``compensation`` column.
    """
    if "compensation" not in df.columns:
        return df

    comp_field = next((f for f in df.schema.fields if f.name == "compensation"), None)
    comp_dt = comp_field.dataType if comp_field else None
    print(f"[INFO] compensation datatype: {comp_dt}")

    if isinstance(comp_dt, T.StructType):
        return (df
            .withColumn("salary_amount",         F.col("compensation.salary.amount").cast("double"))
            .withColumn("salary_currency",       F.col("compensation.salary.currency"))
            .withColumn("salary_frequency",      F.col("compensation.salary.frequency"))
            .withColumn("salary_effective_from", F.to_date(F.col("compensation.salary.effective_from")))
            .withColumn("salary_effective_to",   F.to_date(F.col("compensation.salary.effective_to")))
            .drop("compensation")
        )

    if isinstance(comp_dt, T.StringType):
        comp_schema = T.StructType([
            T.StructField("salary", T.StructType([
                T.StructField("effective_to",   T.StringType(), True),
                T.StructField("amount",         T.DoubleType(), True),
                T.StructField("currency",       T.StringType(), True),
                T.StructField("effective_from", T.StringType(), True),
                T.StructField("frequency",      T.StringType(), True),
            ]), True)
        ])
        return (df
            .withColumn("compensation_struct", F.from_json(F.col("compensation"), comp_schema))
            .withColumn("salary_amount",         F.col("compensation_struct.salary.amount"))
            .withColumn("salary_currency",       F.col("compensation_struct.salary.currency"))
            .withColumn("salary_frequency",      F.col("compensation_struct.salary.frequency"))
            .withColumn("salary_effective_from", F.to_date(F.col("compensation_struct.salary.effective_from")))
            .withColumn("salary_effective_to",   F.to_date(F.col("compensation_struct.salary.effective_to")))
            .drop("compensation_struct")
            .drop("compensation")
        )

    print(f"[WARN] Unexpected compensation type: {comp_dt}. Skipping flattening.")
    return df

# ---------------------------
# Loop metadata rows: one bronze table per file source
# ---------------------------
for m in meta_df.collect():
    md = {k: m[k] for k in meta_df.columns}

    source_name  = md.get('source_name')
    landed_path  = md.get('landed_path')
    bronze_table = md.get('bronze_table')
    source_type  = (md.get('source_type') or "file").lower()
    fmt          = (md.get('format') or 'csv').lower()
    options      = parse_options(md.get('options'), fmt, landed_path)

    if not source_name or not bronze_table:
        print(f"Skipping row: missing source_name or table {md}")
        continue

    # NOTE(review): only CATALOG is prefixed, so bronze_table presumably already carries
    # its schema (e.g. "bronze.employees") — confirm against the metadata CSV.
    full_table = f"{CATALOG}.{bronze_table}"
    print(f"\nProcessing {source_name} (type={source_type}) -> {full_table}")

    # Only process file sources in this run
    if source_type != "file":
        print(f"Skipping non-file source: {source_type} ({source_name})")
        continue

    if not landed_path:
        print(f"Skipping row: missing landed_path for file source {md}")
        continue

    # ----- OPTIONAL: if your landed paths are partitioned by date, e.g. /raw/hr/YYYY-MM-DD -----
    # If so, you can derive a dated path:
    # landed_path = f"{landed_path.rstrip('/')}/{LOAD_DATE}"

    # ----- read dataset -----
    reader = spark.read
    for k, v in options.items():
        reader = reader.option(k, v)
    df = reader.format(fmt).load(landed_path)

    # ----- incremental filter -----
    # `last_load_date` comes from the audit lookup in the first cell (None on a first run).
    # Only filter sources that actually have a business_date column; previously a source
    # without it raised an AnalysisException as soon as last_load_date was set.
    if md.get('pipeline_name') == 'bronze_ingest_pipeline' and last_load_date:
        if any(c.lower() == "business_date" for c in df.columns):
            df = df.filter(df['business_date'] > last_load_date)
        else:
            print(f"[WARN] {source_name}: no business_date column; loading all rows")

    # Print schema for visibility
    print("[SCHEMA] Input DF:")
    # df.printSchema()

    # ----- flatten nested compensation (struct or JSON string) -----
    df = _flatten_compensation(df)

    # ----------------------------
    # Metadata enrichment
    # ----------------------------
    file_path = source_file_col(df, landed_path)
    ing_ts    = ingestion_ts_col(md.get('ingestion_ts'))

    # The trailing drop removes `compensation` when its type was unexpected and the
    # flattening step left it in place.
    df_enriched = (df
        .withColumn("_meta_pipeline_name",  F.lit(md.get('pipeline_name')))
        .withColumn("_meta_source_name",    F.lit(source_name))
        .withColumn("_meta_batch_id",       F.lit(md.get('batch_id')))
        .withColumn("_meta_run_id",         F.lit(md.get('run_id')))
        .withColumn("_meta_schema_version", F.lit(md.get('schema_version')))
        .withColumn("_meta_producer_system",F.lit(md.get('producer_system')))
        .withColumn("_meta_ingestion_user", F.lit(md.get('ingestion_user')))
        .withColumn("_meta_ingestion_ts",   ing_ts)
        .withColumn("_meta_source_file",    file_path)
        .withColumn("_meta_load_time",      F.current_timestamp())
        .withColumn("_meta_load_date",      F.lit(LOAD_DATE))
    ).drop("compensation")

    # ----------------------------
    # Write to UC table (overwrite as per your original)
    # ----------------------------
    # NOTE(review): when the incremental filter above is active, "overwrite" replaces the
    # table with only the new rows. Confirm whether append/merge is intended for incremental runs.
    (df_enriched.write
        .mode("overwrite")
        .format("delta")
        .option("mergeSchema", "true")
        .option("overwriteSchema", "true")
        .saveAsTable(full_table)
    )

    print(f"Written to {full_table} for load_date={LOAD_DATE}")


In [0]:
from pyspark.sql.functions import to_date
# Highest business_date currently present in the bronze employees table.
# NOTE(review): this cell reassigns `last_load_date` and `load_date`, which the first cell
# also sets — re-running earlier cells afterwards will see these values instead.
last_load_date_row = spark.sql("""
       SELECT MAX(business_date) AS load_date
       FROM edl_hc_datamart.bronze.employees
   """)
# A global aggregate always yields exactly one row (NULL on an empty table), so collect
# once instead of running an extra count() job first.
_max_rows = last_load_date_row.collect()
last_load_date = _max_rows[0]['load_date'] if _max_rows else None

# Format as string (YYYY-MM-DD); empty string when there is no value.
if not last_load_date:
    load_date = ""
elif hasattr(last_load_date, "strftime"):
    load_date = last_load_date.strftime("%Y-%m-%d")
else:
    # business_date was ingested as text; assumes it starts with YYYY-MM-DD — TODO confirm
    load_date = str(last_load_date)[:10]

display(load_date)

In [0]:
from datetime import datetime, date, timezone

# Fully qualified table that update_audit_ingestion appends one row to per call.
AUDIT_TABLE = "edl_hc_datamart.audit.audit_ingestion"

def update_audit_ingestion(md, records_read, records_written, status, columns_loaded, layer, error_message=None):
    """Append one audit row describing an ingestion attempt to AUDIT_TABLE.

    Args:
        md: Metadata row as a dict; values are read with ``.get`` and stringified,
            with missing/empty values stored as ``""``.
        records_read: Number of rows read (coerced to int).
        records_written: Number of rows written (coerced to int).
        status: Stored as ``last_status``.
        columns_loaded: Accepted for call-site compatibility; not written to the audit row.
        layer: Accepted for call-site compatibility; not written to the audit row.
        error_message: Optional error text; when truthy ``error_count`` is 1, otherwise 0.

    The notebook-level ``load_date`` string (``YYYY-MM-DD``) is stored as ``last_load_date``.
    When it is undefined, empty, or not a valid ISO date the field is left out of the row
    instead of raising, so the audit row (in particular a "Failure" row) is still written.

    ``run_start_ts``, ``run_end_ts``, ``created_at`` and ``updated_at`` all carry the time
    of this call (UTC) and ``duration_ms`` is always 0.
    """
    now = datetime.now(timezone.utc)

    def _text(key, default=""):
        # Stringify a metadata value, mapping None/empty to the default.
        return str(md.get(key) or default)

    # attempt: default to 1 when absent, blank, or not an integer.
    raw_attempt = md.get('attempt')
    try:
        attempt_val = 1 if raw_attempt is None or raw_attempt == "" else int(raw_attempt)
    except (TypeError, ValueError, OverflowError):
        attempt_val = 1

    audit_dict = {
        "pipeline_name":            _text('pipeline_name'),
        "source_type":              _text('source_type', "file"),
        "source_name":              _text('source_name'),
        "bronze_table":             _text('bronze_table'),
        "batch_id":                 _text('batch_id'),
        "run_id":                   _text('run_id'),
        "trigger_type":             _text('trigger_type'),
        "attempt":                  attempt_val,
        "run_start_ts":             now,
        "run_end_ts":               now,
        "duration_ms":              0,
        "last_status":              str(status),
        "records_read":             int(records_read),
        "records_written":          int(records_written),
        "error_count":              1 if error_message else 0,
        "error_message":            str(error_message or ""),
        "watermark_col":            _text('watermark_col'),
        "last_success_watermark_value": _text('last_success_watermark_value'),
        "current_run_high_watermark_value": _text('current_run_high_watermark_value'),
        "file_checkpoint_path":     _text('file_checkpoint_path'),
        "schema_version_applied":   _text('schema_version'),
        "producer_system":          _text('producer_system'),
        "ingestion_user":           _text('ingestion_user'),
        "notes":                    _text('notes'),
        "created_at":               now,
        "updated_at":               now
    }

    # last_load_date comes from the notebook-level `load_date`. A None value cannot be
    # type-inferred by createDataFrame, so the key is omitted when no valid date exists.
    try:
        raw_load_date = load_date
    except NameError:
        raw_load_date = None
    if raw_load_date:
        try:
            audit_dict["last_load_date"] = date.fromisoformat(str(raw_load_date))
        except ValueError:
            print(f"[WARN] load_date '{raw_load_date}' is not a valid ISO date; last_load_date not recorded")
    else:
        print("[WARN] load_date is empty; last_load_date not recorded")

    audit_df = spark.createDataFrame([audit_dict])
    audit_df.write.mode("append").option("mergeSchema", "true").format("delta").saveAsTable(AUDIT_TABLE)

# --- Integrate audit update in main loop ---
# Re-reads every file source, writes it to its bronze table and records one audit row
# (Success or Failure) per source.
# NOTE(review): this writes the raw DataFrame over the table already written by the
# ingestion loop earlier in the notebook (no compensation flattening, no _meta_* columns).
# Confirm whether the audit call should live inside that loop instead of re-loading here.
for m in meta_df.collect():
    md = {k: m[k] for k in meta_df.columns}
    source_name  = md.get('source_name')
    landed_path  = md.get('landed_path')
    bronze_table = md.get('bronze_table')
    source_type  = (md.get('source_type') or "file").lower()
    fmt          = (md.get('format') or 'csv').lower()
    options      = parse_options(md.get('options'), fmt, landed_path)

    # Same skip rules as the ingestion loop (which already reports them): without these a
    # row lacking bronze_table targets "<catalog>.None", and non-file or path-less rows
    # fail on load() and produce spurious "Failure" audit rows.
    if not source_name or not bronze_table:
        continue
    if source_type != "file":
        continue
    if not landed_path:
        continue

    full_table = f"{CATALOG}.{bronze_table}"
    layer = "bronze"

    try:
        reader = spark.read
        for k, v in options.items():
            reader = reader.option(k, v)
        df = reader.format(fmt).load(landed_path)
        records_read = df.count()

        # Insert records for all bronze tables (employees, departments, etc.)
        (df.write
            .mode("overwrite")
            .option("mergeSchema", "true")
            .format("delta")
            .saveAsTable(full_table)
        )
        # Count what actually landed in the table instead of re-counting the source.
        records_written = spark.table(full_table).count()
        columns_loaded = ",".join(df.columns)
    except Exception as e:
        # Report the load error first so it stays visible even if the audit write fails too.
        print(f"[ERROR] Failed to load {source_name}: {e}")
        update_audit_ingestion(md, 0, 0, "Failure", "", layer, error_message=str(e))
    else:
        # Kept outside the try so an audit-write problem is not recorded as a load failure.
        update_audit_ingestion(md, records_read, records_written, "Success", columns_loaded, layer)