In [0]:
# --- Landing -> Bronze ETL (append) + metadata/run-log ---
from pyspark.sql import DataFrame
from pyspark.sql.functions import lit, current_timestamp, current_date
from pyspark.sql.types import TimestampType
import re, uuid, time

# Configuration: where the raw files land and which catalog/schema receives them
landing_path = "/Volumes/leelastestdata/default/rawdata/landing/"
catalog, schema = "leelastestdata", "default"
# Control tables share the data schema here; point this at a dedicated
# schema (e.g. "etl_control") to keep them separate.
control_schema = schema

# Fully qualified names of the metadata and run-log tables
meta_table = ".".join((catalog, control_schema, "table_last_processed"))
runlog_table = ".".join((catalog, control_schema, "etl_run_log"))

# Helper readers
def read_file(path: str, fname: str) -> "DataFrame | None":
    """Load one landing file into a DataFrame, picking the reader by extension.

    Args:
        path: Location handed to the Spark reader.
        fname: File name; only its extension is inspected, case-insensitively.

    Returns:
        A DataFrame for ``.csv``, ``.json`` or ``.parquet`` files, or ``None``
        when the extension is not one of those. Callers must check for
        ``None`` before using the result.
    """
    lower = fname.lower()
    if lower.endswith(".csv"):
        # CSV is read with a header row and Spark-inferred column types.
        return spark.read.option("header", "true").option("inferSchema", "true").csv(path)
    if lower.endswith(".json"):
        return spark.read.json(path)
    if lower.endswith(".parquet"):
        return spark.read.parquet(path)
    # Unsupported extension: signal "nothing to load" instead of raising.
    return None

# Bootstrap the control tables: any that is missing is created as an empty
# Delta table with the schema given by its DDL string.
for _control_table, _control_ddl in (
    (meta_table, "source_table string, last_processed_timestamp timestamp, last_processed_file string, last_updated timestamp"),
    (runlog_table, "run_id string, source_table string, start_ts timestamp, end_ts timestamp, status string, rows_read long, message string"),
):
    if spark.catalog.tableExists(_control_table):
        continue
    _empty_df = spark.createDataFrame([], _control_ddl)
    _empty_df.write.format("delta").mode("overwrite").saveAsTable(_control_table)

# List landing files
files = dbutils.fs.ls(landing_path)

_RUNLOG_COLUMNS = ["run_id", "source_table", "start_ts", "end_ts", "status", "rows_read", "message"]
_META_COLUMNS = ["source_table", "last_processed_timestamp", "last_processed_file", "last_updated"]


def _spark_now():
    """Return the Spark session's current timestamp as a Python datetime."""
    return spark.sql("select current_timestamp() as ts").collect()[0]["ts"]


def _safe_table_base(file_name):
    """Turn a landing file name into a lower-case table-name stem.

    The last extension is dropped, then every character outside ``[a-z0-9_]``
    is replaced by ``_`` so the stem can be embedded in SQL and used as an
    unquoted table identifier. Names that already consist of letters, digits
    and underscores are returned unchanged (apart from lower-casing).
    """
    stem = re.sub(r"\.[^.]+$", "", file_name).lower()
    return re.sub(r"[^a-z0-9_]", "_", stem)


def _log_run(run_id, source_table, start_ts, end_ts, status, rows_read, message):
    """Append a single row describing one file's run to the run-log table."""
    spark.createDataFrame(
        [(run_id, source_table, start_ts, end_ts, status, rows_read, message)],
        _RUNLOG_COLUMNS,
    ).write.format("delta").mode("append").saveAsTable(runlog_table)


def _record_last_processed(source_table, processed_ts, processed_file):
    """Upsert the last-processed record of one bronze table in the metadata table."""
    update_df = spark.createDataFrame(
        [(source_table, processed_ts, processed_file, processed_ts)],
        _META_COLUMNS,
    )
    update_df.createOrReplaceTempView("_table_last_processed_update")
    # One MERGE keeps the upsert atomic: there is no window in which the
    # record has been deleted but not yet re-inserted.
    spark.sql(f"""
        MERGE INTO {meta_table} AS tgt
        USING _table_last_processed_update AS src
        ON tgt.source_table = src.source_table
        WHEN MATCHED THEN UPDATE SET *
        WHEN NOT MATCHED THEN INSERT *
    """)


# Each supported file is appended to its own "<stem>_bronze" Delta table.
# Every attempt is written to the run log; successful loads also refresh the
# last-processed metadata record. A failure on one file never stops the loop.
for f in files:
    file_path = f.path
    file_name = f.name
    table_base = _safe_table_base(file_name)
    bronze_table = f"{catalog}.{schema}.{table_base}_bronze"
    run_id = str(uuid.uuid4())
    start_ts = _spark_now()
    # Initialised before the try so a FAILED entry can report the row count
    # when the failure happens after counting.
    rows_read = 0

    print(f"\n=== Processing file: {file_name} -> {bronze_table} ===")

    try:
        df = read_file(file_path, file_name)
        if df is None:
            print(f"Skipping unsupported file: {file_name}")
            continue

        # Add audit columns (use file_path; avoid input_file_name)
        df = (
            df.withColumn("source_file", lit(file_path))
            .withColumn("load_timestamp", current_timestamp())
            .withColumn("load_date", current_date())
        )

        rows_read = df.count()
        if rows_read == 0:
            print(f"Skipping empty file: {file_name}")
            # log run as SKIPPED / zero rows
            _log_run(run_id, bronze_table, start_ts, _spark_now(), "SKIPPED", 0, "empty file")
            continue

        # If bronze table doesn't exist, create it partitioned by load_date for faster pruning
        if not spark.catalog.tableExists(bronze_table):
            print(f"Creating bronze table: {bronze_table} (partitioned by load_date)")
            df.write.format("delta").mode("overwrite").option("mergeSchema", "true").partitionBy("load_date").saveAsTable(bronze_table)
        else:
            df.write.format("delta").mode("append").option("mergeSchema", "true").saveAsTable(bronze_table)

        # Update metadata: last processed timestamp & file for this bronze table
        current_ts = _spark_now()
        _record_last_processed(bronze_table, current_ts, file_path)

        # log success
        _log_run(run_id, bronze_table, start_ts, current_ts, "SUCCESS", rows_read, f"file:{file_name}")

        print(f"✔ Loaded {rows_read} rows into {bronze_table}")
    except Exception as e:
        print(f"Error processing {file_name}: {e}")
        try:
            _log_run(run_id, bronze_table, start_ts, _spark_now(), "FAILED", rows_read, str(e))
        except Exception as log_err:
            # A broken run log must not prevent the remaining files from loading.
            print(f"Could not write FAILED run-log entry for {file_name}: {log_err}")
        # continue to next file


In [0]:
# Print the partitions of the applications bronze table.
(
    spark
    .sql("SHOW PARTITIONS leelastestdata.default.applications_bronze")
    .show()
)