## Audit Configuration

In [0]:
import dlt
import json
from pyspark.sql.functions import *
from pyspark.sql.types import *
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pyspark.sql.window import Window
import uuid
from delta.tables import DeltaTable

In [0]:
# audit_mnt = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/AUDIT/TD"
audit_delta_path = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/AUDIT/TD/td_cr_audit_table"

In [0]:


def datetime_uuid():
    """
    Derive a run identifier from the current UTC time.

    The current UTC timestamp is rendered at one-second resolution and hashed
    into a name-based (version 5) UUID under the DNS namespace, so two calls
    made within the same second yield the same identifier.

    :return: The UUID in its canonical 36-character string form.
    """
    stamp = f"{datetime.utcnow():%Y-%m-%d %H:%M:%S}"
    run_uuid = uuid.uuid5(uuid.NAMESPACE_DNS, stamp)
    return str(run_uuid)

# Identifier stamped on every audit row produced by this notebook run.
run_id_value = datetime_uuid()

# Layout of the audit Delta table; every column is nullable.
audit_schema = (
    StructType()
    .add("Runid", StringType(), True)
    .add("Unique_identifier_desc", StringType(), True)
    .add("Unique_identifier", StringType(), True)
    .add("Table_name", StringType(), True)
    .add("Stage_name", StringType(), True)
    .add("Record_count", IntegerType(), True)
    .add("Run_dt", TimestampType(), True)
    .add("Batch_id", StringType(), True)
    .add("Description", StringType(), True)
    .add("File_name", StringType(), True)
    .add("Status", StringType(), True)
)

def create_audit_df(df: DataFrame, unique_identifier_desc: str, table_name: str, stage_name: str, description: str, additional_columns: list = None) -> DataFrame:
    """
    Build an audit DataFrame with one row per distinct identifier value.

    The identifier column (plus any additional columns) is selected from
    ``df``, decorated with run metadata, and grouped so that ``Record_count``
    holds the number of input rows sharing the same values. The result is only
    returned; persisting it is left to the caller.

    :param df: Input DataFrame; must contain ``unique_identifier_desc`` and every column named in ``additional_columns``.
    :param unique_identifier_desc: Name of the column in ``df`` that holds the unique identifier; also stored as a literal in the ``Unique_identifier_desc`` column.
    :param table_name: Name of the source table, stored in ``Table_name``.
    :param stage_name: Name of the data processing stage, stored in ``Stage_name``.
    :param description: Description of the table, stored in ``Description``.
    :param additional_columns: Optional list of column names in ``df`` to carry into the audit rows; ``None`` entries are ignored.
    :return: DataFrame with columns ``Unique_identifier``, the additional columns, ``Runid``, ``Unique_identifier_desc``, ``Stage_name``, ``Table_name``, ``Run_dt``, ``Description`` and ``Record_count``.
    """
    # One timestamp per call so every row of this table's audit shares the same Run_dt.
    run_dt = datetime.utcnow()

    extra_cols = [col(c) for c in (additional_columns or []) if c is not None]

    # The alias and string cast follow audit_schema ("Unique_identifier",
    # StringType) so these rows line up with schema-built rows when unioned
    # by name, independent of Spark's case-sensitivity setting.
    audit_df = (
        df.select(col(unique_identifier_desc).cast("string").alias("Unique_identifier"), *extra_cols)
        .withColumn("Runid", lit(run_id_value))
        .withColumn("Unique_identifier_desc", lit(unique_identifier_desc))
        .withColumn("Stage_name", lit(stage_name))
        .withColumn("Table_name", lit(table_name))
        .withColumn("Run_dt", lit(run_dt).cast(TimestampType()))
        .withColumn("Description", lit(description))
    )

    # Grouping on every column collapses duplicate rows into one row with a count.
    final_audit_df = audit_df.groupBy(*audit_df.columns).agg(
        count("*").cast(IntegerType()).alias("Record_count")
    )

    return final_audit_df

In [0]:
# Make sure a Delta table exists at audit_delta_path; when it does not,
# create it empty with audit_schema.
audit_table_exists = DeltaTable.isDeltaTable(spark, audit_delta_path)

if audit_table_exists:
    print(f"⚡ Delta table '{audit_delta_path}' already exists.")
else:
    print(f"🛑 Delta table '{audit_delta_path}' does not exist. Creating an empty Delta table...")

    # Writing a zero-row DataFrame in Delta format creates the table with the audit schema.
    schema_only_df = spark.createDataFrame([], audit_schema)
    schema_only_df.write.format("delta").mode("overwrite").save(audit_delta_path)

    print("✅ Empty Delta table successfully created in Azure Storage.")

In [0]:
# One entry per table to audit. Keys read by the audit loop:
#   unique_identifier_cols - columns whose values identify a record; several
#                            columns are joined with "_" into one identifier.
#   table_name             - table name inside hive_metastore.ariadm_arm_td.
#   stage_name             - pipeline stage label stored on the audit rows.
#   description            - free-text description stored on the audit rows.
#   Extra_columns_mapping  - optional; audit column name -> source column to
#                            copy it from. The value "NotYetBatched" is
#                            written as a literal instead of being read from
#                            a column.
audit_params = [
        {
        "unique_identifier_cols": ["CaseNo", "Forenames", "Name"],
        "table_name": "bronze_ac_ca_ant_fl_dt_hc",
        "stage_name": "bronze_stage",
        "description": "The bronze_ac_ca_ant_fl_dt_hc table Delta Live Table combining Appeal Case data with Case Appellant, Appellant, File Location, Department, and Hearing Centre."
    },
    {
        "unique_identifier_cols": ["CaseNo", "Forenames", "Name"],
        "table_name": "bronze_iris_extract",
        "stage_name": "bronze_stage",
        "description": "Delta Live Table extracted from the IRIS Tribunal decision file extract."
    },
    {
        "unique_identifier_cols": ["CaseNo"],
        "table_name": "stg_td_filtered",
        "stage_name": "segmentation_stage",
        "description": "The stg_td_filtered - segmentation Table for appeal cases requiring tribunal decisions with unique list of CaseNo's"
    },
    {
        "unique_identifier_cols": ["CaseNo", "Forenames", "Name"],
        "table_name": "silver_tribunaldecision_detail",
        "stage_name": "silver_stage",
        "description": "The silver_tribunaldecision_detail - or Tribunal Decision information"
    },
    {
        "unique_identifier_cols": ["client_identifier", "bf_002", "bf_003"],
        "table_name": "silver_archive_metadata",
        "stage_name": "silver_stage",
        "description": "The silver_archive_metadata table consolidates keys metadata for Archive Metadata da"
    },
    {
        "unique_identifier_cols": ["CaseNo", "Forenames", "Name"],
        "table_name": "stg_create_td_iris_json_content",
        "stage_name": "silver_stage",
        "description": "The stg_create_td_iris_json_content table generates JSON content for TD cases",
        "Extra_columns_mapping": {"File_name": "JSONFileName", "Status": "JSONStatus"}
    },
    {
        "unique_identifier_cols": ["CaseNo", "Forenames", "Name"],
        "table_name": "stg_create_td_iris_html_content",
        "stage_name": "silver_stage",
        "description": "The stg_create_td_iris_html_content table generates HTML content for TD cases",
        "Extra_columns_mapping": {"File_name": "HTMLFileName", "Status": "HTMLStatus"}
    },
    {
        "unique_identifier_cols": ["client_identifier", "bf_002", "bf_003"],
        "table_name": "stg_create_td_iris_a360_content",
        "stage_name": "silver_stage",
        "description": "The stg_create_td_iris_a360_content table generates A360 content for TD cases",
        "Extra_columns_mapping": {"File_name": "NotYetBatched", "Status": "A360Status"}
    },
    {
        "unique_identifier_cols": ["CaseNo", "Forenames", "Name"],
        "table_name": "stg_td_iris_unified",
        "stage_name": "silver_stage",
        "description": "The stg_td_iris_unified table generates A360 BatchId for TD cases",
        "Extra_columns_mapping": {"File_name": "A360FileName", "Status": "A360Status"}
    },
    # NOTE(review): the three gold_* entries below are labelled "silver_stage",
    # and the json/a360 descriptions mention HTML outputs - this looks like
    # copy-paste; confirm the intended stage name and wording before changing
    # them, since these strings are persisted to the audit table.
    {
        "unique_identifier_cols": ["CaseNo", "Forenames", "Name"],
        "table_name": "gold_td_iris_with_html",
        "stage_name": "silver_stage",
        "description": "The gold_td_iris_with_html with HTML Outputs Uploded..",
        "Extra_columns_mapping": {"File_name": "HTMLFileName", "Status": "UploadStatus"}
    },
    {
        "unique_identifier_cols": ["CaseNo", "Forenames", "Name"],
        "table_name": "gold_td_iris_with_json",
        "stage_name": "silver_stage",
        "description": "The gold_td_iris_with_json with HTML Outputs Uploded..",
        "Extra_columns_mapping": {"File_name": "JSONFileName", "Status": "UploadStatus"}
    },
    {
        "unique_identifier_cols": ["A360BatchId"],
        "table_name": "gold_td_iris_with_a360",
        "stage_name": "silver_stage",
        "description": "The gold_td_iris_with_a360 with HTML Outputs Uploded..",
        "Extra_columns_mapping": {"File_name": "A360FileName", "Status": "UploadStatus"}
    }
]

# Build one audit DataFrame per entry in audit_params, then union them by
# column name into df_final_audit. A failure on one table is recorded as a
# "Failed" audit row instead of aborting the whole run.
audit_dataframes = []

for params in audit_params:
    table_name = params["table_name"]
    stage_name = params["stage_name"]
    unique_identifier_cols = params["unique_identifier_cols"]
    description = params["description"]
    extra_columns_mapping = params.get("Extra_columns_mapping", {})
    # e.g. ["CaseNo", "Forenames", "Name"] -> "CaseNo_Forenames_Name"
    unique_identifier_desc = "_".join(unique_identifier_cols)

    # Lets the error handler tell "table could not be read" apart from a
    # failure that happened after the table was loaded.
    table_loaded = False

    try:
        df_audit = spark.read.table(f"hive_metastore.ariadm_arm_td.{table_name}")
        table_loaded = True

        # A composite key is flattened into a single string column named after
        # the joined key. A single-column key already exists under that name,
        # so nothing needs to be added for it.
        if len(unique_identifier_cols) > 1:
            df_audit = df_audit.withColumn(
                unique_identifier_desc,
                concat_ws("_", *[col(c).cast("string") for c in unique_identifier_cols])
            )

        # Apply extra column mappings dynamically. Values are cast to string
        # because audit_schema declares File_name and Status as StringType.
        for new_col, source_col in extra_columns_mapping.items():
            if source_col == "NotYetBatched":
                df_audit = df_audit.withColumn(new_col, lit("NotYetBatched"))
            else:
                df_audit = df_audit.withColumn(new_col, col(source_col).cast("string"))

        # Generate the audit DataFrame. The mapped columns must be handed over
        # explicitly; otherwise create_audit_df selects only the identifier
        # and the mapped values never reach the audit rows.
        df_audit_appended = create_audit_df(
            df_audit,
            unique_identifier_desc=unique_identifier_desc,
            table_name=table_name,
            stage_name=stage_name,
            description=description,
            additional_columns=list(extra_columns_mapping)
        )

        audit_dataframes.append(df_audit_appended)

    except Exception as e:
        # Best effort: keep auditing the remaining tables, but surface the
        # real cause in the notebook output instead of discarding it.
        print(f"Audit failed for table '{table_name}': {e}")

        if table_loaded:
            # The table was read, so something else went wrong (for example a
            # mapped source column is missing).
            status = f"Failed - Error auditing table {table_name}: {type(e).__name__}"
        else:
            status = f"Failed - Table {table_name} does not exist"

        row_data = {
            "Runid": run_id_value,
            "Unique_identifier_desc": unique_identifier_desc,
            "Unique_identifier": None,
            "Table_name": table_name,
            "Stage_name": stage_name,
            "Record_count": 0,
            # UTC, matching the Run_dt that create_audit_df stamps on successful rows.
            "Run_dt": datetime.utcnow(),
            "Batch_id": None,
            "Description": description,
            "File_name": None,
            "Status": status
        }

        row_df = spark.createDataFrame([row_data], schema=audit_schema)
        audit_dataframes.append(row_df)

# Union by column name; columns absent from a given DataFrame (e.g. Batch_id,
# or File_name/Status for tables without a mapping) are filled with nulls.
df_final_audit = audit_dataframes[0]
for next_audit_df in audit_dataframes[1:]:
    df_final_audit = df_final_audit.unionByName(next_audit_df, allowMissingColumns=True)



In [0]:
# audit_params = [
#         {
#         "unique_identifier_cols": ["CaseNo", "Forenames", "Name"],
#         "table_name": "bronze_ac_ca_ant_fl_dt_hc",
#         "stage_name": "bronze_stage",
#         "description": "The bronze_ac_ca_ant_fl_dt_hc table Delta Live Table combining Appeal Case data with Case Appellant, Appellant, File Location, Department, and Hearing Centre."
#     },
#     {
#         "unique_identifier_cols": ["CaseNo", "Forenames", "Name"],
#         "table_name": "bronze_iris_extract",
#         "stage_name": "bronze_stage",
#         "description": "Delta Live Table extracted from the IRIS Tribunal decision file extract."
#     },
#     {
#         "unique_identifier_cols": ["CaseNo"],
#         "table_name": "stg_td_filtered",
#         "stage_name": "silver_stage",
#         "description": "The stg_td_filtered - segmentation Table for appeal cases requiring tribunal decisions with unique list of CaseNo's"
#     },
#     {
#         "unique_identifier_cols": ["CaseNo", "Forenames", "Name"],
#         "table_name": "silver_tribunaldecision_detail",
#         "stage_name": "silver_stage",
#         "description": "The silver_tribunaldecision_detail - or Tribunal Decision information"
#     },
#     {
#         "unique_identifier_cols": ["client_identifier", "bf_002", "bf_003"],
#         "table_name": "silver_archive_metadata",
#         "stage_name": "silver_stage",
#         "description": "The silver_archive_metadata table consolidates keys metadata for Archive Metadata da"
#     },
#     {
#         "unique_identifier_cols": ["CaseNo", "Forenames", "Name"],
#         "table_name": "stg_create_td_iris_json_content",
#         "stage_name": "silver_stage",
#         "description": "The stg_create_td_iris_json_content table generates JSON content for TD cases",
#         "Extra_columns_mapping": {"File_name": "JSONFileName", "Status": "JSONStatus"}
#     },
#     {
#         "unique_identifier_cols": ["CaseNo", "Forenames", "Name"],
#         "table_name": "stg_create_td_iris_html_content",
#         "stage_name": "silver_stage",
#         "description": "The stg_create_td_iris_html_content table generates HTML content for TD cases",
#         "Extra_columns_mapping": {"File_name": "HTMLFileName", "Status": "HTMLStatus"}
#     },
#     {
#         "unique_identifier_cols": ["client_identifier", "bf_002", "bf_003"],
#         "table_name": "stg_create_td_iris_a360_content",
#         "stage_name": "silver_stage",
#         "description": "The stg_create_td_iris_a360_content table generates A360 content for TD cases",
#         "Extra_columns_mapping": {"File_name": "NotYetBatched", "Status": "A360Status"}
#     },
#     {
#         "unique_identifier_cols": ["CaseNo", "Forenames", "Name"],
#         "table_name": "stg_td_iris_unified",
#         "stage_name": "silver_stage",
#         "description": "The stg_td_iris_unified table generates A360 BatchId for TD cases",
#         "Extra_columns_mapping": {"File_name": "A360FileName", "Status": "A360Status"}
#     },
#     {
#         "unique_identifier_cols": ["CaseNo", "Forenames", "Name"],
#         "table_name": "gold_td_iris_with_html",
#         "stage_name": "silver_stage",
#         "description": "The gold_td_iris_with_html with HTML Outputs Uploded..",
#         "Extra_columns_mapping": {"File_name": "HTMLFileName", "Status": "UploadStatus"}
#     },
#     {
#         "unique_identifier_cols": ["CaseNo", "Forenames", "Name"],
#         "table_name": "gold_td_iris_with_json",
#         "stage_name": "silver_stage",
#         "description": "The gold_td_iris_with_json with HTML Outputs Uploded..",
#         "Extra_columns_mapping": {"File_name": "JSONFileName", "Status": "UploadStatus"}
#     },
#     {
#         "unique_identifier_cols": ["A360BatchId"],
#         "table_name": "gold_td_iris_with_a360",
#         "stage_name": "silver_stage",
#         "description": "The gold_td_iris_with_a360 with HTML Outputs Uploded..",
#         "Extra_columns_mapping": {"File_name": "A360FileName", "Status": "UploadStatus"}
#     },
#     {
#         "unique_identifier_cols": ["A360BatchId"],
#         "table_name": "temp",
#         "stage_name": "silver_stage",
#         "description": "The gold_td_iris_with_a360 with HTML Outputs Uploded..",
#         "Extra_columns_mapping": {"File_name": "A360FileName", "Status": "UploadStatus"}
#     }
# ]

# audit_dataframes = []

# for params in audit_params:
#     table_name = params["table_name"]
#     stage_name = params["stage_name"]
#     unique_identifier_cols = params["unique_identifier_cols"]
#     description = params["description"]
#     extra_columns_mapping = params.get("Extra_columns_mapping", {})
#     unique_identifier_desc = "_".join(unique_identifier_cols)

#     try:

#         df_logging = spark.read.table(f"hive_metastore.ariadm_arm_td.{table_name}")

#         df_audit = df_logging
#         if len(unique_identifier_cols) > 1:
#             df_audit = df_audit.withColumn(
#                 unique_identifier_desc, 
#                 concat_ws("_", *[col(c).cast("string") for c in unique_identifier_cols])
#             )
#         else:
#             df_audit = df_audit.withColumn(unique_identifier_desc, col(unique_identifier_desc))

#         # Apply extra column mappings dynamically
#         for new_col, source_col in extra_columns_mapping.items():
#             if source_col == "NotYetBatched":
#                 df_audit = df_audit.withColumn(new_col, lit("NotYetBatched"))
#             else:
#                 df_audit = df_audit.withColumn(new_col, col(source_col))

#         # Generate the audit DataFrame
#         df_audit_appended = create_audit_df(
#             df_audit,
#             unique_identifier_desc=unique_identifier_desc,
#             table_name=table_name,
#             stage_name=stage_name,
#             description=description
#         )

#         audit_dataframes.append(df_audit_appended)

#     except Exception as e:

#         # Table does not exist, create an audit entry for it
#         status = f"Failed - Table {table_name} does not exist"

#         row_data = {
#             "Runid": run_id_value,
#             "Unique_identifier_desc": unique_identifier_desc,
#             "Unique_identifier": None,
#             "Table_name": table_name,
#             "Stage_name": stage_name,
#             "Record_count": 0,
#             "Run_dt": datetime.now(),
#             "Batch_id": None,
#             "Description": description,
#             "File_name": None,
#             "Status": status
#         }

#         row_df = spark.createDataFrame([row_data], schema=audit_schema)
#         audit_dataframes.append(row_df)

# df_final_audit = audit_dataframes[0]
# for df in audit_dataframes[1:]:
#     df_final_audit = df_final_audit.unionByName(df, allowMissingColumns=True)



In [0]:
# Show the combined audit DataFrame in the notebook output.
display(df_final_audit)

In [0]:
# Append this run's audit rows to the Delta table at audit_delta_path, with
# the mergeSchema write option enabled.
(
    df_final_audit.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .save(audit_delta_path)
)