## TD Audit Configuration

In [0]:
import dlt
import json
from pyspark.sql.functions import *
from pyspark.sql.types import *
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pyspark.sql.window import Window
import uuid
from delta.tables import DeltaTable

In [0]:
# Load the environment settings. first() runs a Spark job, so the row is
# fetched a single time and both fields are read from that one Row.
config = spark.read.option("multiline", "true").json("dbfs:/configs/config.json")
config_row = config.first()
if config_row is None:
    # first() returns None for an empty DataFrame; fail with a clear message
    # instead of a "'NoneType' object is not subscriptable" TypeError.
    raise ValueError("dbfs:/configs/config.json contains no rows")

env_name = config_row["env"].strip().lower()
lz_key = config_row["lz_key"].strip().lower()

print(f"env_code: {lz_key}")  # This won't be redacted
print(f"env_name: {env_name}")  # This won't be redacted

# Name used as the secret scope when credentials are read with dbutils.secrets.get
KeyVault_name = f"ingest{lz_key}-meta002-{env_name}"
print(f"KeyVault_name: {KeyVault_name}")

In [0]:
# Service principal credentials, read from the secret scope named by KeyVault_name
client_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-ID")
client_secret = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-SECRET")
tenant_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-TENANT-ID")

# Curated storage account (hosts the audit Delta table)
curated_storage = f"ingest{lz_key}curated{env_name}"

# OAuth client-credentials settings for the curated account. Every key is
# suffixed with the account's DFS host name; dict order keeps the original
# order of the spark.conf.set calls.
_curated_dfs_host = f"{curated_storage}.dfs.core.windows.net"
_curated_oauth_conf = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": client_id,
    "fs.azure.account.oauth2.client.secret": client_secret,
    "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}
for _conf_key, _conf_value in _curated_oauth_conf.items():
    spark.conf.set(f"{_conf_key}.{_curated_dfs_host}", _conf_value)


In [0]:
# Metastore schema whose tables are read (as hive_metastore.<schema>.<table>) for auditing
hive_schema = "ariadm_arm_td"

# ABFSS location of the TD audit Delta table in the "silver" container of the curated account
audit_delta_path = (
    f"abfss://silver@ingest{lz_key}curated{env_name}.dfs.core.windows.net"
    "/ARIADM/ARM/AUDIT/TD/td_cr_audit_table"
)

In [0]:


def datetime_uuid():
    """Return a UUID string derived from the current UTC time.

    The current UTC time is formatted to whole seconds
    (``"%Y-%m-%d %H:%M:%S"``) and hashed into a name-based (version 5) UUID
    under ``uuid.NAMESPACE_DNS``.

    :return: The UUID in its canonical string form.

    NOTE(review): the input only has one-second resolution, so two calls made
    within the same second return the same value. Confirm that is acceptable
    for a run identifier.
    """
    # Local import: the file-level import only brings in the `datetime` class.
    from datetime import timezone

    # Timezone-aware replacement for datetime.utcnow() (deprecated since
    # Python 3.12). The formatted string, and therefore the UUID, is unchanged.
    dt_str = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, dt_str))


# Run identifier written to the Run_Id column of every audit row built in this notebook.
run_id_value = datetime_uuid()

# Columns of the audit Delta table, in order, as (name, Spark type) pairs.
_AUDIT_FIELDS = [
    ("Run_Id", StringType()),
    ("Unique_Identifier_Desc", StringType()),
    ("Unique_Identifier", StringType()),
    ("Table_Name", StringType()),
    ("Stage_Name", StringType()),
    ("Record_Count", IntegerType()),
    ("Run_DateTime", TimestampType()),
    ("Batch_Id", StringType()),
    ("Description", StringType()),
    ("File_name", StringType()),
    ("Status", StringType()),
]

# Every field is declared nullable.
audit_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _AUDIT_FIELDS]
)

def create_audit_df(df: DataFrame, Unique_Identifier_Desc: str, Table_Name: str, Stage_Name: str, Description: str, file_name=False, status=False) -> DataFrame:
    """
    Build the audit DataFrame for one table and return it.

    The function does not write anything; the caller is responsible for
    persisting the returned DataFrame.

    The result has one row per distinct combination of the selected columns,
    with ``Record_Count`` holding how many input rows share that combination.
    ``Run_Id`` is taken from the module-level ``run_id_value``.

    :param df: Input DataFrame. Must contain a ``Batch_Id`` column, the column
        named by ``Unique_Identifier_Desc``, and ``File_name`` / ``Status``
        when the matching flag is set.
    :param Unique_Identifier_Desc: Name of the column in ``df`` that holds the
        unique identifier; it is emitted as ``Unique_Identifier`` and its name
        is stored in the ``Unique_Identifier_Desc`` column.
    :param Table_Name: Name of the source table.
    :param Stage_Name: Name of the data processing stage.
    :param Description: Description of the table.
    :param file_name: Pass exactly ``True`` to carry ``File_name`` from ``df``.
    :param status: Pass exactly ``True`` to carry ``Status`` from ``df``.
    :return: The aggregated audit DataFrame.
    """
    # Local import: the file-level import only brings in the `datetime` class.
    from datetime import timezone

    # Naive UTC timestamp: the same value datetime.utcnow() returned, without
    # calling the API deprecated since Python 3.12.
    run_datetime = datetime.now(timezone.utc).replace(tzinfo=None)

    selected_columns = [col(Unique_Identifier_Desc).alias("Unique_Identifier"), col("Batch_Id")]
    # Identity checks are kept so only the literal True enables a column.
    if file_name is True:
        selected_columns.append(col("File_name"))
    if status is True:
        selected_columns.append(col("Status"))

    audit_df = (
        df.select(*selected_columns)
        .withColumn("Run_Id", lit(run_id_value))
        .withColumn("Unique_Identifier_Desc", lit(Unique_Identifier_Desc))
        .withColumn("Stage_Name", lit(Stage_Name))
        .withColumn("Table_Name", lit(Table_Name))
        .withColumn("Run_DateTime", lit(run_datetime).cast(TimestampType()))
        .withColumn("Description", lit(Description))
    )

    # Group on every column so duplicate rows collapse into one row plus a count.
    return audit_df.groupBy(*audit_df.columns).agg(
        count("*").cast(IntegerType()).alias("Record_Count")
    )

In [0]:
# Make sure a Delta table exists at audit_delta_path; create an empty one if not.
if DeltaTable.isDeltaTable(spark, audit_delta_path):
    print(f"âš¡ Delta table '{audit_delta_path}' already exists.")
else:
    print(f"ðŸ›‘ Delta table '{audit_delta_path}' does not exist. Creating an empty Delta table...")

    # Writing a zero-row DataFrame that carries audit_schema creates the table.
    spark.createDataFrame([], audit_schema).write.format("delta").mode("overwrite").save(audit_delta_path)

    print("âœ… Empty Delta table successfully created in Azure Storage.")

In [0]:
# Tables to audit, one dict per table:
#   Unique_Identifier_cols - column(s) that identify a record; several columns
#                            are joined with "_" into a single identifier.
#   Table_Name             - table read from hive_metastore.<hive_schema>.
#   Stage_Name             - stage label stored with the audit rows.
#   Description            - free text stored with the audit rows.
#   extra_columns          - optional; which of File_name / Status are listed
#                            for the table. Columns not listed are added as
#                            NULL before the audit rows are built (presumably
#                            the listed ones already exist in the table).
audit_params = [
        {
        "Unique_Identifier_cols": ["CaseNo", "Forenames", "Name"],
        "Table_Name": "bronze_ac_ca_ant_fl_dt_hc",
        "Stage_Name": "bronze_stage",
        "Description": "The bronze_ac_ca_ant_fl_dt_hc table Delta Live Table combining Appeal Case data with Case Appellant, Appellant, File Location, Department, and Hearing Centre."
    },
    {
        "Unique_Identifier_cols": ["CaseNo", "Forenames", "Name"],
        "Table_Name": "bronze_iris_extract",
        "Stage_Name": "bronze_stage",
        "Description": "Delta Live Table extracted from the IRIS Tribunal decision file extract."
    },
    {
        "Unique_Identifier_cols": ["CaseNo"],
        "Table_Name": "stg_td_filtered",
        "Stage_Name": "segmentation_stage",
        "Description": "The stg_td_filtered - segmentation Table for appeal cases requiring tribunal decisions with unique list of CaseNo's"
    },
    {
        "Unique_Identifier_cols": ["CaseNo", "Forenames", "Name"],
        "Table_Name": "silver_tribunaldecision_detail",
        "Stage_Name": "silver_stage",
        "Description": "The silver_tribunaldecision_detail - or Tribunal Decision information"
    },
    # NOTE(review): the Description below ends mid-word ("Metadata da") - looks truncated; confirm.
    {
        "Unique_Identifier_cols": ["client_identifier", "bf_002", "bf_003"],
        "Table_Name": "silver_archive_metadata",
        "Stage_Name": "silver_stage",
        "Description": "The silver_archive_metadata table consolidates keys metadata for Archive Metadata da"
    },
    {
        "Unique_Identifier_cols": ["CaseNo", "Forenames", "Name"],
        "Table_Name": "stg_create_td_iris_json_content",
        "Stage_Name": "silver_stage",
        "Description": "The stg_create_td_iris_json_content table generates JSON content for TD cases",
        "extra_columns": ["File_name", "Status"]
    },
    {
        "Unique_Identifier_cols": ["CaseNo", "Forenames", "Name"],
        "Table_Name": "stg_create_td_iris_html_content",
        "Stage_Name": "silver_stage",
        "Description": "The stg_create_td_iris_html_content table generates HTML content for TD cases",
        "extra_columns": ["File_name", "Status"]
    },
    {
        "Unique_Identifier_cols": ["client_identifier", "bf_002", "bf_003"],
        "Table_Name": "stg_create_td_iris_a360_content",
        "Stage_Name": "silver_stage",
        "Description": "The stg_create_td_iris_a360_content table generates A360 content for TD cases",
        "extra_columns": [ "Status"]
    },
    {
        "Unique_Identifier_cols": ["CaseNo", "Forenames", "Name"],
        "Table_Name": "stg_td_iris_unified",
        "Stage_Name": "silver_stage",
        "Description": "The stg_td_iris_unified table generates A360 BatchId for TD cases",
        "extra_columns": ["File_name", "Status"]
    },
    # NOTE(review): the three gold_* entries below are labelled "silver_stage",
    # and the json / a360 descriptions also say "HTML Outputs" - this looks
    # copy-pasted. These strings are stored in the audit table, so confirm the
    # intended values before changing them.
    {
        "Unique_Identifier_cols": ["CaseNo", "Forenames", "Name"],
        "Table_Name": "gold_td_iris_with_html",
        "Stage_Name": "silver_stage",
        "Description": "The gold_td_iris_with_html with HTML Outputs Uploded..",
        "extra_columns": ["File_name", "Status"]
    },
    {
        "Unique_Identifier_cols": ["CaseNo", "Forenames", "Name"],
        "Table_Name": "gold_td_iris_with_json",
        "Stage_Name": "silver_stage",
        "Description": "The gold_td_iris_with_json with HTML Outputs Uploded..",
        "extra_columns": ["File_name", "Status"]
    },
    {
        "Unique_Identifier_cols": ["A360_BatchId"],
        "Table_Name": "gold_td_iris_with_a360",
        "Stage_Name": "silver_stage",
        "Description": "The gold_td_iris_with_a360 with HTML Outputs Uploded..",
        "extra_columns": ["File_name", "Status"]
    }
]

# Build one audit DataFrame per entry in audit_params and union them into
# df_final_audit. A table that cannot be processed contributes a single
# "Failed" row instead of stopping the run.

# The file-level import only brings in the `datetime` class; timezone is
# needed for the UTC timestamp on failure rows.
from datetime import timezone

audit_dataframes = []

for params in audit_params:
    Table_Name = params["Table_Name"]
    Stage_Name = params["Stage_Name"]
    Unique_Identifier_cols = params["Unique_Identifier_cols"]
    Description = params["Description"]
    extra_columns = params.get("extra_columns", [])
    Unique_Identifier_Desc = "_".join(Unique_Identifier_cols)

    try:
        df_audit = spark.read.table(f"hive_metastore.{hive_schema}.{Table_Name}")

        # Collapse the identifier column(s) into one string column named
        # after the joined column names.
        if len(Unique_Identifier_cols) > 1:
            df_audit = df_audit.withColumn(
                Unique_Identifier_Desc,
                concat_ws("_", *[col(c).cast("string") for c in Unique_Identifier_cols])
            )
        else:
            df_audit = df_audit.withColumn(Unique_Identifier_Desc, col(Unique_Identifier_Desc).cast("string"))

        # Add Batch_Id from A360_BatchId if present, else set to NULL
        if "A360_BatchId" in df_audit.columns:
            df_audit = df_audit.withColumn("Batch_Id", col("A360_BatchId").cast("string"))
        else:
            df_audit = df_audit.withColumn("Batch_Id", lit(None).cast("string"))

        # File_name / Status not listed in extra_columns are filled with NULL.
        # The cast gives the placeholder a string type (as Batch_Id above)
        # rather than the untyped NullType a bare lit(None) produces.
        for optional_col in ("File_name", "Status"):
            if optional_col not in extra_columns:
                df_audit = df_audit.withColumn(optional_col, lit(None).cast("string"))

        # Generate the audit DataFrame
        df_audit_appended = create_audit_df(
            df_audit,
            Unique_Identifier_Desc=Unique_Identifier_Desc,
            Table_Name=Table_Name,
            Stage_Name=Stage_Name,
            Description=Description,
            file_name=True,
            status=True
        )

        audit_dataframes.append(df_audit_appended)

        print(f"âœ… Successfully processed table: {Table_Name}")

    except Exception as e:
        # Best effort: report the failure and keep auditing the other tables.
        print(f"ðŸ›‘ Failed to process table: {Table_Name}. Error: {str(e)}")

        # NOTE(review): every failure is recorded as "does not exist", even
        # when the cause is something else (e.g. a missing column). The text
        # is left unchanged because it is stored in the audit table; the real
        # error is only in the print above.
        status = f"Failed - Table {Table_Name} does not exist"

        row_data = {
            "Run_Id": run_id_value,
            "Unique_Identifier_Desc": Unique_Identifier_Desc,
            "Unique_Identifier": None,
            "Table_Name": Table_Name,
            "Stage_Name": Stage_Name,
            "Record_Count": 0,
            # Naive UTC, so failure rows use the same clock as the rows
            # produced by create_audit_df (previously local time).
            "Run_DateTime": datetime.now(timezone.utc).replace(tzinfo=None),
            "Batch_Id": None,
            "Description": Description,
            "File_name": None,
            "Status": status
        }

        row_df = spark.createDataFrame([row_data], schema=audit_schema)
        audit_dataframes.append(row_df)


# Every iteration appends exactly one DataFrame, so the list is non-empty
# whenever audit_params is. Columns are matched by name, not by position.
df_final_audit = audit_dataframes[0]
for df in audit_dataframes[1:]:
    df_final_audit = df_final_audit.unionByName(df, allowMissingColumns=True)

In [0]:
# Append this run's audit rows to the audit Delta table, with the mergeSchema option enabled.
(
    df_final_audit.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .save(audit_delta_path)
)

In [0]:
# End the notebook run here and pass this completion message as the exit value.
# NOTE(review): cells after this point are presumably not executed in a full run - confirm.
dbutils.notebook.exit("Notebook completed successfully")

## Appendix

In [0]:
# df_final_audit = spark.read.format("delta").load(audit_delta_path)
# df_final_audit.createOrReplaceTempView("tv_final_audit")

In [0]:
# %sql
# select * from tv_final_audit
# where Batch_Id is not null

In [0]:
# df_final_audit.createOrReplaceTempView("tv_final_audit")

In [0]:
# %sql
# select * from tv_final_audit
# where Table_Name like 'gold%'

In [0]:
# spark.read.format("delta").load(audit_delta_path).printSchema()

In [0]:
# spark.read.format("delta").load(audit_delta_path).filter("Batch_Id IS NOT NULL").display()