## JOH Audit Configuration 

In [0]:
import dlt
import json
from pyspark.sql.functions import *
from pyspark.sql.types import *
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pyspark.sql.window import Window
import uuid
from delta.tables import DeltaTable

In [0]:
# Known Databricks workspace hosts and the Key Vault/environment name each one maps to
workspace_mapping = {
    "adb-3635282203417052.12.azuredatabricks.net": "ingest00-meta002-sbox",
    "adb-376876256300083.3.azuredatabricks.net": "ingest01-meta002-sbox",
    "adb-1879076228317698.18.azuredatabricks.net": "ingest02-meta002-sbox",
    "adb-4305432441461530.10.azuredatabricks.net": "ingest00-meta002-stg",
    "adb-3100629970551492.12.azuredatabricks.net": "ingest00-meta002-prod"
}

# Host of the workspace this notebook is running in, taken from the Spark configuration
workspace_url = spark.conf.get("spark.databricks.workspaceUrl")

# Resolve the Key Vault/environment name; an unmapped workspace is a hard failure
if workspace_url in workspace_mapping:
    KeyVault_name = workspace_mapping[workspace_url]
else:
    raise ValueError(f"Unrecognized Databricks workspace URL: {workspace_url}")

# Show which Key Vault/environment name was resolved
print(f"Workspace URL maps to Key Vault: {KeyVault_name}")


In [0]:
# Load the environment settings from the JSON config file on DBFS
config = spark.read.option("multiline", "true").json("dbfs:/configs/config.json")

# first() is a Spark action, so fetch the row once and reuse it for both fields
config_row = config.first()
if config_row is None:
    # first() returns None for an empty DataFrame; fail with a clear message
    raise ValueError("Config file dbfs:/configs/config.json contains no rows")

env_name = config_row["env"].strip().lower()
lz_key = config_row["lz_key"].strip().lower()

print(f"env_code: {lz_key}")  # This won't be redacted
print(f"env_name: {env_name}")  # This won't be redacted

# NOTE: this reassigns KeyVault_name, replacing the value that was resolved
# from the workspace URL mapping earlier in the notebook.
KeyVault_name = f"ingest{lz_key}-meta002-{env_name}"
print(f"KeyVault_name: {KeyVault_name}")

In [0]:
# Service principal credentials, read from the secret scope named by KeyVault_name
client_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-ID")
client_secret = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-SECRET")
tenant_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-TENANT-ID")

# Storage account names
curated_storage = f"ingest{lz_key}curated{env_name}"

# Spark config for curated storage (Delta table): each setting is the config
# prefix below followed by the storage account's DFS host name.
curated_host = f"{curated_storage}.dfs.core.windows.net"
oauth_settings = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": client_id,
    "fs.azure.account.oauth2.client.secret": client_secret,
    "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}
for conf_prefix, conf_value in oauth_settings.items():
    spark.conf.set(f"{conf_prefix}.{curated_host}", conf_value)


In [0]:
# Hive schema that the audited tables are read from
hive_schema = "ariadm_arm_joh"

# Location of the audit Delta table in the "silver" container of the curated storage account
audit_storage_account = f"ingest{lz_key}curated{env_name}"
audit_delta_path = (
    f"abfss://silver@{audit_storage_account}.dfs.core.windows.net"
    "/ARIADM/ARM/AUDIT/JOH/joh_cr_audit_table"
)

In [0]:


def datetime_uuid():
    """
    Return a version-5 UUID string derived from the current UTC time.

    The UUID is computed from the timestamp formatted to whole seconds, so
    two calls within the same second produce the same value.
    """
    stamp = f"{datetime.utcnow():%Y-%m-%d %H:%M:%S}"
    derived = uuid.uuid5(uuid.NAMESPACE_DNS, stamp)
    return str(derived)

# One run id, generated once and shared by every audit row built in this notebook run
run_id_value = datetime_uuid()

# Columns of the audit table as (name, Spark type); every column is nullable
_audit_columns = [
    ("Run_Id", StringType()),
    ("Unique_Identifier_Desc", StringType()),
    ("Unique_Identifier", StringType()),
    ("Table_Name", StringType()),
    ("Stage_Name", StringType()),
    ("Record_Count", IntegerType()),
    ("Run_DateTime", TimestampType()),
    ("Batch_Id", StringType()),
    ("Description", StringType()),
    ("File_Name", StringType()),
    ("Status", StringType()),
]

audit_schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in _audit_columns]
)

def create_audit_df(df: DataFrame, Unique_Identifier_Desc: str, Table_Name: str, Stage_Name: str, description: str, File_Name=False, status=False) -> DataFrame:
    """
    Build the aggregated audit DataFrame for one source table.

    Each distinct combination of unique identifier (plus the optional
    ``File_Name`` / ``Status`` columns) becomes one audit row that carries the
    run metadata and a ``Record_Count`` of how many input rows shared it.
    The DataFrame is returned to the caller; nothing is written here.

    The run id comes from the module-level ``run_id_value``.

    :param df: Input DataFrame. It must contain a column named by
        ``Unique_Identifier_Desc`` and, when requested, ``File_Name`` and
        ``Status`` columns.
    :param Unique_Identifier_Desc: Name of the column in ``df`` holding the
        unique identifier; also stored as a literal in the audit rows.
    :param Table_Name: Name of the source table, stored as a literal.
    :param Stage_Name: Name of the data processing stage, stored as a literal.
    :param description: Description of the table, stored as a literal.
    :param File_Name: Pass ``True`` to carry the ``File_Name`` column of ``df``
        into the audit rows.
    :param status: Pass ``True`` to carry the ``Status`` column of ``df`` into
        the audit rows.
    :return: DataFrame with ``Unique_Identifier``, the requested optional
        columns, ``Run_Id``, ``Unique_Identifier_Desc``, ``Stage_Name``,
        ``Table_Name``, ``Run_DateTime``, ``Description`` and ``Record_Count``.
    """
    # Captured once so every row of this table shares the same timestamp
    run_timestamp = datetime.utcnow()

    # Only the literal True opts a column in; other truthy values are ignored
    optional_columns = []
    if File_Name is True:
        optional_columns.append(col("File_Name"))
    if status is True:
        optional_columns.append(col("Status"))

    audit_df = (
        df.select(col(Unique_Identifier_Desc).alias("Unique_Identifier"), *optional_columns)
        .withColumn("Run_Id", lit(run_id_value))
        .withColumn("Unique_Identifier_Desc", lit(Unique_Identifier_Desc))
        .withColumn("Stage_Name", lit(Stage_Name))
        .withColumn("Table_Name", lit(Table_Name))
        .withColumn("Run_DateTime", lit(run_timestamp).cast(TimestampType()))
        .withColumn("Description", lit(description))
    )

    # Group on every column so identical rows collapse into one row with a count
    final_audit_df = audit_df.groupBy(*audit_df.columns).agg(
        count("*").cast(IntegerType()).alias("Record_Count")
    )

    return final_audit_df

In [0]:
# Make sure the audit Delta table exists at audit_delta_path before anything is appended to it
if DeltaTable.isDeltaTable(spark, audit_delta_path):
    print(f"⚡ Delta table '{audit_delta_path}' already exists.")
else:
    print(f"🛑 Delta table '{audit_delta_path}' does not exist. Creating an empty Delta table...")

    # Writing a zero-row DataFrame with the audit schema creates the table
    (
        spark.createDataFrame([], audit_schema)
        .write.format("delta")
        .mode("overwrite")
        .save(audit_delta_path)
    )

    print("✅ Empty Delta table successfully created in Azure Storage.")

In [0]:
# One entry per table to audit:
#   Unique_Identifier_cols - column(s) that identify a record in the table
#   Table_Name             - table name inside the hive schema
#   Stage_Name             - pipeline stage label stored with the audit rows
#   description            - free-text description stored with the audit rows
#   extra_columns          - optional; which of File_Name / Status the table supplies
audit_params = [
    {
        "Unique_Identifier_cols": ["AdjudicatorId"],
        "Table_Name": "bronze_adjudicator_et_hc_dnur",
        "Stage_Name": "bronze_stage",
        "description": "Combines adjudicator data with hearing centre, employment terms, and do not use reason details. Provides a standardized view of adjudicators, their designated centres, employment terms, judicial status, and restrictions. Includes metadata such as source files, modification timestamps, and process tracking for auditing."
    },
    {
        "Unique_Identifier_cols": ["AdjudicatorId"],
        "Table_Name": "bronze_johistory_users",
        "Stage_Name": "bronze_stage",
        "description": "Combines JoHistory records with user details, providing historical adjudicator activity along with corresponding user information. Includes comments, user names, audit timestamps, source filenames, and process metadata for tracking and auditing."
    },
    {
        "Unique_Identifier_cols": ["AdjudicatorId"],
        "Table_Name": "bronze_othercentre_hearingcentre",
        "Stage_Name": "bronze_stage",
        "description": "Combines OtherCentre records with HearingCentre details, linking adjudicators to their assigned hearing centres. Includes metadata such as timestamps, source files, and process tracking for auditing and traceability."
    },
    {
        "Unique_Identifier_cols": ["AdjudicatorId"],
        "Table_Name": "bronze_adjudicator_role",
        "Stage_Name": "bronze_stage",
        # NOTE(review): this description is identical to the one for
        # stg_joh_filtered below - looks like a copy/paste; confirm the
        # intended text for bronze_adjudicator_role.
        "description": "Filters and segments adjudicators, retaining only those who are not assigned roles 7 or 8, or have no assigned role. Uses data from bronze_adjudicator_et_hc_dnur and bronze_adjudicator_role to generate a complete list of adjudicators."
    },
    {
        "Unique_Identifier_cols": ["AdjudicatorId"],
        "Table_Name": "stg_joh_filtered",
        "Stage_Name": "segmentation_stage",
        "description": "Filters and segments adjudicators, retaining only those who are not assigned roles 7 or 8, or have no assigned role. Uses data from bronze_adjudicator_et_hc_dnur and bronze_adjudicator_role to generate a complete list of adjudicators."
    },
    {
        "Unique_Identifier_cols": ["AdjudicatorId"],
        "Table_Name": "silver_adjudicator_detail",
        "Stage_Name": "silver_stage",
        "description": "Filters adjudicators based on segmentation criteria and enriches their records with Hearing Centre and Do Not Use Reason (DNUR) details. Standardizes key attributes such as correspondence address, contact details, employment terms, and judicial status."
    },
    {
        "Unique_Identifier_cols": ["AdjudicatorId"],
        "Table_Name": "silver_history_detail",
        "Stage_Name": "silver_stage",
        "description": "Filters and enhances historical adjudicator activity records by incorporating user details. Maps history types to their corresponding descriptions, providing a structured view of events such as allocations, case updates, and administrative actions."
    },
    {
        "Unique_Identifier_cols": ["AdjudicatorId"],
        "Table_Name": "silver_othercentre_detail",
        "Stage_Name": "silver_stage",
        "description": "Filters and enhances OtherCentre records by applying adjudicator segmentation criteria. Retains adjudicators linked to Hearing Centres while ensuring completeness through process metadata, timestamps, and source tracking."
    },
    {
        "Unique_Identifier_cols": ["AdjudicatorId"],
        "Table_Name": "silver_appointment_detail",
        "Stage_Name": "silver_stage",
        "description": "Filters and enhances adjudicator role records by mapping role codes to their descriptions. Provides a structured view of adjudicator appointments, including start and end dates, and metadata for auditing."
    },
    {
        "Unique_Identifier_cols": ["client_identifier"],
        "Table_Name": "silver_archive_metadata",
        "Stage_Name": "silver_stage",
        "description": "Metadata table of adjudicator records by combining various metadata fields. Provides a structured view of adjudicator details, including event dates, region, and other relevant information for archival purposes."
    },
    {
        "Unique_Identifier_cols": ["AdjudicatorId"],
        "Table_Name": "stg_judicial_officer_combined",
        "Stage_Name": "staging_stage",
        # NOTE(review): this description is identical to the one for
        # silver_archive_metadata above - looks like a copy/paste; confirm the
        # intended text for stg_judicial_officer_combined.
        "description": "Metadata table of adjudicator records by combining various metadata fields. Provides a structured view of adjudicator details, including event dates, region, and other relevant information for archival purposes."
    },
    {
        "Unique_Identifier_cols": ["AdjudicatorId"],
        "Table_Name": "stg_create_joh_json_content",
        "Stage_Name": "staging_stage",
        "description": "Generates JSON-formatted adjudicator records for gold-level outputs. Creates structured JSON content for each adjudicator and assigns a filename. Tracks JSON creation status to identify failures and successful transformations",
        "extra_columns": ["File_Name", "Status"]
    },
    {
        "Unique_Identifier_cols": ["AdjudicatorId"],
        "Table_Name": "stg_create_joh_html_content",
        "Stage_Name": "staging_stage",
        # The final sentence was cut off mid-word; completed to mirror the JSON entry above.
        "description": "Generates HTML-formatted adjudicator records for gold-level outputs. Uses a UDF to transform data into structured HTML content and assigns a filename. Tracks HTML creation status to identify failures and successful transformations",
        "extra_columns": ["File_Name", "Status"]
    },
    {
        "Unique_Identifier_cols": ["client_identifier"],
        "Table_Name": "stg_create_joh_a360_content",
        "Stage_Name": "staging_stage",
        "description": "Generates A360-formatted adjudicator records for gold-level outputs. Uses a UDF to transform metadata into A360 content and assigns processing statuses. Supports Hive-based retrieval for non-initial loads, ensuring comprehensive archival integration.",
        "extra_columns": ["Status"]
    },
    {
        "Unique_Identifier_cols": ["AdjudicatorId"],
        "Table_Name": "gold_judicial_officer_with_json",
        "Stage_Name": "Gold_stage",
        # The JSON and HTML gold descriptions had their content type swapped in the first sentence.
        "description": "Final gold-level table integrating adjudicator records with validated JSON content. Ensures data integrity by enforcing error-free JSON content. Optimizes processing through repartitioning and triggers upload operations for structured archival and distribution.",
        "extra_columns": ["File_Name", "Status"]
    },
    {
        "Unique_Identifier_cols": ["AdjudicatorId"],
        "Table_Name": "gold_judicial_officer_with_html",
        "Stage_Name": "Gold_stage",
        "description": "Final gold-level table integrating adjudicator records with validated HTML content. Ensures data integrity by enforcing error-free HTML content. Optimizes processing through repartitioning and triggers upload operations for structured archival and distribution.",
        "extra_columns": ["File_Name", "Status"]
    },
    {
        "Unique_Identifier_cols": ["A360_BatchId"],
        "Table_Name": "gold_judicial_officer_with_a360",
        "Stage_Name": "Gold_stage",
        "description": "Final gold-level table consolidating adjudicator A360 content for structured archival and processing. Ensures data integrity by filtering out records with errors in A360 content. Aggregates and batches records, optimizes processing through repartitioning, and triggers upload operations.",
        "extra_columns": ["File_Name", "Status"]
    }
]

# Build one audit DataFrame per configured table. A table that cannot be
# processed contributes a single "Failed" row instead, so every table in
# audit_params is represented in the final audit output.
audit_dataframes = []

for params in audit_params:
    Table_Name = params["Table_Name"]
    Stage_Name = params["Stage_Name"]
    Unique_Identifier_cols = params["Unique_Identifier_cols"]
    description = params["description"]
    extra_columns = params.get("extra_columns", [])
    # Name of the identifier column; several key columns are joined with "_"
    Unique_Identifier_Desc = "_".join(Unique_Identifier_cols)

    try:
        df_logging = spark.read.table(f"hive_metastore.{hive_schema}.{Table_Name}")

        # Produce a single string identifier column named Unique_Identifier_Desc
        if len(Unique_Identifier_cols) > 1:
            df_audit = df_logging.withColumn(
                Unique_Identifier_Desc,
                concat_ws("_", *[col(c).cast("string") for c in Unique_Identifier_cols])
            )
        else:
            df_audit = df_logging.withColumn(Unique_Identifier_Desc, col(Unique_Identifier_Desc).cast("string"))

        # File_Name / Status not declared for this table become typed null
        # placeholders so create_audit_df can always select both columns.
        for optional_col in ("File_Name", "Status"):
            if optional_col not in extra_columns:
                df_audit = df_audit.withColumn(optional_col, lit(None).cast("string"))

        # Generate the audit DataFrame
        df_audit_appended = create_audit_df(
            df_audit,
            Unique_Identifier_Desc=Unique_Identifier_Desc,
            Table_Name=Table_Name,
            Stage_Name=Stage_Name,
            description=description,
            File_Name=True,
            status=True
        )

        audit_dataframes.append(df_audit_appended)

        print(f"✅ Successfully processed table: {Table_Name}")

    except Exception as e:
        print(f"🛑 Failed to process table: {Table_Name}. Error: {str(e)}")

        # NOTE(review): every failure in the try block is recorded with this
        # "does not exist" status, whatever the actual error was; the real
        # error is only visible in the printed message above.
        status = f"Failed - Table {Table_Name} does not exist"

        row_data = {
            "Run_Id": run_id_value,
            "Unique_Identifier_Desc": Unique_Identifier_Desc,
            "Unique_Identifier": None,
            "Table_Name": Table_Name,
            "Stage_Name": Stage_Name,
            "Record_Count": 0,
            # UTC, to match the Run_DateTime that create_audit_df stamps on successful rows
            "Run_DateTime": datetime.utcnow(),
            "Batch_Id": None,
            "Description": description,
            "File_Name": None,
            "Status": status
        }

        row_df = spark.createDataFrame([row_data], schema=audit_schema)
        audit_dataframes.append(row_df)


# Stack all per-table results; allowMissingColumns fills columns that only
# some of the DataFrames have (e.g. Batch_Id) with nulls.
df_final_audit = audit_dataframes[0]
for audit_part in audit_dataframes[1:]:
    df_final_audit = df_final_audit.unionByName(audit_part, allowMissingColumns=True)



In [0]:
# Append this run's audit rows to the audit Delta table, with schema merging enabled
(
    df_final_audit.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .save(audit_delta_path)
)

In [0]:
# End the notebook run, passing this message as the notebook's exit value.
# NOTE(review): the message is the same even when some tables were only
# recorded as failed audit rows - confirm that is intended.
dbutils.notebook.exit("Notebook completed successfully")

## Appendix

In [0]:

# df_final_audit.createOrReplaceTempView("tv_final_audit")

In [0]:
# df_final_audit = spark.read.format("delta").load(audit_delta_path)
# df_final_audit.createOrReplaceTempView("tv_final_audit")

In [0]:

# %sql
# select *
# --  Status, count(*) 
#  from tv_final_audit
# where Table_Name like 'gold_judicial_officer_with_json%'
# -- group by all