# ARM Acknowledgment 


**Autoloader set up**  
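This notebook sets up an Autoloader job that runs on a manual trigger to collect acknowledgement (ack) messages from the ack Event Hub. The messages are written to a Delta audit table, and the notebook then reports how many of the expected HTML, JSON and A360 files have been acknowledged.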


In [0]:
import json
import time
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, LongType
from pyspark.sql.functions import col,from_json
import json
import logging

In [0]:
# Timestamped "<time> - <logger> - <level> - <message>" output on a stream handler.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)

# Notebook-wide logger emitting INFO and above.
logger = logging.getLogger("DatabricksWorkflow")
logger.setLevel(logging.INFO)

# hasHandlers() also looks at ancestor loggers, so the handler is attached only
# when nothing would otherwise emit the records (avoids duplicates on re-run).
if logger.hasHandlers() is False:
    logger.addHandler(handler)

In [0]:
# Schema applied to the JSON payload of each ack message; every field is nullable.
ack_schema = (
    StructType()
    .add("filename", StringType(), True)
    .add("http_response", IntegerType(), True)
    .add("timestamp", TimestampType(), True)
    .add("http_message", StringType(), True)
)

## Set up configs

In [0]:
# Load the configuration JSON from DBFS.
config_path = "dbfs:/configs/config.json"
try:
    config = spark.read.option("multiline", "true").json(config_path)
    logger.info(f"Successfully read config file from {config_path}")
except Exception as e:
    logger.error(f"Could not read config file at {config_path}: {e}", exc_info=True)
    # Chain the original exception so the underlying Spark error is preserved.
    raise FileNotFoundError(f"Could not read config file at {config_path}: {e}") from e

# Extract environment and lz_key (whitespace-stripped and lower-cased).
try:
    first_row = config.first()
    if first_row is None:
        # first() returns None for an empty DataFrame; fail with an explicit
        # message instead of "'NoneType' object is not subscriptable".
        raise KeyError("config file contains no rows")
    env = first_row["env"].strip().lower()
    lz_key = first_row["lz_key"].strip().lower()
    logger.info(f"Extracted configs: env={env}, lz_key={lz_key}")
except Exception as e:
    logger.error(f"Missing expected keys 'env' or 'lz_key' in config file: {e}", exc_info=True)
    raise KeyError(f"Missing expected keys 'env' or 'lz_key' in config file: {e}") from e

# Construct the Key Vault (secret scope) name. Both inputs are already plain
# strings at this point, so the interpolation cannot fail and needs no guard.
keyvault_name = f"ingest{lz_key}-meta002-{env}"
logger.info(f"Constructed keyvault name: {keyvault_name}")


In [0]:
def _get_service_principal_secret(secret_key):
    """Fetch one secret from the ``keyvault_name`` secret scope.

    Args:
        secret_key: Name of the secret to read.

    Returns:
        The secret value returned by ``dbutils.secrets.get``.

    Raises:
        KeyError: If the secret cannot be retrieved; the original exception is
            attached as ``__cause__``.
    """
    try:
        secret_value = dbutils.secrets.get(scope=keyvault_name, key=secret_key)
    except Exception as e:
        logger.error(f"Could not retrieve '{secret_key}' from Key Vault '{keyvault_name}': {e}", exc_info=True)
        raise KeyError(f"Could not retrieve '{secret_key}' from Key Vault '{keyvault_name}': {e}") from e
    logger.info(f"Successfully retrieved {secret_key} from Key Vault")
    return secret_value


# Access the Service Principal secrets from Key Vault.
# NOTE: "PRINCIPLE" is the spelling of the stored secret names; do not "correct" it here.
client_secret = _get_service_principal_secret('SERVICE-PRINCIPLE-CLIENT-SECRET')
tenant_id = _get_service_principal_secret('SERVICE-PRINCIPLE-TENANT-ID')
client_id = _get_service_principal_secret('SERVICE-PRINCIPLE-CLIENT-ID')

logger.info("✅ Successfully retrieved all Service Principal secrets from Key Vault")


In [0]:
# Event Hubs namespace (used as the Kafka bootstrap host) and the ack event hub
# name (used as the Kafka topic), both derived from lz_key and env.
EH_NAMESPACE = f"ingest{lz_key}-integration-eventHubNamespace001-{env}"
EH_NAME = f"evh-bl-ack-{lz_key}-uks-dlrm-01"

In [0]:
# Retrieve the Event Hubs connection string, with the same logging and error
# style as the other secret lookups in this notebook.
# NOTE(review): the secret is named after the namespace root policy - consider a
# listen-only policy for this consumer; confirm with the platform team.
try:
    connection_string = dbutils.secrets.get(keyvault_name, "RootManageSharedAccessKey")
    logger.info("Successfully retrieved RootManageSharedAccessKey from Key Vault")
except Exception as e:
    logger.error(f"Could not retrieve 'RootManageSharedAccessKey' from Key Vault '{keyvault_name}': {e}", exc_info=True)
    raise KeyError(f"Could not retrieve 'RootManageSharedAccessKey' from Key Vault '{keyvault_name}': {e}") from e

# Options for Spark's Kafka source against the Event Hubs Kafka endpoint (port 9093).
# Authentication is SASL PLAIN over TLS, with the literal user "$ConnectionString"
# and the connection string as the password.
KAFKA_OPTIONS = {
    "kafka.bootstrap.servers": f"{EH_NAMESPACE}.servicebus.windows.net:9093",
    "subscribe": EH_NAME,
    # Only applies when no checkpoint exists; afterwards the checkpoint decides.
    "startingOffsets": "earliest",
    "kafka.security.protocol": "SASL_SSL",
    "failOnDataLoss": "false",
    "kafka.sasl.mechanism": "PLAIN",
    "kafka.sasl.jaas.config": f'kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username="$ConnectionString" password="{connection_string}";'
}

In [0]:
# --- Parameterise containers ---
curated_storage_account = f"ingest{lz_key}curated{env}"
curated_container = "gold"
silver_curated_container = "silver"
checkpoint_storage_account = f"ingest{lz_key}xcutting{env}"

# --- Assign OAuth to storage accounts ---
storage_accounts = [curated_storage_account, checkpoint_storage_account]

for storage_account in storage_accounts:
    account_host = f"{storage_account}.dfs.core.windows.net"

    # Service-principal (client credentials) OAuth settings for this ABFS account.
    configs = {
        f"fs.azure.account.auth.type.{account_host}": "OAuth",
        f"fs.azure.account.oauth.provider.type.{account_host}":
            "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
        f"fs.azure.account.oauth2.client.id.{account_host}": client_id,
        f"fs.azure.account.oauth2.client.secret.{account_host}": client_secret,
        f"fs.azure.account.oauth2.client.endpoint.{account_host}":
            f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
    }

    for key, val in configs.items():
        try:
            spark.conf.set(key, val)
        except Exception as e:
            # Single point of failure handling: log once and raise once. An
            # outer catch-all would re-log and re-wrap this same RuntimeError.
            logger.error(f"Failed to set Spark config '{key}' for storage account '{storage_account}': {e}", exc_info=True)
            raise RuntimeError(f"Failed to set Spark config '{key}' for storage account '{storage_account}': {e}") from e

    logger.info(f"✅ Successfully configured OAuth for storage account: {storage_account}")


In [0]:
# Container and path for storing the Delta table (in the curated storage account).
# Built from the shared container/account variables so the names are defined in
# one place; the resulting URI is unchanged.
data_path = f"abfss://{silver_curated_container}@{curated_storage_account}.dfs.core.windows.net/ARIADM/ARM/AUDIT/BAILS/bl_ack_audit_table"

# Container and path for the streaming checkpoint (in the xcutting storage account).
checkpoint_path = f"abfss://db-ack-checkpoint@{checkpoint_storage_account}.dfs.core.windows.net/BAILS/ACK/ack"


In [0]:
# Root of the published BAILS files in the gold container, and one glob per file type.
gold_data_path = f"abfss://gold@ingest{lz_key}curated{env}.dfs.core.windows.net/ARIADM/ARM/BAILS/"
html_path = f"{gold_data_path}HTML/*.html"
json_path = f"{gold_data_path}JSON/*.json"
a360_path = f"{gold_data_path}A360/*.a360"

# One binaryFile DataFrame per file type (one row per matched file).
html_df = spark.read.format("binaryFile").load(html_path)
json_df = spark.read.format("binaryFile").load(json_path)
a360_df = spark.read.format("binaryFile").load(a360_path)

# The number of files present is the number of acknowledgements expected.
expected_html = html_df.count()
expected_json = json_df.count()
expected_a360 = a360_df.count()

for file_type_message in (
    f"Expected HTML: {expected_html}",
    f"Expected JSON: {expected_json}",
    f"Expected A360: {expected_a360}",
):
    logger.info(file_type_message)

# Open questions on the pre-publish expectations:
# 1a. How many records are in the source data (prod ARIA DB)? Alternatively use the
#     outputs of the segmentation tables, e.g. the gold tables.
# 1b. JSON content and HTML content should match what was established in the source.
# 1c. A360 is %% (mod) 250.

In [0]:
# Streaming DataFrame over the ack event hub, read through Spark's Kafka source.
kafka_stream_reader = spark.readStream.format("kafka").options(**KAFKA_OPTIONS)
eventhubdf = kafka_stream_reader.load()

In [0]:
# The Kafka 'value' column is binary, so cast it to a string before parsing.
raw_json_column = col("value").cast("string").alias("json_str")

# Parse the JSON text with ack_schema into a single struct column.
ack_struct_column = from_json(col("json_str"), ack_schema).alias("json_obj")

# Flatten the struct so each ack field becomes a top-level column.
parsed_df = eventhubdf.select(raw_json_column).select(ack_struct_column).select("json_obj.*")

In [0]:
def get_processed_counts():
    """Count distinct acknowledged filenames per file type in the ack audit table.

    Reads the Delta table at ``data_path`` and computes all three counts in one
    aggregation, so each poll scans the table once rather than three times.

    Returns:
        tuple: ``(html_count, json_count, a360_count)`` - the number of distinct
        ``filename`` values ending in ``.html``, ``.json`` and ``.a360``.
    """
    # Imported locally because the notebook's import cell only brings in col and from_json.
    from pyspark.sql.functions import countDistinct, when

    df = spark.read.format("delta").load(data_path)

    def _distinct_with_suffix(suffix):
        # when() without otherwise() yields NULL for non-matching rows and
        # countDistinct skips NULLs, which matches filter + distinct + count.
        return countDistinct(when(col("filename").endswith(suffix), col("filename")))

    counts = df.agg(
        _distinct_with_suffix(".html").alias("html_count"),
        _distinct_with_suffix(".json").alias("json_count"),
        _distinct_with_suffix(".a360").alias("a360_count"),
    ).first()

    return counts["html_count"], counts["json_count"], counts["a360_count"]

In [0]:
# Seconds to wait between progress checks; avoids hammering the Delta table in a tight loop.
POLL_INTERVAL_SECONDS = 30
# Consecutive failed reads tolerated (e.g. the table not yet created by the first micro-batch).
MAX_CONSECUTIVE_COUNT_FAILURES = 5

# Start the stream: append parsed ack messages to the Delta audit table.
query = (
    parsed_df.writeStream
    .format("delta")
    .option("checkpointLocation", checkpoint_path)
    .outputMode("append")
    .start(data_path)
)

consecutive_count_failures = 0
while query.isActive:
    try:
        html_count, json_count, a360_count = get_processed_counts()
        consecutive_count_failures = 0
    except Exception as e:
        consecutive_count_failures += 1
        if consecutive_count_failures >= MAX_CONSECUTIVE_COUNT_FAILURES:
            logger.error(
                f"Failed to get processed counts after {consecutive_count_failures} attempts: {e}",
                exc_info=True,
            )
            # Do not leave the stream running when the notebook is about to fail.
            query.stop()
            raise
        logger.warning(
            f"Attempt {consecutive_count_failures} to get processed counts failed: {e}. "
            f"Retrying in {POLL_INTERVAL_SECONDS} seconds..."
        )
        time.sleep(POLL_INTERVAL_SECONDS)
        continue

    logger.info(f"\nStatus HTML: {html_count}/{expected_html} \nStatus JSON: {json_count}/{expected_json} \nStatus A360: {a360_count}/{expected_a360}")

    if (
        html_count >= expected_html
        and json_count >= expected_json
        and a360_count >= expected_a360
    ):
        logger.info("All files processed")
        query.stop()
        break

    time.sleep(POLL_INTERVAL_SECONDS)

# If the stream stopped by itself, surface the failure rather than carrying on
# to the reporting cells with incomplete data.
stream_error = query.exception()
if stream_error is not None:
    logger.error(f"Ack streaming query terminated with an error: {stream_error}")
    raise RuntimeError(f"Ack streaming query terminated with an error: {stream_error}") from stream_error

In [0]:
# Register the ack audit Delta table as a temp view for the SQL cells.
silver_container = "silver"
bails_ack_audit_path = f"abfss://{silver_container}@{curated_storage_account}.dfs.core.windows.net/ARIADM/ARM/AUDIT/BAILS/bl_ack_audit_table/"
df_bails_ack_audit_data = spark.read.format("delta").load(bails_ack_audit_path)
df_bails_ack_audit_data.createOrReplaceTempView("bails_acknowledge_audit_data")

In [0]:
%sql
-- Most recent acknowledgement per .a360 file; HTTP 201 is reported as Success,
-- anything else as Failure.
WITH ranked_acks AS (
  SELECT
    filename,
    CASE WHEN http_response = 201 THEN 'Success' ELSE 'Failure' END AS http_response_status,
    timestamp,
    ROW_NUMBER() OVER (PARTITION BY filename ORDER BY timestamp DESC) AS rn
  FROM bails_acknowledge_audit_data
)
SELECT
  filename,
  http_response_status,
  timestamp
FROM ranked_acks
WHERE rn = 1
  AND filename LIKE '%.a360'



In [0]:
# NOTE: disabled retry wrapper around get_processed_counts(), kept for reference only.
# max_attempt = 5
# delay = 60 # seconds

# for attempt in range(1, max_attempt + 1):
#     try:
#         html_count, json_count, a360_count = get_processed_counts()
#         break
#     except Exception as e:
#         if attempt < max_attempt:
#             print(f"Attempt {attempt} failed: {e}. Retrying in {delay} seconds... ")
#             time.sleep(delay)
#         else:
#             print(f"Failed to get processed counts after {max_attempt} attempts: {e}")
#             raise




In [0]:
%sql
-- Most recent acknowledgement per .json file; HTTP 201 is reported as Success,
-- anything else as Failure.
WITH ranked_acks AS (
  SELECT
    filename,
    CASE WHEN http_response = 201 THEN 'Success' ELSE 'Failure' END AS http_response_status,
    timestamp,
    ROW_NUMBER() OVER (PARTITION BY filename ORDER BY timestamp DESC) AS rn
  FROM bails_acknowledge_audit_data
)
SELECT
  filename,
  http_response_status,
  timestamp
FROM ranked_acks
WHERE rn = 1
  AND filename LIKE '%.json'

In [0]:
%sql
-- Most recent acknowledgement per .html file; HTTP 201 is reported as Success,
-- anything else as Failure.
WITH ranked_acks AS (
  SELECT
    filename,
    CASE WHEN http_response = 201 THEN 'Success' ELSE 'Failure' END AS http_response_status,
    timestamp,
    ROW_NUMBER() OVER (PARTITION BY filename ORDER BY timestamp DESC) AS rn
  FROM bails_acknowledge_audit_data
)
SELECT
  filename,
  http_response_status,
  timestamp
FROM ranked_acks
WHERE rn = 1
  AND filename LIKE '%.html'

In [0]:
# HTML acknowledgement summary: successful ack rows compared with the number of
# HTML files expected (expected_html).
spark.sql(f"""
    WITH results AS (
        SELECT
            split_part(filename, '.', -1) AS file_extension,
            COUNT(CASE WHEN http_response = 201 AND http_message = 'Created' THEN 1 END) AS count_of_successful_eventhub_responses,
            -- Complement of the success condition: either field being wrong makes the response unsuccessful.
            COUNT(CASE WHEN http_response <> 201 OR http_message <> 'Created' THEN 1 END) AS count_of_unsuccessful_eventhub_responses
        FROM bails_acknowledge_audit_data
        WHERE split_part(filename, '.', -1) = 'html'
        GROUP BY split_part(filename, '.', -1)
    )
    SELECT
        file_extension AS file_type,
        {expected_html} AS total_expected_html_eventhub_responses,
        count_of_successful_eventhub_responses AS `total_sent_html_eventhub_responses`,
        -- NULLIF guards against division by zero when no HTML files are expected.
        concat(ROUND((count_of_successful_eventhub_responses / NULLIF({expected_html}, 0)) * 100, 2), '%') AS `%total_expected_html_eventhub_responses`
    FROM results
""").display()

Databricks visualization. Run in Databricks to view.

In [0]:
# JSON acknowledgement summary: successful ack rows compared with the number of
# JSON files expected. Uses expected_json; the HTML expectation was previously
# used here by mistake, along with an html-named column.
spark.sql(f"""
    WITH results AS (
        SELECT
            split_part(filename, '.', -1) AS file_extension,
            COUNT(CASE WHEN http_response = 201 AND http_message = 'Created' THEN 1 END) AS count_of_successful_eventhub_responses,
            -- Complement of the success condition: either field being wrong makes the response unsuccessful.
            COUNT(CASE WHEN http_response <> 201 OR http_message <> 'Created' THEN 1 END) AS count_of_unsuccessful_eventhub_responses
        FROM bails_acknowledge_audit_data
        WHERE split_part(filename, '.', -1) = 'json'
        GROUP BY split_part(filename, '.', -1)
    )
    SELECT
        file_extension AS file_type,
        {expected_json} AS total_expected_json_eventhub_responses,
        count_of_successful_eventhub_responses AS `total_sent_json_eventhub_responses`,
        -- NULLIF guards against division by zero when no JSON files are expected.
        concat(ROUND((count_of_successful_eventhub_responses / NULLIF({expected_json}, 0)) * 100, 2), '%') AS `%total_expected_json_eventhub_responses`
    FROM results
""").display()

Databricks visualization. Run in Databricks to view.

In [0]:
# A360 acknowledgement summary: successful ack rows compared with the number of
# A360 files expected (expected_a360).
spark.sql(f"""
    WITH results AS (
        SELECT
            split_part(filename, '.', -1) AS file_extension,
            COUNT(CASE WHEN http_response = 201 AND http_message = 'Created' THEN 1 END) AS count_of_successful_eventhub_responses,
            -- Complement of the success condition: either field being wrong makes the response unsuccessful.
            COUNT(CASE WHEN http_response <> 201 OR http_message <> 'Created' THEN 1 END) AS count_of_unsuccessful_eventhub_responses
        FROM bails_acknowledge_audit_data
        WHERE split_part(filename, '.', -1) = 'a360'
        GROUP BY split_part(filename, '.', -1)
    )
    SELECT
        file_extension AS file_type,
        {expected_a360} AS total_expected_a360_eventhub_responses,
        -- Plain count: the second argument of CEIL is a rounding scale, so the
        -- expected file count must not be passed there.
        count_of_successful_eventhub_responses AS `count_of_successful_a360_responses`,
        -- NULLIF guards against division by zero when no A360 files are expected.
        concat(ROUND((count_of_successful_eventhub_responses / NULLIF({expected_a360}, 0)) * 100, 2), '%') AS `%total_expected_a360_eventhub_responses`
    FROM results
""").display()

Databricks visualization. Run in Databricks to view.

In [0]:
# Register the bl_cr_audit_table Delta table as a temp view for the SQL cells.
silver_container = "silver"
bl_cr_audit_path = f"abfss://{silver_container}@{curated_storage_account}.dfs.core.windows.net/ARIADM/ARM/AUDIT/BAILS/bl_cr_audit_table/"
bl_cr_audit_table = spark.read.format("delta").load(bl_cr_audit_path)
bl_cr_audit_table.createOrReplaceTempView("bl_cr_audit_table")

In [0]:
%sql
-- Record counts per table and stage from the audit table, for the segmentation
-- and gold stages of the listed tables.
-- The ORDER BY is on the final SELECT: ordering inside a CTE is not guaranteed
-- to carry through to the result.
WITH CTE AS (
  SELECT 
    COUNT(Unique_Identifier) AS count_of_gold_records, 
    Table_Name, 
    Stage_Name 
  FROM bl_cr_audit_table 
  WHERE Stage_Name IN ('segmentation_stage', 'gold_stage') 
    AND Table_Name IN (
      'silver_normal_bail', 
      'create_bails_json_content', 
      'create_bails_html_content', 
      'gold_bails_a360'
    )
  GROUP BY Table_Name, Stage_Name
)
SELECT
  count_of_gold_records,
  Table_Name,
  Stage_Name
FROM CTE
ORDER BY Stage_Name DESC

In [0]:
# Register the publish audit Delta table as a temp view.
df_bails_pub_audit_db_eh_audit_data = spark.read.format("delta").load(f"abfss://{silver_container}@{curated_storage_account}.dfs.core.windows.net/ARIADM/ARM/AUDIT/BAILS/bl_pub_audit_table/")
df_bails_pub_audit_db_eh_audit_data.createOrReplaceTempView("bl_publish_audit_db_eh_data")

# Publish outcome per file extension. The success percentage is measured against
# the expectation for the row's own file type (it was previously always divided
# by the HTML expectation) and is rounded to 2 decimals like the other summaries.
spark.sql(f"""
    WITH results AS (
        SELECT
            split_part(file_name, '.', -1) AS file_extension,
            COUNT(CASE WHEN status = 'success' THEN 1 END) AS count_of_successful_eventhub_responses,
            COUNT(CASE WHEN status <> 'success' THEN 1 END) AS count_of_unsuccessful_eventhub_responses
        FROM bl_publish_audit_db_eh_data
        GROUP BY split_part(file_name, '.', -1)
    )
    SELECT
        file_extension,
        {expected_html} AS total_expected_html_eventhub_responses,
        {expected_json} AS total_expected_json_eventhub_responses,
        {expected_a360} AS total_expected_a360_eventhub_responses,
        count_of_successful_eventhub_responses,
        count_of_unsuccessful_eventhub_responses,
        concat(
            ROUND(
                (count_of_successful_eventhub_responses
                 / NULLIF(
                       CASE file_extension
                           WHEN 'html' THEN {expected_html}
                           WHEN 'json' THEN {expected_json}
                           WHEN 'a360' THEN {expected_a360}
                       END,
                       0  -- avoid division by zero; unknown extensions yield NULL
                   )
                ) * 100, 2
            ), '%'
        ) AS `%_of_successful_eventhub_responses`
    FROM results
""").display()

In [0]:
# A360 publish summary: successful publish rows as a count and as a percentage
# of the expected number of A360 files.
a360_publish_summary_sql = f"""
    WITH results AS (
        SELECT 
            split_part(file_name, '.', -1) AS file_extension,
            COUNT(CASE WHEN status = 'success' THEN 1 END) AS count_of_successful_eventhub_responses,
            COUNT(CASE WHEN status != 'success' THEN 1 END) AS count_of_unsuccessful_eventhub_responses
        FROM bl_publish_audit_db_eh_data
        WHERE split_part(file_name, '.', -1) = 'a360'
        GROUP BY split_part(file_name, '.', -1)
    )
    SELECT
        file_extension AS file_type,
        {expected_a360} AS total_expected_a360_eventhub_responses,
        count_of_successful_eventhub_responses AS count_of_successful_a360_responses,
        concat(
            ROUND(
                (count_of_successful_eventhub_responses / {expected_a360}) * 100, 2
            ), "%"
        ) AS `%total_expected_a360_eventhub_responses`
    FROM results
"""
a360_publish_summary_df = spark.sql(a360_publish_summary_sql)
a360_publish_summary_df.display()

Databricks visualization. Run in Databricks to view.

In [0]:
# HTML publish summary: successful publish rows as a count and as a percentage
# of the expected number of HTML files.
html_publish_summary_sql = f"""
    WITH results AS (
        SELECT 
            split_part(file_name, '.', -1) AS file_extension,
            COUNT(CASE WHEN status = 'success' THEN 1 END) AS count_of_successful_eventhub_responses,
            COUNT(CASE WHEN status != 'success' THEN 1 END) AS count_of_unsuccessful_eventhub_responses
        FROM bl_publish_audit_db_eh_data
        WHERE split_part(file_name, '.', -1) = 'html'
        GROUP BY split_part(file_name, '.', -1)
    )
    SELECT
        file_extension AS file_type,
        {expected_html} AS total_expected_html_eventhub_responses,
        count_of_successful_eventhub_responses AS count_of_successful_html_responses,
        concat(
            ROUND(
                (count_of_successful_eventhub_responses / {expected_html}) * 100, 2
            ), "%"
        ) AS `%total_expected_html_eventhub_responses`
    FROM results
"""
html_publish_summary_df = spark.sql(html_publish_summary_sql)
html_publish_summary_df.display()

Databricks visualization. Run in Databricks to view.

In [0]:
# JSON publish summary: successful publish rows as a count and as a percentage
# of the expected number of JSON files.
json_publish_summary_sql = f"""
    WITH results AS (
        SELECT 
            split_part(file_name, '.', -1) AS file_extension,
            COUNT(CASE WHEN status = 'success' THEN 1 END) AS count_of_successful_eventhub_responses,
            COUNT(CASE WHEN status != 'success' THEN 1 END) AS count_of_unsuccessful_eventhub_responses
        FROM bl_publish_audit_db_eh_data
        WHERE split_part(file_name, '.', -1) = 'json'
        GROUP BY split_part(file_name, '.', -1)
    )
    SELECT
        file_extension AS file_type,
        {expected_json} AS total_expected_json_eventhub_responses,
        count_of_successful_eventhub_responses AS count_of_successful_json_responses,
        concat(
            ROUND(
                (count_of_successful_eventhub_responses / {expected_json}) * 100, 2
            ), "%"
        ) AS `%total_expected_json_eventhub_responses`
    FROM results
"""
json_publish_summary_df = spark.sql(json_publish_summary_sql)
json_publish_summary_df.display()

Databricks visualization. Run in Databricks to view.

In [0]:
# End the notebook run and hand this message back as the notebook's exit value.
dbutils.notebook.exit("Notebook completed successfully")