In [0]:
%pip install confluent_kafka

In [0]:
from confluent_kafka import Producer
import json
from  itertools import islice
import numpy as np
from pyspark.sql.functions import col, decode, split, element_at, udf, lit, reduce, from_json, regexp_replace, concat
import logging
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DateType
import datetime
from pyspark.sql import SparkSession, DataFrame
from pyspark import SparkContext
import os
from functools import reduce
import time
import traceback

In [0]:
# Notebook-wide logger: INFO level, timestamped "<time> - <name> - <level> - <message>" lines.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)

logger = logging.getLogger("DatabricksWorkflow")
logger.setLevel(logging.INFO)

# hasHandlers() also looks at ancestor loggers, so the stream handler is only
# attached when nothing up the logger hierarchy would already emit the records.
# This also prevents stacking a second handler when the cell is re-run.
if logger.hasHandlers():
    pass
else:
    logger.addHandler(handler)

In [0]:
# Load the environment configuration and derive the secret-scope name from it.
#
# Produces the module-level names used by later cells:
#   config        - DataFrame read from the multiline JSON config file
#   env, lz_key   - stripped, lower-cased values of the "env" / "lz_key" fields
#   keyvault_name - f"ingest{lz_key}-meta002-{env}"
#
# Raises:
#   FileNotFoundError - the config file could not be read
#   KeyError          - the config is empty or the expected fields are unusable
config_path = "dbfs:/configs/config.json"
try:
    config = spark.read.option("multiline", "true").json(config_path)
    logger.info(f"Successfully read config file from {config_path}")
except Exception as e:
    logger.error(f"Could not read config file at {config_path}: {e}", exc_info=True)
    # Chain explicitly so the original Spark error stays attached as the cause.
    raise FileNotFoundError(f"Could not read config file at {config_path}: {e}") from e

try:
    first_row = config.first()
    if first_row is None:
        # first() returns None for an empty DataFrame; fail with a clear reason
        # instead of "'NoneType' object is not subscriptable".
        raise ValueError(f"config file at {config_path} contains no rows")
    env = first_row["env"].strip().lower()
    lz_key = first_row["lz_key"].strip().lower()
    logger.info(f"Extracted configs: env={env}, lz_key={lz_key}")
except Exception as e:
    logger.error(f"Missing expected keys 'env' or 'lz_key' in config file: {e}", exc_info=True)
    raise KeyError(f"Missing expected keys 'env' or 'lz_key' in config file: {e}") from e

# env and lz_key are guaranteed to be str here, so this formatting cannot fail
# and needs no error handling of its own.
keyvault_name = f"ingest{lz_key}-meta002-{env}"
logger.info(f"Constructed keyvault name: {keyvault_name}")


In [0]:

def _get_service_principal_secret(secret_key):
    """Fetch one secret from the secret scope named by ``keyvault_name``.

    Args:
        secret_key: Name of the secret to read from the scope.

    Returns:
        The value returned by ``dbutils.secrets.get`` for that key.

    Raises:
        KeyError: If the secret cannot be retrieved for any reason; the
            underlying exception is attached as the cause.
    """
    try:
        secret_value = dbutils.secrets.get(scope=keyvault_name, key=secret_key)
    except Exception as e:
        logger.error(f"Could not retrieve '{secret_key}' from Key Vault '{keyvault_name}': {e}", exc_info=True)
        raise KeyError(f"Could not retrieve '{secret_key}' from Key Vault '{keyvault_name}': {e}") from e
    logger.info(f"Successfully retrieved {secret_key} from Key Vault")
    return secret_value


# Service Principal credentials used below to configure OAuth access to storage.
# NOTE: the "PRINCIPLE" spelling is part of the stored secret names and must be kept.
client_secret = _get_service_principal_secret('SERVICE-PRINCIPLE-CLIENT-SECRET')
tenant_id = _get_service_principal_secret('SERVICE-PRINCIPLE-TENANT-ID')
client_id = _get_service_principal_secret('SERVICE-PRINCIPLE-CLIENT-ID')

logger.info("✅ Successfully retrieved all Service Principal secrets from Key Vault")


In [0]:
# --- Parameterise containers ---
curated_storage_account = f"ingest{lz_key}curated{env}"
curated_container = "gold"
silver_curated_container = "silver"
checkpoint_storage_account = f"ingest{lz_key}xcutting{env}"

# --- Assign OAuth to storage accounts ---
# Every account listed here gets the same Service Principal client-credentials
# settings applied to the Spark session for its ABFS (dfs.core.windows.net) endpoint.
storage_accounts = [curated_storage_account, checkpoint_storage_account]

for storage_account in storage_accounts:
    configs = {
        f"fs.azure.account.auth.type.{storage_account}.dfs.core.windows.net": "OAuth",
        f"fs.azure.account.oauth.provider.type.{storage_account}.dfs.core.windows.net":
            "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
        f"fs.azure.account.oauth2.client.id.{storage_account}.dfs.core.windows.net": client_id,
        f"fs.azure.account.oauth2.client.secret.{storage_account}.dfs.core.windows.net": client_secret,
        f"fs.azure.account.oauth2.client.endpoint.{storage_account}.dfs.core.windows.net":
            f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
    }

    for key, val in configs.items():
        try:
            spark.conf.set(key, val)
        except Exception as e:
            # Only the config key is logged, never the value, so secrets stay out of the logs.
            # A single handler reports the failure once; the previous outer handler
            # re-caught this RuntimeError and logged/wrapped the same failure a second time.
            logger.error(f"Failed to set Spark config '{key}' for storage account '{storage_account}': {e}", exc_info=True)
            raise RuntimeError(f"Failed to set Spark config '{key}' for storage account '{storage_account}': {e}") from e

    logger.info(f"✅ Successfully configured OAuth for storage account: {storage_account}")


In [0]:
# Load the two silver-layer audit Delta tables and show them joined per case/state.
# Both paths are built from the parameterised container/account names so the two
# reads cannot drift apart (the first path previously hard-coded the same values).
silver_ccd_base_path = (
    f"abfss://{silver_curated_container}@{curated_storage_account}.dfs.core.windows.net/ARIADM/ACTIVE/CCD"
)

# Audit rows for the CCD call step (read below as t2).
ccd_call_result = spark.read.format("delta").load(f"{silver_ccd_base_path}/AUDIT/APPEALS/all_active_states/ack_audit")
ccd_call_result.createOrReplaceTempView("ccd_call_result")

# Audit rows for the publish-payload step (read below as t1).
# NOTE(review): "publish_layload_audit" looks like a misspelling of "publish_payload_audit";
# left unchanged because it must match the folder that actually exists in storage — confirm.
ccd_publish_payload_result = spark.read.format("delta").load(f"{silver_ccd_base_path}/APPEALS/publish_layload_audit")
ccd_publish_payload_result.createOrReplaceTempView("ccd_publish_payload_result")

# Full outer join so that cases present in only one of the two audit tables still appear.
# NOTE(review): the join matches t2.CaseNo to t1.Filename, yet the CaseNo fallback uses
# t1.CaseNo — confirm both columns carry the same file-style value.
spark.sql("""SELECT 
          
        COALESCE(t2.runid, t1.runid) as RunID,
        COALESCE(t2.state, t1.state) as State,
        split_part(COALESCE(t2.CaseNo, t1.CaseNo), ".", 1) as CaseNo,
        t1.status as `Publish Payload Status`,
        t2.status as `CCD Call Status`,
        t1.PublishingDateTime as `CCD Publish Payload Publishing Date Time`,
        t2.error as `CCD Call Function App Error`,
        t2.StartDateTime as `CCD Call Function App Start Date Time`,
        t2.EndDateTime as `CCD Call Function App End Date Time`,
        t2.CCDCaseID as `CCD Case ID`,
        t1.error as `CCD Publish Payload Error`
          
          FROM ccd_publish_payload_result t1
          full outer join ccd_call_result t2 on t2.CaseNo = t1.Filename and t1.state = t2.state

          -- WHERE t2.StartDateTime >= '2025-11-28 11:40:00' and t1.PublishingDateTime >= '2025-11-28 11:40:00'
          -- ORDER BY t1.PublishingDateTime DESC
          
          """).display()
# TODO: once there are working runs, add "and t1.RunID = t2.RunID" to the join conditions.

In [0]:
# Discover the gold-layer JSON output files for each state and expose them as
# two DataFrames (valid_df / invalid_df) with columns file_name, file_path, State.
valid_json_list = []
invalid_json_list = []

# Other states that can be added to the list:
# "appealSubmitted", "awaitingRespondentEvidence(a)",
# "awaitingRespondentEvidence(b)", "reasonsForAppealSubmitted", "caseUnderReview"
states = ["paymentPending"]

# Explicit schema: without it spark.createDataFrame() raises on an empty list,
# which happens whenever a run produced no invalid (or no valid) JSON files.
json_file_schema = StructType([
    StructField("file_name", StringType(), True),
    StructField("file_path", StringType(), True),
])


def _register_if_populated(folder_path, label, state, registry):
    """Append ``folder_path`` to ``registry`` when the folder exists and is non-empty.

    A missing/unlistable folder or an empty folder only produces a warning;
    ``label`` ("valid" / "invalid") is used in the warning text.
    """
    try:
        entries = dbutils.fs.ls(folder_path)
    except Exception:
        logger.warning(f"No {label} JSON folder found for {state}")
        return
    if entries:
        registry.append(folder_path)
    else:
        logger.warning(f"Empty {label} JSON folder for {state}")


def _collect_file_records(folder_paths):
    """Return one {"file_name", "file_path"} dict per entry found in ``folder_paths``."""
    return [
        {"file_name": entry.name, "file_path": entry.path}
        for folder_path in folder_paths
        for entry in dbutils.fs.ls(folder_path)
    ]


def _records_to_state_df(records):
    """Build a DataFrame from file records and add the ``State`` column.

    ``State`` is the 4th path segment from the end, which for
    ``.../APPEALS/{state}/{run folder}/JSON/{file}`` is the state folder.
    """
    rows = [(record["file_name"], record["file_path"]) for record in records]
    files_df = spark.createDataFrame(rows, schema=json_file_schema)
    return files_df.withColumn("State", element_at(split(col("file_path"), "/"), -4))


# For each state, retrieve the most recent gold output from DLT invalid + valid files (folder level)
for state in states:
    gold_files_base_path = (
        f"abfss://{curated_container}@{curated_storage_account}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/{state}/")
    folders = dbutils.fs.ls(gold_files_base_path)
    if not folders:
        logger.warning(f"No folders found for {state}")
        continue

    # Pick the latest folder by name instead of relying on the listing order.
    # Assumes run folder names sort chronologically — TODO confirm.
    latest_folder = max(folders, key=lambda entry: entry.name)
    valid_path = latest_folder.path + "JSON/"
    invalid_path = latest_folder.path + "INVALID_JSON/"

    # Only keep folders that actually contain data
    _register_if_populated(valid_path, "valid", state, valid_json_list)
    _register_if_populated(invalid_path, "invalid", state, invalid_json_list)

# Retrieve the path for each file in the valid/invalid folders
valid_df_list = _collect_file_records(valid_json_list)
invalid_df_list = _collect_file_records(invalid_json_list)

# Create the DataFrames and pull out the state from the file_path
valid_df = _records_to_state_df(valid_df_list)
invalid_df = _records_to_state_df(invalid_df_list)

In [0]:
# Read the segmentation staging table and expose it as the "segmentation_state" view.
segmentation_source = spark.read.table('hive_metastore.paymentpending_gold.stg_main_payment_pending_validation')

# CaseNo is rewritten as APPEALS_<CaseNo with "/" replaced by "_">.json
case_file_name = concat(
    lit("APPEALS_"),
    regexp_replace(col("CaseNo"), "/", "_"),
    lit(".json"),
)

# Keep only the rewritten CaseNo and the desired state for the downstream query.
segmentation_df = (
    segmentation_source
    .withColumn("CaseNo", case_file_name)
    .select(col("CaseNo"), col("ariaDesiredState"))
)
segmentation_df.createOrReplaceTempView("segmentation_state")

In [0]:
# Create views that we can use to query the data
valid_df.createOrReplaceTempView("valid_json_files")
invalid_df.createOrReplaceTempView("invalid_json_files")

## Query to trace data for each state from segmentation (gold layer) through valid/invalid
## Through each layer of the CCD pipeline (publish, case creation, validation, submission, results)
#
# Join-key handling: after the first FULL OUTER JOIN, a.CaseNo is NULL for cases that
# exist only in the valid/invalid file listing. The second join therefore matches on
# COALESCE(a.CaseNo, b.CaseNo), otherwise those cases would never line up with their
# publish/call audit rows. The output CaseNo/State also fall back to d, so rows that
# exist only in the audit tables are not shown with NULL keys.
spark.sql('''
        with segmentation_state_cte as (
        SELECT 
        split_part(CaseNo, '.', 1) AS CaseNo,
        ariaDesiredState as State,
        'Yes' as `Segmentation State`
        FROM segmentation_state),

        valid_invalid_json_cte as (
        SELECT
        split_part(file_name, '.', 1) AS CaseNo,
        state AS State,
        'Invalid' AS `Validation Status`
        FROM invalid_json_files

        UNION ALL

        SELECT
        split_part(file_name, '.', 1) AS CaseNo,
        state AS State,
        'Valid' AS `Validation Status`
        FROM valid_json_files),

        pub_payload_call_result_cte as (
        SELECT 
        COALESCE(t2.runid, t1.runid) as RunID,
        COALESCE(t2.state, t1.state) as State,
        split_part(COALESCE(t2.CaseNo, t1.CaseNo), ".", 1) as CaseNo,
        t1.status as `CCD Publish Payload Status`,
        t2.status as `CCD Call Status`,
        t1.PublishingDateTime as `CCD Publish Payload Publishing Date Time`,
        t2.error as `CCD Call Function App Error`,
        t2.StartDateTime as `CCD Call Function App Start Date Time`,
        t2.EndDateTime as `CCD Call Function App End Date Time`,
        t2.CCDCaseID as `CCD Case ID`,
        t1.error as `CCD Publish Payload Error`

        FROM ccd_publish_payload_result t1
        full outer join ccd_call_result t2 on t2.CaseNo = t1.Filename and t1.state = t2.state)

        SELECT
        COALESCE(a.State, b.State, d.State) as State, 
        COALESCE(a.CaseNo, b.CaseNo, d.CaseNo) as CaseNo,
        a.`Segmentation State`,
        b.`Validation Status`,
        d.`CCD Publish Payload Status`,
        d.`CCD Call Status`,
        d.`CCD Publish Payload Publishing Date Time`,
        d.`CCD Publish Payload Error`,
        d.`CCD Call Function App Start Date Time`,
        d.`CCD Call Function App End Date Time`,
        d.`CCD Case ID`,
        d.`CCD Call Function App Error`
        
        FROM segmentation_state_cte a FULL OUTER JOIN valid_invalid_json_cte b on a.CaseNo = b.CaseNo
                                      FULL OUTER JOIN pub_payload_call_result_cte d on COALESCE(a.CaseNo, b.CaseNo) = d.CaseNo

        -- WHERE (d.`CCD Call Function App Start Date Time` >= '2025-11-28 11:40:00' OR d.`CCD Call Function App Start Date Time` IS NULL)
        -- and (d.`CCD Publish Payload Publishing Date Time` >= '2025-11-28 11:40:00' OR d.`CCD Publish Payload Publishing Date Time` IS NULL)

        -- ORDER BY d.`CCD Publish Payload Publishing Date Time` DESC
          
          ''').display()

In [0]:
# End the notebook run, handing this status message to dbutils.notebook.exit.
exit_message = "Notebook completed successfully"
dbutils.notebook.exit(exit_message)