In [0]:
# %pip install confluent-kafka  # Required on job clusters until this notebook is deployed via Databricks Asset Bundles (DABs)

In [0]:
import uuid
import time
import logging
from datetime import datetime
from functools import reduce
import os

from pyspark.sql import SparkSession, DataFrame, Row
from pyspark.sql.functions import col, decode, split, element_at, lit, from_json, regexp_replace, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType

from confluent_kafka import Producer

import json
from itertools import islice
import numpy as np

In [0]:
# Notebook-wide logger: INFO and above, rendered as "time - name - level - message".
logger = logging.getLogger("DatabricksWorkflow")
logger.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)

# hasHandlers() also inspects ancestor loggers, so the stream handler is attached only when
# nothing in the hierarchy would already emit these records (prevents duplicates on re-run).
if not logger.hasHandlers():
    logger.addHandler(handler)

In [0]:
# --- Load configuration JSON ---
# The config file supplies `env` and `lz_key`, from which the Key Vault scope name below is built.
config_path = "dbfs:/configs/config.json"
try:
    config = spark.read.option("multiline", "true").json(config_path)
    logger.info(f"Successfully read config file from {config_path}")
except Exception as e:
    logger.error(f"Could not read config file at {config_path}: {e}", exc_info=True)
    # Chain the original error so the root cause stays visible in the traceback.
    raise FileNotFoundError(f"Could not read config file at {config_path}: {e}") from e

# --- Extract environment and lz_key ---
try:
    first_row = config.first()
    if first_row is None:
        # first() returns None for an empty DataFrame; fail with a message that says so
        # instead of an opaque "'NoneType' object is not subscriptable".
        raise ValueError(f"config file at {config_path} contains no rows")
    # Both values are normalised (trimmed, lower-cased) before being used in resource names.
    env = first_row["env"].strip().lower()
    lz_key = first_row["lz_key"].strip().lower()
    logger.info(f"Extracted configs: env={env}, lz_key={lz_key}")
except Exception as e:
    logger.error(f"Missing expected keys 'env' or 'lz_key' in config file: {e}", exc_info=True)
    raise KeyError(f"Missing expected keys 'env' or 'lz_key' in config file: {e}") from e

# --- Construct keyvault name ---
# Formatting two strings cannot fail, so no error handling is needed here.
keyvault_name = f"ingest{lz_key}-meta002-{env}"
logger.info(f"Constructed keyvault name: {keyvault_name}")


In [0]:
# --- Access the Service Principal secrets from Key Vault ---
def _read_sp_secret(secret_key):
    """Fetch one secret from the `keyvault_name` scope and log the outcome.

    Args:
        secret_key: name of the secret to read.

    Returns:
        The secret value returned by dbutils.secrets.get.

    Raises:
        KeyError: if retrieving the secret fails for any reason.
    """
    try:
        secret_value = dbutils.secrets.get(scope=keyvault_name, key=secret_key)
        logger.info(f"Successfully retrieved {secret_key} from Key Vault")
        return secret_value
    except Exception as e:
        logger.error(f"Could not retrieve '{secret_key}' from Key Vault '{keyvault_name}': {e}", exc_info=True)
        raise KeyError(f"Could not retrieve '{secret_key}' from Key Vault '{keyvault_name}': {e}")


# Retrieval order is kept: client secret, tenant id, client id.
client_secret = _read_sp_secret('SERVICE-PRINCIPLE-CLIENT-SECRET')
tenant_id = _read_sp_secret('SERVICE-PRINCIPLE-TENANT-ID')
client_id = _read_sp_secret('SERVICE-PRINCIPLE-CLIENT-ID')

logger.info("‚úÖ Successfully retrieved all Service Principal secrets from Key Vault")


In [0]:
# --- Parameterise containers ---
curated_storage_account = f"ingest{lz_key}curated{env}"
curated_container = "gold"
silver_curated_container = "silver"
checkpoint_storage_account = f"ingest{lz_key}xcutting{env}"

# --- Assign OAuth to storage accounts ---
# Each account gets the same five ABFS OAuth settings, keyed by its DFS endpoint and backed by
# the Service Principal credentials read earlier.
storage_accounts = [curated_storage_account, checkpoint_storage_account]

for storage_account in storage_accounts:
    dfs_endpoint = f"{storage_account}.dfs.core.windows.net"
    configs = {
        f"fs.azure.account.auth.type.{dfs_endpoint}": "OAuth",
        f"fs.azure.account.oauth.provider.type.{dfs_endpoint}":
            "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
        f"fs.azure.account.oauth2.client.id.{dfs_endpoint}": client_id,
        f"fs.azure.account.oauth2.client.secret.{dfs_endpoint}": client_secret,
        f"fs.azure.account.oauth2.client.endpoint.{dfs_endpoint}":
            f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
    }

    for key, val in configs.items():
        # A single handler per setting: the failure is logged once and raised once, naming the
        # exact config key. Only the key is logged, never the value (it may be a secret).
        try:
            spark.conf.set(key, val)
        except Exception as e:
            logger.error(f"Failed to set Spark config '{key}' for storage account '{storage_account}': {e}", exc_info=True)
            raise RuntimeError(f"Failed to set Spark config '{key}' for storage account '{storage_account}': {e}") from e

    logger.info(f"‚úÖ Successfully configured OAuth for storage account: {storage_account}")


In [0]:
# Shared access key for the Event Hubs namespace, read from the same secret scope as above.
eh_kv_secret = dbutils.secrets.get(scope=keyvault_name, key="RootManageSharedAccessKey")

# Event Hub configurations
eventhubs_hostname = f"ingest{lz_key}-integration-eventHubNamespace001-{env}.servicebus.windows.net:9093"

# SASL PLAIN over TLS, with the literal user "$ConnectionString" and the secret as password.
sasl_settings = {
    'security.protocol': 'SASL_SSL',
    'sasl.mechanism': 'PLAIN',
    'sasl.username': '$ConnectionString',
    'sasl.password': eh_kv_secret,
}

delivery_settings = {
    'retries': 5,                # Increased retries
    'enable.idempotence': True,  # Enable idempotent producer -- TODO: confirm use with ara
}

conf = {'bootstrap.servers': eventhubs_hostname, **sasl_settings, **delivery_settings}

# Broadcast so the function run by mapPartitions can read the settings via broadcast_conf.value.
broadcast_conf = sc.broadcast(conf)

In [0]:
# --- Define schema for result DataFrame ---
# One nullable string column per audit field, in the order the publish step emits its tuples.
result_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "RunID",
        "CaseNo",
        "Filename",
        "State",
        "PublishingDateTime",
        "Status",
        "Error",
    )
])

# --- Widgets and state ---
# The state to process comes from a notebook widget (default value "default").
dbutils.widgets.text("state", "default", "State to Process")
state = dbutils.widgets.get("state")
logger.info(f"üîÑ Processing state: {state}")

# --- Define paths ---
curated_storage_account = f"ingest{lz_key}curated{env}"
curated_container, silver_curated_container = "gold", "silver"

# Gold holds the payload files to publish; silver holds the per-state publish audit table.
gold_files_base_path = f"abfss://{curated_container}@{curated_storage_account}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/{state}/"
silver_base_path = f"abfss://{silver_curated_container}@{curated_storage_account}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/{state}/publish_audit_db_eh"

# --- Load files ---
# Publishes every HTML/JSON file under the last-listed gold folder for this state to Event Hubs,
# wrapped in a JSON envelope, and appends one audit row per outcome to the silver Delta table.
# NOTE(review): the next cell repeats this flow against the same topic with the raw file content
# as payload; running both cells publishes every file twice -- confirm which one is intended.
try:
    # NOTE(review): assumes the listing order puts the newest folder last -- confirm.
    files = dbutils.fs.ls(gold_files_base_path)[-1]  # newest file
    valid_json = files.path + "JSON/"
    logger.info(f"üìÇ Valid JSON path: {valid_json}")

    try:
        # Existence probe only; the listing itself is not used.
        dbutils.fs.ls(valid_json)
    except Exception:
        logger.warning(f"‚ÑπÔ∏è No VALID_JSON directory found for state: {state}")
    else:
        # Load binary files
        binary_df = (
            spark.read.format('binaryFile')
            .option('pathGlobFilter', '*.{html,json}')
            .option('recursiveFileLookup', 'true')
            .load(valid_json)
        )

        # Generate unique RunID per batch
        try:
            logger.info("Attempting to get Databricks context...")

            # Get the context JSON (string)
            context_str = dbutils.notebook.entry_point.getDbutils().notebook().getContext().toJson()
            logger.debug(f"Raw context JSON: {context_str}")

            # Parse JSON into a dict
            context = json.loads(context_str)
            tags = context.get("tags", {})

            # Pull jobRunId directly
            run_id = tags.get("jobRunId")
            if run_id:
                logger.info(f"Using jobRunId from tags: {run_id}")
            else:
                logger.warning("jobRunId not found in tags!")

        except Exception as e:
            logger.error(f"Exception retrieving Databricks jobRunId: {e}")
            run_id = None

        # Transform dataframe
        # file_path is the last path segment; CaseNo is that name with a trailing ".json" removed.
        html_df = (
            binary_df
            .withColumn("content_str", decode(col('content'), 'utf-8'))
            .withColumn("file_path", element_at(split(col('path'), '/'), -1))
            .withColumn("State", lit(state))
            .withColumn("CaseNo", regexp_replace(col("file_path"), r"\.json$", ""))
            .withColumn("RunID", lit(run_id))
            .select('RunID', 'CaseNo', 'content_str', 'file_path', 'State')
        )

        display(html_df)

        record_count = html_df.count()
        if record_count == 0:
            print(f"‚ÑπÔ∏è No data to process for state: {state}")
        else:
            print(f"üìä Found {record_count} records for state: {state}")
            optimized_html_df = html_df.repartition(1)

            # --- Partition processing ---
            def process_partition(partition):
                """Publish each row of one partition and return its audit tuples.

                Args:
                    partition: iterator of rows exposing RunID, CaseNo, content_str,
                        file_path and State.

                Returns:
                    A list of (RunID, CaseNo, Filename, State, PublishingDateTime, Status,
                    Error) tuples: one per message whose delivery succeeded, failed, or
                    that could not be enqueued. Rows with a missing file_path or
                    content_str are skipped and yield no tuple.
                """
                # KafkaException must be imported for the except clause below to resolve.
                from confluent_kafka import Producer, KafkaException
                from datetime import datetime

                success_list = []
                failure_list = []

                producer = Producer(**broadcast_conf.value)
                topic = f'evh-active-pub-{lz_key}-uks-dlrm-01'
                buffer_full_wait_s = 5  # how long to wait for queue space before the single retry

                def utc_timestamp():
                    return datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")

                # --- Closure for callback to capture row-specific variables ---
                def make_delivery_report(case_no, state, run_id):
                    def delivery_report(err, msg):
                        key_str = msg.key().decode('utf-8') if msg.key() else "Unknown"
                        timestamp = utc_timestamp()
                        if err:
                            failure_list.append((run_id, case_no, key_str, state, timestamp, "ERROR", str(err)))
                            logger.error(f"Message delivery failed for {case_no}: {err}")
                        else:
                            success_list.append((run_id, case_no, key_str, state, timestamp, "SUCCESS", ""))
                            logger.info(f"Message delivered successfully for {case_no}")
                    return delivery_report

                for row in partition:
                    if row.file_path is None or row.content_str is None:
                        logger.warning(f"Skipping row with missing file_path/content_str: {row}")
                        continue

                    current_CaseNo = row.CaseNo
                    current_state_row = row.State
                    current_RunID = row.RunID
                    current_file_path = row.file_path

                    delivery_callback = make_delivery_report(current_CaseNo, current_state_row, current_RunID)

                    # --- Build the JSON envelope ---
                    if isinstance(row.content_str, str):
                        content = row.content_str
                    elif isinstance(row.content_str, (bytes, bytearray)):
                        content = row.content_str.decode('utf-8', errors='ignore')
                    else:
                        failure_list.append((current_RunID, current_CaseNo, "Unknown", current_state_row, utc_timestamp(), "ERROR", f"Unsupported type {type(row.content_str)}"))
                        continue

                    value = json.dumps({
                        "RunID": current_RunID,
                        "CaseNo": current_CaseNo,
                        "State": current_state_row,
                        "Filename": current_file_path,
                        "Content": content
                    }).encode('utf-8')

                    produce_args = dict(
                        topic=topic,
                        key=current_file_path.encode('utf-8'),
                        value=value,
                        callback=delivery_callback,
                    )

                    # --- Produce to Kafka ---
                    try:
                        try:
                            producer.produce(**produce_args)
                        except BufferError:
                            # Local queue is full: serve delivery callbacks to free space, then
                            # retry once instead of silently dropping the message.
                            logger.warning("Producer buffer full; waiting for deliveries before retrying.")
                            producer.poll(buffer_full_wait_s)
                            producer.produce(**produce_args)
                    except (BufferError, KafkaException) as e:
                        # The message was never enqueued, so no delivery callback will fire for
                        # it; record the failure here so the audit is complete.
                        logger.error(f"Kafka produce failed (check connectivity!): {e}")
                        failure_list.append((current_RunID, current_CaseNo, current_file_path, current_state_row, utc_timestamp(), "ERROR", str(e)))

                    # Serve delivery callbacks as we go so the local queue keeps draining.
                    producer.poll(0)

                # Flush producer at the end of partition
                try:
                    producer.flush()
                except Exception as e:
                    logger.error(f"Flush error: {e}")

                # Merge results
                return success_list + failure_list

            # --- Map partitions and collect results ---
            # mapPartitions is lazy and process_partition sends messages as a side effect, so a
            # DataFrame built directly on the RDD would re-publish on every action below (write,
            # display, count). Collect the audit tuples once and build the DataFrame from them.
            result_rdd = optimized_html_df.rdd.mapPartitions(process_partition)
            result_rows = result_rdd.collect()
            result_df = spark.createDataFrame(result_rows, result_schema)

            # --- Write results incrementally to Delta ---
            result_df.write.format("delta") \
                .mode("append") \
                .option("mergeSchema", "true") \
                .save(silver_base_path)

            # --- Display results ---
            display(result_df.select("RunID", "CaseNo", "State", "PublishingDateTime", "Status", "Error"))

            # Highlight failures
            failed_df = result_df.filter(col("Status") == "ERROR")
            failed_count = failed_df.count()
            if failed_count > 0:
                logger.error(f"‚ö†Ô∏è Found {failed_count} failed records for state: {state}")
                display(failed_df.select("RunID", "CaseNo", "State", "PublishingDateTime", "Status", "Error"))
            else:
                logger.info(f"‚úÖ No failed records for state: {state}")

            kafka_result_count = len(result_rows)
            logger.info(f"üìä Kafka processing completed: {kafka_result_count} records for state: {state}")
            # Report what was actually delivered rather than the number of input rows.
            logger.info(f"‚úÖ Successfully sent {kafka_result_count - failed_count} records to Kafka for state: {state}")

except Exception as e:
    # Errors are logged (with traceback) but not re-raised, so later cells still run.
    # NOTE(review): this means the job does not fail on a processing error -- confirm intended.
    logger.error(f"‚ùå Error processing state {state}: {e}", exc_info=True)

logger.info(f"üéâ Completed processing for state: {state}")

In [0]:
# --- Define schema for result DataFrame ---
# One nullable string column per audit field, in the order the publish step emits its tuples.
result_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "RunID",
        "CaseNo",
        "Filename",
        "State",
        "PublishingDateTime",
        "Status",
        "Error",
    )
])

# --- Widgets and state ---
# The state to process comes from a notebook widget (default value "default").
dbutils.widgets.text("state", "default", "State to Process")
state = dbutils.widgets.get("state")
logger.info(f"üîÑ Processing state: {state}")

# --- Define paths ---
curated_storage_account = f"ingest{lz_key}curated{env}"
curated_container, silver_curated_container = "gold", "silver"

# Gold holds the payload files to publish; silver holds the per-state publish audit table.
gold_files_base_path = f"abfss://{curated_container}@{curated_storage_account}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/{state}/"
silver_base_path = f"abfss://{silver_curated_container}@{curated_storage_account}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/{state}/publish_audit_db_eh"

# --- Load files ---
# Publishes the raw content of every HTML/JSON file under the last-listed gold folder for this
# state to Event Hubs and appends one audit row per outcome to the silver Delta table.
# NOTE(review): the previous cell runs the same flow against the same topic with a JSON envelope
# as payload; running both cells publishes every file twice -- confirm which one is intended.
try:
    # NOTE(review): assumes the listing order puts the newest folder last -- confirm.
    files = dbutils.fs.ls(gold_files_base_path)[-1]  # newest file
    valid_json = files.path + "JSON/"
    logger.info(f"üìÇ Valid JSON path: {valid_json}")

    try:
        # Existence probe only; the listing itself is not used.
        dbutils.fs.ls(valid_json)
    except Exception:
        logger.warning(f"‚ÑπÔ∏è No VALID_JSON directory found for state: {state}")
    else:
        # Load binary files
        binary_df = (
            spark.read.format('binaryFile')
            .option('pathGlobFilter', '*.{html,json}')
            .option('recursiveFileLookup', 'true')
            .load(valid_json)
        )

        # Generate unique RunID per batch
        try:
            logger.info("Attempting to get Databricks context...")

            # Get the context JSON (string)
            context_str = dbutils.notebook.entry_point.getDbutils().notebook().getContext().toJson()
            logger.debug(f"Raw context JSON: {context_str}")

            # Parse JSON into a dict
            context = json.loads(context_str)
            tags = context.get("tags", {})

            # Pull jobRunId directly
            run_id = tags.get("jobRunId")
            if run_id:
                logger.info(f"Using jobRunId from tags: {run_id}")
            else:
                logger.warning("jobRunId not found in tags!")

        except Exception as e:
            logger.error(f"Exception retrieving Databricks jobRunId: {e}")
            run_id = None

        # Transform dataframe
        # file_path is the last path segment; CaseNo is that name with a trailing ".json" removed.
        html_df = (
            binary_df
            .withColumn("content_str", decode(col('content'), 'utf-8'))
            .withColumn("file_path", element_at(split(col('path'), '/'), -1))
            .withColumn("State", lit(state))
            .withColumn("CaseNo", regexp_replace(col("file_path"), r"\.json$", ""))
            .withColumn("RunID", lit(run_id))
            .select('RunID', 'CaseNo', 'content_str', 'file_path', 'State')
        )

        record_count = html_df.count()
        if record_count == 0:
            print(f"‚ÑπÔ∏è No data to process for state: {state}")
        else:
            print(f"üìä Found {record_count} records for state: {state}")
            optimized_html_df = html_df.repartition(1)

            # --- Partition processing ---
            def process_partition(partition):
                """Publish each row's raw content and return the partition's audit tuples.

                Args:
                    partition: iterator of rows exposing RunID, CaseNo, content_str,
                        file_path and State.

                Returns:
                    A list of (RunID, CaseNo, Filename, State, PublishingDateTime, Status,
                    Error) tuples: one per message whose delivery succeeded, failed, or
                    that could not be enqueued. Rows with a missing file_path or
                    content_str are skipped and yield no tuple.
                """
                # KafkaException must be imported for the except clause below to resolve.
                from confluent_kafka import Producer, KafkaException
                from datetime import datetime

                success_list = []
                failure_list = []

                producer = Producer(**broadcast_conf.value)
                topic = f'evh-active-pub-{lz_key}-uks-dlrm-01'
                buffer_full_wait_s = 5  # how long to wait for queue space before the single retry

                def utc_timestamp():
                    return datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")

                # --- Closure for callback to capture row-specific variables ---
                def make_delivery_report(case_no, state, run_id):
                    def delivery_report(err, msg):
                        key_str = msg.key().decode('utf-8') if msg.key() else "Unknown"
                        timestamp = utc_timestamp()
                        if err:
                            failure_list.append((run_id, case_no, key_str, state, timestamp, "ERROR", str(err)))
                            logger.error(f"Message delivery failed for {case_no}: {err}")
                        else:
                            success_list.append((run_id, case_no, key_str, state, timestamp, "SUCCESS", ""))
                            logger.info(f"Message delivered successfully for {case_no}")
                    return delivery_report

                for row in partition:
                    if row.file_path is None or row.content_str is None:
                        logger.warning(f"Skipping row with missing file_path/content_str: {row}")
                        continue

                    current_CaseNo = row.CaseNo
                    current_state_row = row.State
                    current_RunID = row.RunID
                    current_file_path = row.file_path

                    delivery_callback = make_delivery_report(current_CaseNo, current_state_row, current_RunID)

                    # --- Payload is the file content itself, as bytes ---
                    if isinstance(row.content_str, str):
                        value = row.content_str.encode('utf-8')
                    elif isinstance(row.content_str, (bytes, bytearray)):
                        value = bytes(row.content_str)
                    else:
                        failure_list.append((current_RunID, current_CaseNo, "Unknown", current_state_row, utc_timestamp(), "ERROR", f"Unsupported type {type(row.content_str)}"))
                        continue

                    produce_args = dict(
                        topic=topic,
                        key=current_file_path.encode('utf-8'),
                        value=value,
                        callback=delivery_callback,
                    )

                    # --- Produce to Kafka ---
                    try:
                        try:
                            producer.produce(**produce_args)
                        except BufferError:
                            # Local queue is full: serve delivery callbacks to free space, then
                            # retry once instead of silently dropping the message.
                            logger.warning("Producer buffer full; waiting for deliveries before retrying.")
                            producer.poll(buffer_full_wait_s)
                            producer.produce(**produce_args)
                    except (BufferError, KafkaException) as e:
                        # The message was never enqueued, so no delivery callback will fire for
                        # it; record the failure here so the audit is complete.
                        logger.error(f"Kafka produce failed (check connectivity!): {e}")
                        failure_list.append((current_RunID, current_CaseNo, current_file_path, current_state_row, utc_timestamp(), "ERROR", str(e)))

                    # Serve delivery callbacks as we go so the local queue keeps draining.
                    producer.poll(0)

                # Flush producer at the end of partition
                try:
                    producer.flush()
                except Exception as e:
                    logger.error(f"Flush error: {e}")

                # Merge results
                return success_list + failure_list

            # Map partitions and collect results
            # mapPartitions is lazy and process_partition sends messages as a side effect, so a
            # DataFrame built directly on the RDD would re-publish on every action below (write,
            # display, count). Collect the audit tuples once and build the DataFrame from them.
            result_rdd = optimized_html_df.rdd.mapPartitions(process_partition)
            result_rows = result_rdd.collect()
            result_df = spark.createDataFrame(result_rows, result_schema)

            # Write results incrementally to Delta
            result_df.write.format("delta") \
                .mode("append") \
                .option("mergeSchema", "true") \
                .save(silver_base_path)

            display(result_df.select("RunID", "CaseNo", "State", "PublishingDateTime", "Status", "Error"))

            # Highlight failures
            failed_df = result_df.filter(col("Status") == "ERROR")
            failed_count = failed_df.count()
            if failed_count > 0:
                logger.error(f"‚ö†Ô∏è Found {failed_count} failed records for state: {state}")
                display(failed_df.select("RunID", "CaseNo", "State", "PublishingDateTime", "Status", "Error"))
            else:
                logger.info(f"‚úÖ No failed records for state: {state}")

            kafka_result_count = len(result_rows)
            logger.info(f"üìä Kafka processing completed: {kafka_result_count} records for state: {state}")
            # Report what was actually delivered rather than the number of input rows.
            logger.info(f"‚úÖ Successfully sent {kafka_result_count - failed_count} records to Kafka for state: {state}")

except Exception as e:
    # Errors are logged (with traceback) but not re-raised, so later cells still run.
    # NOTE(review): this means the job does not fail on a processing error -- confirm intended.
    logger.error(f"‚ùå Error processing state {state}: {e}", exc_info=True)

logger.info(f"üéâ Completed processing for state: {state}")

In [0]:
# Read this state's publish audit table back and append its rows to the cross-state table,
# then expose the same rows as a temp view for the SQL cell that follows.
# NOTE(review): the whole per-state table is appended on every run, so rows written by earlier
# runs look like they are appended again each time -- confirm whether only the current run's
# rows should be copied.
state_audit_path = f"abfss://{silver_curated_container}@{curated_storage_account}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/{state}/publish_audit_db_eh"
combined_audit_path = f"abfss://{silver_curated_container}@{curated_storage_account}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/all_states_combined/publish_audit_db_eh"

publishPayloadResult = spark.read.format("delta").load(state_audit_path)

(
    publishPayloadResult.write
    .format("delta")
    .mode("append")
    .save(combined_audit_path)
)

publishPayloadResult.createOrReplaceTempView("publishPayloadResult")

In [0]:
%sql
-- Show every row of the publishPayloadResult temp view registered by the preceding cell.
SELECT * FROM publishPayloadResult

In [0]:
# Log completion before exiting: dbutils.notebook.exit() ends the notebook run, so a statement
# placed after it never executes.
logger.info(f"üéâ Completed processing for state: {state}")
dbutils.notebook.exit(f"{state} notebook completed successfully")