In [0]:
%pip install confluent-kafka

In [0]:
from confluent_kafka import Producer
import json
from  itertools import islice
import numpy as np
from pyspark.sql.functions import col, decode, split, element_at, udf, lit, reduce
import logging
from pyspark.sql.types import StructType, StructField, StringType
from pyspark import SparkContext
import datetime
from pyspark.sql import SparkSession, DataFrame
from pyspark import SparkContext
import os
from functools import reduce

In [0]:
## Load environment settings from the shared config file
config = spark.read.option("multiline", "true").json("dbfs:/configs/config.json")

# Fetch the first record once: every .first() call launches its own Spark job,
# so reading it per field would scan the config file repeatedly.
config_row = config.first()
if config_row is None:
    raise ValueError("dbfs:/configs/config.json contains no records")

env = config_row["env"].strip().lower()
lz_key = config_row["lz_key"].strip().lower()

# Name of the secret scope used by the cells below
keyvault_name = f"ingest{lz_key}-meta002-{env}"

In [0]:
# Fetch the service principal credentials from the Key Vault-backed secret scope.
# The three lookups run in the order listed and are unpacked positionally.
client_secret, tenant_id, client_id = (
    dbutils.secrets.get(scope=keyvault_name, key=secret_key)
    for secret_key in (
        'SERVICE-PRINCIPLE-CLIENT-SECRET',
        'SERVICE-PRINCIPLE-TENANT-ID',
        'SERVICE-PRINCIPLE-CLIENT-ID',
    )
)

In [0]:
## Parameterise the storage account and container names
curated_container, silver_curated_container = "gold", "silver"
curated_storage_account = f"ingest{lz_key}curated{env}"

In [0]:
## Register OAuth (client-credentials) settings for the curated storage account
storage_accounts = [curated_storage_account]

for storage_account in storage_accounts:
    # Every setting is scoped to the account's ADLS Gen2 (dfs) endpoint
    account_host = f"{storage_account}.dfs.core.windows.net"
    token_endpoint = f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
    provider_class = "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"

    configs = {
        f"fs.azure.account.auth.type.{account_host}": "OAuth",
        f"fs.azure.account.oauth.provider.type.{account_host}": provider_class,
        f"fs.azure.account.oauth2.client.id.{account_host}": client_id,
        f"fs.azure.account.oauth2.client.secret.{account_host}": client_secret,
        f"fs.azure.account.oauth2.client.endpoint.{account_host}": token_endpoint,
    }

    # Apply the settings to the active Spark session
    for key, val in configs.items():
        spark.conf.set(key, val)

In [0]:
# Echo the auth type registered for each storage account as a sanity check;
# 'MISSING' is shown when the setting was never applied.
for storage_account in storage_accounts:
    key = f"fs.azure.account.auth.type.{storage_account}.dfs.core.windows.net"
    auth_type = spark.conf.get(key, 'MISSING')
    print(f"{key}: {auth_type}")

## ***Read in HTML and JSON files***

In [0]:
# Open question: will one single Event Hub be used for all processing?

# Kafka-protocol endpoint of the Event Hub namespace
eventhubs_hostname = f"ingest{lz_key}-integration-eventHubNamespace001-{env}.servicebus.windows.net:9093"

# Secret used below as the SASL password.
# NOTE(review): the key name suggests a namespace-wide root policy; confirm a
# narrower send-only policy is not available.
eh_kv_secret = dbutils.secrets.get(scope=keyvault_name, key="RootManageSharedAccessKey")

# Connection and authentication settings
connection_settings = {
    'bootstrap.servers': eventhubs_hostname,
    'security.protocol': 'SASL_SSL',
    'sasl.mechanism': 'PLAIN',
    'sasl.username': '$ConnectionString',
    'sasl.password': eh_kv_secret,
}

# Delivery settings.
# NOTE(review): the original comment called retries=5 "increased"; recent
# librdkafka versions default to a much larger value — confirm the cap is intended.
delivery_settings = {
    'retries': 5,
    'enable.idempotence': True,
}

# Event Hub configurations, merged in the same key order as before
conf = {**connection_settings, **delivery_settings}

# Broadcast so that code running on the executors can read the producer config
broadcast_conf = sc.broadcast(conf)

In [0]:
## States whose gold output folders are scanned for files to publish
states = [
    "paymentPending", 
    "appealSubmitted", 
    "awaitingRespondentEvidence(a)", 
    "awaitingRespondentEvidence(b)", 
    "caseUnderReview", 
    "reasonForAppealSubmitted", 
    "listing",
    "PrepareForHearing",
    "Decision",
    "FTPA Submitted (a)",
    "FTPA Submitted (b)",
    "Decided (b)",
    "Decided (a)",
    "FTPA Decided",
    "Ended",
    "Remitted"
]

valid_json_files = []
invalid_json_files = []
all_html_dfs = []

## Loop through each state and assign file_path to reference JSON files from
for state in states:
    gold_files_base_path = f"abfss://{curated_container}@{curated_storage_account}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/{state}/"

    try:
        # Takes the LAST entry of the listing as the newest run.
        # NOTE(review): this relies on the order dbutils.fs.ls returns entries in;
        # confirm the run folders are named so that the last entry is the newest.
        files = dbutils.fs.ls(gold_files_base_path)[-1]
        
        valid_json = files.path + "JSON/" # Return only the path. Access only valid JSON files
        valid_json_files.append(valid_json)

        ## Remove once valid_json has outputs from fresh DLT run
        invalid_json = files.path + "INVALID_JSON/"
        try:
            # Listing is used purely as an existence check for the directory
            dbutils.fs.ls(invalid_json)
        except Exception:
            print(f"No INVALID_JSON directory found for state: {state}")
            continue
        # Record the directory that is actually read (the list was previously never filled)
        invalid_json_files.append(invalid_json)
    
        # Load binary data
        binary_df = (
            spark.read.format('binaryFile')
            .option('pathGlobFilter', '*.{html,json}')
            .option('recursiveFileLookup', 'true')
            .load(invalid_json)
        )
        
        # Decode the file bytes, keep only the file name, and tag each row with its state
        html_df = (
            binary_df
            .withColumn("content_str", decode(col('content'), 'utf-8'))
            .withColumn("file_path", element_at(split(col('path'), '/'), -1))
            .withColumn("state", lit(state))
            .select('content_str', 'file_path', 'state')
        )
        
        # Check if we have data to process
        record_count = html_df.count()
        if record_count == 0:
            print(f"No data to process for state: {state}")
            continue
            
        print(f"Found {record_count} records for state: {state}")
        all_html_dfs.append(html_df)
    except Exception as e:
        # Best-effort: a failure for one state must not stop the remaining states
        print(f"Error processing state {state}: {e}")

display([{'path': v} for v in valid_json_files])

# Repartition for parallelism
# num_spark_partitions =  16
# optimized_html_df = combined_html_df.repartition(num_spark_partitions, col("state"))
num_spark_partitions =  1

if all_html_dfs:
    ## Union all DataFrames produced together by columns
    combined_html_df = reduce(DataFrame.unionByName, all_html_dfs)
    display(combined_html_df)
    optimized_html_df = combined_html_df.repartition(num_spark_partitions)
else:
    # Nothing was loaded: define the names explicitly rather than failing with a
    # NameError on the repartition step.
    combined_html_df = None
    optimized_html_df = None
    print("No files found for any state; nothing to combine.")

In [0]:
# ##List the states
# states = [
#     "paymentPending", 
#     "appealSubmitted", 
#     "awaitingRespondentEvidence(a)", 
#     "awaitingRespondentEvidence(b)", 
#     "caseUnderReview", 
#     "reasonForAppealSubmitted", 
#     "listing",
#     "PrepareForHearing",
#     "Decision",
#     "FTPA Submitted (a)",
#     "FTPA Submitted (b)",
#     "Decided (b)",
#     "Decided (a)",
#     "FTPA Decided",
#     "Ended",
#     "Remitted"
# ]

# valid_json_files = []
# invalid_json_files = []
# all_html_dfs = []

# ##Loop through each state and assign file_path to reference JSON files from
# for state in states:
#     gold_files_base_path = f"abfss://{curated_container}@{curated_storage_account}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/{state}/"

#     # Add the if state to avoid errors until other states are populated
#     if state == "paymentPending":
#             files = dbutils.fs.ls(gold_files_base_path)[-1] #Index on newest file
            
#             valid_json = files.path + "JSON/" #Return only the path. Access only valid JSON files
#             valid_json_files.append(valid_json)

#             ## Remove once valid_json has outputs from fresh DLT run
#             invalid_json = files.path + "INVALID_JSON/"
#             invalid_json_files.append(invalid_json)

#             binary_df = (
#             spark.read.format('binaryFile')
#             .option('pathGlobFilter', '*.{html,json}')
#             .option('recursiveFileLookup', 'true')
#             .load(invalid_json)
#             )

#             html_df = (
#             binary_df
#             .withColumn("content_str", decode(col('content'), 'utf-8'))
#             .withColumn("file_path", element_at(split(col('path'), '/'), -1))
#             .withColumn("state", lit(state)) 
#             .select('content_str', 'file_path', 'state')
#             )

#             all_html_dfs.append(html_df)

# display([{'path': v} for v in valid_json_files])

# if all_html_dfs:
#     ##Union all DataFrames produced together by columns
#     combined_html_df = reduce(DataFrame.unionByName, all_html_dfs)
#     display(combined_html_df)

# # Repartition for parallelism
# num_spark_partitions =  16
# optimized_html_df = combined_html_df.repartition(num_spark_partitions, col("state"))

In [0]:
def process_partition(partition):
    """Publish every row of a partition to the Event Hub Kafka topic.

    Intended for use with ``rdd.mapPartitions``. One producer is created per
    call from ``broadcast_conf.value``; each row is sent with its ``file_path``
    as the message key and its ``content_str`` as the message value.

    Args:
        partition: iterable of rows exposing ``file_path``, ``content_str``
            and ``state`` attributes.

    Returns:
        list of ``(file_name, state, status, error_message, timestamp)``
        tuples, successes first and then failures. ``status`` is ``"success"``
        or ``"failure"``; ``timestamp`` is a UTC ``"%Y-%m-%d %H:%M:%S"``
        string. Rows whose ``file_path`` or ``content_str`` is ``None`` are
        skipped with a warning and produce no result tuple.
    """
    import logging
    from confluent_kafka import Producer
    from datetime import datetime, timezone

    # Initialize logger
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('KafkaProducer')

    topic = f'evh-active-pub-{env}-{lz_key}-uks-dlrm-01'
    failure_list = []
    success_list = []
    results = []

    def utc_timestamp():
        # datetime.utcnow() is deprecated; this yields the same formatted string.
        return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")

    def make_delivery_report(state):
        # Delivery callbacks run later (during poll/flush), so the state must be
        # bound per message. Closing over the loop variable directly would make
        # every callback report the state of the last row processed.
        def delivery_report(err, msg):
            key_str = msg.key().decode('utf-8') if msg.key() is not None else "Unknown"
            timestamp = utc_timestamp()

            if err is not None:
                err_msg = str(err)
                logger.error(f"Message delivery failed for key {key_str}: {err}")
                failure_list.append((key_str, state, "failure", err_msg, timestamp))
            else:
                success_list.append((key_str, state, "success", "", timestamp))

        return delivery_report

    # Initialize producer
    producer = Producer(**broadcast_conf.value)

    for row in partition:
        if row.file_path is None or row.content_str is None:
            logger.warning(f"Skipping row with missing file_path/content_str: {row}")
            continue

        current_state = row.state
        current_file_path = row.file_path
        delivery_report = make_delivery_report(current_state)

        try:
            # Normalise content_str to bytes for the message value
            if isinstance(row.content_str, str):
                value = row.content_str.encode('utf-8')
            elif isinstance(row.content_str, bytearray):
                value = bytes(row.content_str)
            elif isinstance(row.content_str, bytes):
                value = row.content_str
            else:
                logger.error(f"Unsupported type for content_str: {type(row.content_str)}")
                failure_list.append((current_file_path, current_state, "failure", "Unsupported content type", utc_timestamp()))
                continue

            # Produce message to Kafka
            producer.produce(
                topic=topic,
                key=current_file_path.encode('utf-8'),
                value=value,
                callback=delivery_report
            )

        except BufferError:
            logger.error("Producer buffer full. Polling for events.")
            # Give the producer up to one second to drain before retrying once
            producer.poll(1)
            try:
                producer.produce(
                    topic=topic,
                    key=current_file_path.encode('utf-8'),
                    value=value,
                    callback=delivery_report
                )
            except Exception as retry_e:
                logger.error(f"Failed to produce message after buffer retry: {retry_e}")
                failure_list.append((current_file_path, current_state, "failure", f"Buffer error retry failed: {str(retry_e)}", utc_timestamp()))

        except Exception as e:
            logger.error(f"Unexpected error during production: {e}")
            failure_list.append((current_file_path, current_state, "failure", str(e), utc_timestamp()))

        # Serve delivery callbacks as we go so completed messages leave the
        # local queue instead of accumulating until the final flush.
        producer.poll(0)

    # Flush producer and handle any remaining messages
    try:
        producer.flush()
        logger.info("Producer flushed successfully.")
    except Exception as e:
        logger.error(f"Unexpected error during flush: {e}")

    # Combine all results
    results.extend(success_list)
    results.extend(failure_list)

    return results

# Schema of the publish audit records: five nullable string columns, in the
# same order as the tuples returned by process_partition.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("file_name", "state", "status", "error_message", "timestamp")
])

all_results = []

for state_df in all_html_dfs:
    ## Pick up state value (name)
    current_state = state_df.select("state").first()["state"]
    print(f"Starting processing for state: {current_state}")

    # Run the publish exactly once and materialise the audit rows on the driver.
    # mapPartitions is lazy and process_partition has a side effect (it sends
    # messages), so keeping the lazy lineage would re-send every message on each
    # later action (count, display, filter, write). The audit rows are small
    # tuples of strings, so collecting them is cheap.
    result_rows = state_df.rdd.mapPartitions(process_partition).collect()
    result_df = spark.createDataFrame(result_rows, schema)

    count = len(result_rows)
    print(f"Completed processing {count} records for state: {current_state}")

    display(result_df)
    all_results.append(result_df)

# Combine final results
if all_results:
    final_results_df = reduce(DataFrame.unionByName, all_results)
    display(final_results_df)
else:
    # Keep final_results_df defined (empty, same schema) so later cells that
    # filter or write it do not fail with a NameError.
    final_results_df = spark.createDataFrame([], schema)
    print("No results generated for any state.")

In [0]:
## Show the audit records whose publish attempt failed

failure_condition = col("status") == "failure"
failed_files = final_results_df.where(failure_condition)

display(failed_files)
failed_files.count()

In [0]:
## All states are merged in one DataFrame; filter per state and append each state separately

final_results_df = final_results_df.coalesce(4)

# Persist once for the whole loop: every per-state filter + write is a separate
# action and would otherwise recompute the full upstream lineage each time.
# NOTE(review): persist is best-effort (evicted partitions are recomputed), so
# the upstream result should itself be free of side effects.
final_results_df.persist()
try:
    for state in states:
        silver_base_path = f"abfss://{silver_curated_container}@{curated_storage_account}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/{state}/publish_audit_db_eh"

        (final_results_df
            .filter(col("state") == state)
            .write
            .format("delta")
            .mode("append")
            .save(silver_base_path)
        )
finally:
    # Release the cached data even if one of the writes fails
    final_results_df.unpersist()