In [0]:
import json
import numpy as np
import logging
import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, element_at, udf, decode, when, upper, lit, concat
from pyspark import SparkContext
from confluent_kafka import Producer
from itertools import islice
from pyspark.sql.types import StructType, StructField, StringType


## Set up globals and configs

In [0]:
# Load the environment settings from the JSON config on DBFS.
config = spark.read.option("multiline", "true").json("dbfs:/configs/config.json")

# Fetch the first row once: every .first() call is a separate Spark action.
config_row = config.first()
if config_row is None:
    raise ValueError("dbfs:/configs/config.json contains no rows; cannot resolve 'env' and 'lz_key'")

# Normalised (trimmed, lower-case) values used to build resource names below.
env = config_row["env"].strip().lower()
lz_key = config_row["lz_key"].strip().lower()

In [0]:
# Secret scope name, derived from the landing-zone key and environment read from the config.
keyvault_name = f"ingest{lz_key}-meta002-{env}"

In [0]:
# Fetch the Service Principal credentials from the secret scope.
# NOTE: the secret key names use the spelling "PRINCIPLE"; they are kept exactly as stored.
def _get_secret(secret_key):
    """Return the secret stored under `secret_key` in the `keyvault_name` scope."""
    return dbutils.secrets.get(scope=keyvault_name, key=secret_key)


client_secret = _get_secret('SERVICE-PRINCIPLE-CLIENT-SECRET')
tenant_id = _get_secret('SERVICE-PRINCIPLE-TENANT-ID')
client_id = _get_secret('SERVICE-PRINCIPLE-CLIENT-ID')

In [0]:
# Curated storage account name, built from the landing-zone key and environment.
curated_storage_account = f"ingest{lz_key}curated{env}"
# Container names used below to build the gold (input files) and silver (audit output) paths.
curated_container = "gold"
silver_curated_container = "silver"
# Segment path component and its short form (the short form is used in the
# Event Hub topic name and the audit table name).
segment = "APPEALS/ARIAFTA"
segment_short = "fta"

In [0]:
storage_accounts = [curated_storage_account]

# The token endpoint depends only on the tenant, so it is built once up front.
oauth_token_endpoint = f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"

# Register the service principal's OAuth client-credentials settings for every account.
for storage_account in storage_accounts:
    account_host = f"{storage_account}.dfs.core.windows.net"
    configs = {
        f"fs.azure.account.auth.type.{account_host}": "OAuth",
        f"fs.azure.account.oauth.provider.type.{account_host}":
            "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
        f"fs.azure.account.oauth2.client.id.{account_host}": client_id,
        f"fs.azure.account.oauth2.client.secret.{account_host}": client_secret,
        f"fs.azure.account.oauth2.client.endpoint.{account_host}": oauth_token_endpoint,
    }
    for key, val in configs.items():
        spark.conf.set(key, val)

In [0]:
# Sanity check: show the auth type registered for each storage account
# ('MISSING' is printed when the setting was never applied).
for storage_account in storage_accounts:
    key = f"fs.azure.account.auth.type.{storage_account}.dfs.core.windows.net"
    auth_type = spark.conf.get(key, 'MISSING')
    print(f"{key}: {auth_type}")

In [0]:
# Both containers live in the same curated storage account, so share the host part.
curated_dfs_host = f"{curated_storage_account}.dfs.core.windows.net"

# Folder holding the files to publish for this segment.
gold_files_base_path = f"abfss://{curated_container}@{curated_dfs_host}/ARIADM/ARM/{segment}/"

# Root of the silver container (the audit table is written beneath it).
silver_base_path = f"abfss://{silver_curated_container}@{curated_dfs_host}"

## Configure Event Hubs and read in HTML and JSON files

In [0]:
# Secret used as the SASL password for the Event Hubs Kafka endpoint.
eh_kv_secret = dbutils.secrets.get(scope=keyvault_name, key="RootManageSharedAccessKey")

# Kafka endpoint of the Event Hubs namespace for this landing zone / environment.
eventhubs_hostname = f"ingest{lz_key}-integration-eventHubNamespace001-{env}.servicebus.windows.net:9093"

# Connection and authentication settings.
connection_settings = {
    'bootstrap.servers': eventhubs_hostname,
    'security.protocol': 'SASL_SSL',
    'sasl.mechanism': 'PLAIN',
    'sasl.username': '$ConnectionString',
    'sasl.password': eh_kv_secret,
}

# Delivery settings: retry sends and enable the idempotent producer.
delivery_settings = {
    'retries': 5,
    'enable.idempotence': True,
}

conf = {**connection_settings, **delivery_settings}

# Broadcast the settings so process_partition can build a producer from them.
broadcast_conf = sc.broadcast(conf)

In [0]:
# List every HTML/JSON file under the gold segment folder.
binary_df = spark.read.format('binaryFile') \
                     .option('pathGlobFilter', '*.{html,json}') \
                     .option('recursiveFileLookup', 'true') \
                     .load(gold_files_base_path)

# Public blob URL prefix for this segment, built from the same account/container
# names used for the abfss paths so the two cannot drift apart.
blob_base_url = (
    f"https://{curated_storage_account}.blob.core.windows.net/"
    f"{curated_container}/ARIADM/ARM/{segment}/"
)

# Only the file name is needed downstream, so the binary `content` column is never
# selected or decoded. Resulting columns: file_path, suffix, blob_url.
html_df = (
    binary_df
    .select(element_at(split(col('path'), '/'), -1).alias('file_path'))
    .withColumn(
        "suffix",
        # Sub-folder name derived from the extension; null when no branch matches,
        # which also makes blob_url null for that row.
        when(col("file_path").endswith("html"), lit("HTML"))
        .when(col("file_path").endswith("json"), lit("JSON"))
        .when(col("file_path").endswith("a360"), lit("A360"))
    )
    .withColumn(
        "blob_url",
        concat(lit(blob_base_url), col("suffix"), lit("/"), col("file_path"))
    )
)

html_df.display()

## Send to EventHubs

In [0]:
# Repartition based on cluster resources
# NOTE(review): 8 is a fixed value; process_partition creates one producer per
# partition, so this also sets how many producers are created — tune to the cluster.
num_spark_partitions =  8
optimized_html_df = html_df.repartition(num_spark_partitions)

In [0]:
def process_partition(partition):
    """Publish one Spark partition of rows to the Event Hub topic and report the outcome.

    Every row must expose ``file_path`` (used as the message key) and ``blob_url``
    (used as the message value). Rows where either is None are skipped with a
    warning and do not appear in the result.

    Reads the notebook globals ``broadcast_conf``, ``segment_short`` and ``lz_key``.

    Args:
        partition: iterator of rows, as supplied by ``rdd.mapPartitions``.

    Returns:
        list of ``(file_name, status, error_message, timestamp)`` tuples, with
        successes first and failures after. ``status`` is "success" or "failure"
        and ``timestamp`` is UTC formatted as ``%Y-%m-%d %H:%M:%S``.
    """
    # Imports stay inside the function, as in the original, so it is self-contained.
    import logging
    from confluent_kafka import Producer
    from datetime import datetime, timezone

    # Initialize logger
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('KafkaProducer')

    failure_list = []
    success_list = []

    # One producer per partition, built from the broadcast settings.
    producer = Producer(**broadcast_conf.value)
    topic = f'evh-{segment_short}-pub-{lz_key}-uks-dlrm-01'

    # How many times a message is re-submitted when the local producer queue is full.
    max_buffer_retries = 5

    def utc_timestamp():
        # Same output format as the previous datetime.utcnow() call, without the deprecated API.
        return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")

    def delivery_report(err, msg):
        # Invoked from poll()/flush() once per message with its final delivery outcome.
        key_str = msg.key().decode('utf-8') if msg.key() is not None else "Unknown"
        if err is not None:
            logger.error(f"Message delivery failed for key {key_str}: {err}")
            failure_list.append((key_str, "failure", str(err), utc_timestamp()))
        else:
            success_list.append((key_str, "success", "", utc_timestamp()))

    def produce_with_backpressure(key, value):
        # A full local queue is temporary: serve delivery events to free space and
        # re-submit the SAME message. Previously the message was dropped here
        # without being recorded as either success or failure.
        for attempt in range(max_buffer_retries + 1):
            try:
                producer.produce(topic=topic, key=key, value=value, callback=delivery_report)
                return
            except BufferError:
                if attempt == max_buffer_retries:
                    raise  # recorded as a failure by the caller
                logger.warning("Producer buffer full. Polling for events before retrying.")
                producer.poll(1)

    for row in partition:
        if row.file_path is None or row.blob_url is None:
            logger.warning(f"Skipping row with missing file_path/blob_url: {row}")
            continue

        try:
            produce_with_backpressure(
                row.file_path.encode('utf-8'),
                row.blob_url.encode('utf-8'),
            )
            # Serve delivery callbacks as we go instead of deferring them all to flush().
            producer.poll(0)
        except Exception as e:
            logger.error(f"Unexpected error during production: {e}")
            failure_list.append((row.file_path, "failure", str(e), utc_timestamp()))

    try:
        # flush() returns the number of messages still queued.
        remaining = producer.flush()
        if remaining:
            logger.error(f"{remaining} message(s) still undelivered after flush.")
        else:
            logger.info("Producer flushed successfully.")
    except Exception as e:
        logger.error(f"Unexpected error during flush: {e}")

    # Return a list (not a generator) so all callbacks have fired before results are read.
    return success_list + failure_list

# All four audit columns are nullable strings, in the order process_partition emits them.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("file_name", "status", "error_message", "timestamp")
])

# Publish each partition and pull the per-message outcomes back to the driver.
# Note: despite its name, `result_rdd` is a plain Python list, because of .collect().
partition_results = optimized_html_df.rdd.mapPartitions(process_partition)
result_rdd = partition_results.collect()

# Turn the outcomes into a DataFrame and show them for verification.
result_df = spark.createDataFrame(result_rdd, schema)
display(result_df)

In [0]:
# Show the messages that could not be published, followed by how many there were.
is_failure = col("status") == "failure"
failed_files = result_df.where(is_failure)

display(failed_files)
failed_files.count()

In [0]:
# Append this run's publish outcomes to the segment's Delta audit table.
audit_table_path = f"{silver_base_path}/ARIADM/ARM/AUDIT/{segment}/{segment_short}_pub_audit_table"
result_df.write.format("delta").mode("append").save(audit_table_path)

In [0]:
# End the notebook run with a fixed exit value.
# NOTE(review): this message is returned even when failed_files is non-empty — confirm
# that callers do not treat it as "all files published".
dbutils.notebook.exit("Notebook completed successfully")