In [0]:
from confluent_kafka import Producer
import json
from  itertools import islice
import numpy as np
from pyspark.sql.functions import col, decode, split, element_at,udf
import logging
from pyspark.sql.types import StructType, StructField, StringType
from pyspark import SparkContext


## Read in HTML and JSON files

In [0]:
# Load the environment configuration. The first row is fetched once and reused:
# each first() call is a separate Spark action, so reading it per field would
# run the same job twice.
config = spark.read.option("multiline", "true").json("dbfs:/configs/config.json")
config_row = config.first()
if config_row is None:
    # first() returns None for an empty DataFrame; fail with an explicit message
    # instead of an opaque "'NoneType' object is not subscriptable".
    raise ValueError("dbfs:/configs/config.json contains no rows; cannot resolve 'env' and 'lz_key'")

# Normalise both values because they are interpolated into resource names below.
env_name = config_row["env"].strip().lower()
lz_key = config_row["lz_key"].strip().lower()

print(f"env_code: {lz_key}")  # This won't be redacted
print(f"env_name: {env_name}")  # This won't be redacted

# Name of the secret scope used by the dbutils.secrets.get calls in later cells.
KeyVault_name = f"ingest{lz_key}-meta002-{env_name}"
print(f"KeyVault_name: {KeyVault_name}")

In [0]:
# Secret scope for this landing zone / environment (re-derived so the cell is self-contained).
KeyVault_name = f"ingest{lz_key}-meta002-{env_name}"

# Service principal credentials, read from the secret scope.
client_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-ID")
client_secret = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-SECRET")
tenant_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-TENANT-ID")

# Curated storage account and its ABFS endpoint host.
curated_storage = f"ingest{lz_key}curated{env_name}"
_curated_host = f"{curated_storage}.dfs.core.windows.net"

# OAuth (client-credentials) settings for the curated account, applied in order.
_curated_oauth_settings = {
    f"fs.azure.account.auth.type.{_curated_host}": "OAuth",
    f"fs.azure.account.oauth.provider.type.{_curated_host}": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    f"fs.azure.account.oauth2.client.id.{_curated_host}": client_id,
    f"fs.azure.account.oauth2.client.secret.{_curated_host}": client_secret,
    f"fs.azure.account.oauth2.client.endpoint.{_curated_host}": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}
for _setting_name, _setting_value in _curated_oauth_settings.items():
    spark.conf.set(_setting_name, _setting_value)


In [0]:
# Secret that is later passed to the Kafka producer as 'sasl.password'.
# NOTE(review): the producer uses the '$ConnectionString' username, so this secret
# presumably holds the full Event Hubs connection string — confirm against the vault.
# eh_kv_secret = dbutils.secrets.get(scope=KeyVault_name, key="evh-namespace-connection-string")
eh_kv_secret = dbutils.secrets.get(scope=KeyVault_name, key="RootManageSharedAccessKey")

In [0]:

# Appeal category handled by this notebook. Its last three characters are reused
# in the gold path, the topic name and the audit table name.
AppealCategory = "ARIAFPA"
category_code = AppealCategory[-3:]
curated_account_host = f"ingest{lz_key}curated{env_name}.dfs.core.windows.net"

# Widget selecting which file extensions are published in this run.
dbutils.widgets.dropdown(
    name='file_types',
    defaultValue='html,json',
    choices=['html,json', 'a360'],
)
file_types = dbutils.widgets.get('file_types')

# Source folder holding the files to publish.
gold_mount = f"abfss://gold@{curated_account_host}/ARIADM/ARM/APPEALS/ARIA{category_code.upper()}"

# Event Hub (Kafka topic) receiving the messages. Earlier naming variants kept for reference:
# topic = f"evh-td-pub-dev-uks-dlrm-01"
# topic = f"evh-apl-{AppealCategory[-3:].lower()}-pub-dev-uks-dlrm-01"
topic = f"evh-apl{category_code.lower()}-pub-{lz_key}-uks-dlrm-01"
# dropzone_mount = f"/mnt/dropzonearia{{AppealCategory[-3:].lower()}}/TD/"

# Delta location where per-message publish outcomes are appended.
audit_path = (
    f"abfss://silver@{curated_account_host}/ARIADM/ARM/AUDIT/APPEALS/{AppealCategory}"
    f"/apl_{category_code.lower()}_pub_audit_table"
)

display(
    f"Gold Mount: {gold_mount}",
    f"Topic: {topic}",
    # f"Dropzone Mount: {dropzone_mount}",
    f"File Types: {file_types}",
    f"Audit Path: {audit_path}",
)

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, element_at
from pyspark.sql.types import StructType, StructField, StringType
from pyspark import SparkContext



# Kafka endpoint of the Event Hubs namespace for this landing zone / environment.
# eventhubs_hostname = "sbox-dlrm-eventhub-ns.servicebus.windows.net:9093"
eventhubs_hostname = f"ingest{lz_key}-integration-eventHubNamespace001-{env_name}.servicebus.windows.net:9093"

# Producer settings, listed as (option, value) pairs.
# Expected 'sasl.password' shape, for reference:
# "Endpoint=sb://sbox-dlrm-eventhub-ns.servicebus.windows.net/;SharedAccessKeyName=RootManageSharedAccessKey;SharedAccessKey="
conf = dict([
    ('bootstrap.servers', eventhubs_hostname),
    ('security.protocol', 'SASL_SSL'),
    ('sasl.mechanism', 'PLAIN'),
    ('sasl.username', '$ConnectionString'),
    ('sasl.password', eh_kv_secret),
    ('retries', 5),                  # increased retries
    ('enable.idempotence', True),    # enable idempotent producer
])

# Broadcast the settings so process_partition can read them on the executors.
broadcast_conf = sc.broadcast(conf)



In [0]:
# Read the gold files selected by the `file_types` widget (HTML/JSON or A360) and prepare them for publishing
# json_mount = '/mnt/ingest00curatedsboxgold/ARIADM/ARM/ARIATD/'

from pyspark.sql.functions import concat, lit, when, col, upper

# List every file under gold_mount whose extension matches the selected file types.
binary_df = (
    spark.read.format('binaryFile')
    .option('pathGlobFilter', f'*.{{{file_types}}}')
    .option('recursiveFileLookup', 'true')
    .load(gold_mount)
)

# Only the file name (last path segment) is needed downstream. The previous
# version also decoded the whole binary payload into `content_str` and then
# dropped it unused, so that column is no longer built.
html_df = binary_df.select(
    element_at(split(col('path'), '/'), -1).alias('file_path')
)

# `suffix` names the sub-folder used in the blob URL. It stays NULL for any other
# extension, which makes `blob_url` NULL too (concat propagates NULL).
# The suffix literals are already upper-case, so no upper() is applied.
html_df = html_df.withColumn(
    "suffix",
    when(col("file_path").endswith("html"), lit("HTML"))
    .when(col("file_path").endswith("json"), lit("JSON"))
    .when(col("file_path").endswith("a360"), lit("A360"))
).withColumn(
    "blob_url",
    concat(
        lit("https://ingest"),
        lit(lz_key),
        lit("curated"),
        lit(env_name),
        lit(f".blob.core.windows.net/gold/ARIADM/ARM/APPEALS/ARIA{AppealCategory[-3:].upper()}/"),
        col("suffix"), lit("/"),
        col("file_path")
    )
)


html_df.display()

## Send to EventHubs

In [0]:
# Partition count for the publish step (sized to cluster resources).
# process_partition opens one Kafka producer per partition.
num_spark_partitions = 8
optimized_html_df = html_df.repartition(numPartitions=num_spark_partitions)

In [0]:
def process_partition(partition):
    """Publish one Spark partition of rows to the Event Hub topic via Kafka.

    For every row with both `file_path` and `blob_url` set, a message is produced
    with the file name as key and the blob URL as value. Rows missing either
    field are logged and skipped, and produce no audit record.

    Relies on the notebook-level names `broadcast_conf` (broadcast producer
    settings) and `topic`.

    Args:
        partition: iterator of rows exposing `file_path` and `blob_url`.

    Returns:
        list of `(file_name, status, error_message, timestamp)` tuples, successes
        first and then failures. `status` is "success" or "failure", and
        `timestamp` is a UTC string formatted as `%Y-%m-%d %H:%M:%S`.
    """
    import logging
    from confluent_kafka import Producer
    from datetime import datetime

    # Initialize logger
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('KafkaProducer')

    failure_list = []
    success_list = []
    results = []

    # One producer per partition, built from the broadcast settings.
    producer = Producer(**broadcast_conf.value)

    def _utc_now():
        # Timestamp format stored in the audit records.
        return datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")

    def delivery_report(err, msg):
        # Called by the producer (from poll/flush) once per message with the final outcome.
        key_str = msg.key().decode('utf-8') if msg.key() is not None else "Unknown"
        if err is not None:
            err_msg = str(err)
            logger.error(f"Message delivery failed for key {key_str}: {err}")
            failure_list.append((key_str, "failure", err_msg, _utc_now()))
        else:
            success_list.append((key_str, "success", "", _utc_now()))

    def _send(key, value):
        # Queue a single message for delivery.
        producer.produce(
            topic=topic,
            key=key,
            value=value,
            callback=delivery_report
        )

    for row in partition:
        # Check if blob_url and file_path exist
        if row.file_path is None or row.blob_url is None:
            logger.warning(f"Skipping row with missing file_path/blob_url: {row}")
            continue

        try:
            key = row.file_path.encode('utf-8')
            # Send blob_url as the message value
            value = row.blob_url.encode('utf-8')
            try:
                _send(key, value)
            except BufferError:
                # Local queue is full: serve callbacks to free space, then retry
                # once. Previously the message was dropped here without any
                # failure record. A second BufferError falls through to the
                # handler below and is recorded as a failure.
                logger.error("Producer buffer full. Polling for events.")
                producer.poll(1)
                _send(key, value)

            # Serve delivery callbacks as we go so the queue drains during the
            # loop instead of only at flush().
            producer.poll(0)
        except Exception as e:
            logger.error(f"Unexpected error during production: {e}")
            failure_list.append((row.file_path, "failure", str(e), _utc_now()))

    try:
        # Block until every queued message has been delivered or has failed.
        producer.flush()
        logger.info("Producer flushed successfully.")
    except Exception as e:
        logger.error(f"Unexpected error during flush: {e}")

    results.extend(success_list)
    results.extend(failure_list)

    return results  # Return list instead of using yield


# Audit-record schema: one nullable string column per element of the tuples
# returned by process_partition.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("file_name", "status", "error_message", "timestamp")
])

# Publish every partition. collect() runs the job now and returns the outcomes
# to the driver as a plain Python list (despite the `_rdd` suffix).
publish_outcomes_rdd = optimized_html_df.rdd.mapPartitions(process_partition)
result_rdd = publish_outcomes_rdd.collect()

# Build the result DataFrame from the collected list and show it.
result_df = spark.createDataFrame(result_rdd, schema)
display(result_df)



In [0]:
# Show only the messages whose delivery failed.
failed_files = result_df.where(col("status") == "failure")
display(failed_files)


In [0]:
# Append this run's publish outcomes to the Delta audit table.
(
    result_df.write
    .format("delta")
    .mode("append")
    .save(audit_path)
)

In [0]:
# Count the outcomes of this run.
successful_files = result_df.filter(col("status") == "success").count()
failed_files = result_df.filter(col("status") == "failure").count()

# dbutils.notebook.exit returns a single string to the caller, so the summary is
# serialised as JSON; the calling notebook or job can read it with json.loads.
dbutils.notebook.exit(json.dumps({"successful_files": successful_files, "failed_files": failed_files}))

## Appendix

In [0]:
# dbutils.fs.ls("/mnt/dropzoneariatd/ARIATD/submission")

In [0]:
# from pyspark.sql.functions import col, from_unixtime

# files_df = spark.createDataFrame(dbutils.fs.ls(f"{dropzone_mount}submission/"))
# files_df = files_df.withColumn("modificationTime", from_unixtime(col("modificationTime") / 1000).cast("timestamp"))

# display(files_df.orderBy(col("modificationTime").desc()))

In [0]:
# json_count = files_df.filter(col("path").endswith(".json")).count()
# html_count = files_df.filter(col("path").endswith(".html")).count()
# a360_count = files_df.filter(col("path").endswith(".a360")).count()

# display(json_count)
# display(html_count)
# display(a360_count)

In [0]:
# # Read and prepare data HTML files
# t_json_mount = f'{dropzone_mount}response/'
# t_binary_df = spark.read.format('binaryFile') \
#                      .option('pathGlobFilter', '*.rsp') \
#                      .option('recursiveFileLookup', 'true') \
#                      .load(t_json_mount)
 
 
 
# t_html_df = t_binary_df.withColumn("content_str", decode(col('content'), 'utf-8')) \
#                    .withColumn('file_path', element_at(split(col('path'), '/'), -1))
# t_html_df = t_html_df.select('content_str','file_path')
 
# display(t_html_df)

In [0]:
# #To look at response folder
# ##############################################
# # Identify env and KeyVault_name
# config = spark.read.option("multiline", "true").json("dbfs:/configs/config.json")
# env_name = config.first()["env"].strip().lower()
# lz_key = config.first()["lz_key"].strip().lower()
 
# print(f"env_code: {lz_key}")  # This won't be redacted
# print(f"env_name: {env_name}")  # This won't be redacted
 
# KeyVault_name = f"ingest{lz_key}-meta002-{env_name}"
# print(f"KeyVault_name: {KeyVault_name}")
# RECORD_CLASS = 'ARIAFPADEV'
 
# SAS_Token = dbutils.secrets.get(KeyVault_name, f"{RECORD_CLASS}-SAS-TOKEN")
# storage_account_name = "a360c2x2555dz"
# container_name = "dropzone"
# sub_dir = f"{RECORD_CLASS}/submission"
 
# input_path = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/{sub_dir}"
 
# spark.conf.set(
#     f"fs.azure.sas.{container_name}.{storage_account_name}.blob.core.windows.net",
#     SAS_Token
# )
 
# df = spark.read.format("binaryFile").load(input_path)
# # display(df)
 
# from pyspark.sql.functions import col, from_unixtime, to_timestamp, window, avg, expr
 
# files_df = df.withColumn(
#     "modification_time_ts",
#     to_timestamp(col("modificationTime"))
# )
 
# files_df.count()
 
# agg_df = (
#     files_df.groupBy(window(col("modification_time_ts"), "1 minute"))
#             .agg({"path": "count"})
#             .withColumnRenamed("count(path)", "path_count")
#             .orderBy(col("window").desc())
# )
 
# display(agg_df)
 
# # Calculate the average count per minute
# avg_per_min_df = agg_df.select(avg("path_count").alias("avg_per_min"))
# display(avg_per_min_df)
 
# # Calculate the median count per minute
# median_per_min_df = agg_df.select(expr("percentile_approx(path_count, 0.5)").alias("median_per_min"))
# display(median_per_min_df)
 
# df.count()