In [0]:
from confluent_kafka import Producer
import json
from  itertools import islice
import numpy as np
from pyspark.sql.functions import col, decode, split, element_at, udf, lit, reduce, from_json, to_timestamp
import logging
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
import datetime
from pyspark.sql import SparkSession, DataFrame
from pyspark import SparkContext
import os
from functools import reduce
import time

In [0]:
## Load the environment configuration
# The JSON config is read into a DataFrame and its first row is fetched once,
# then reused for both settings, so only a single Spark action is triggered
# (previously `config.first()` was evaluated twice).
config = spark.read.option("multiline", "true").json("dbfs:/configs/config.json")
config_row = config.first()
env = config_row["env"].strip().lower()
lz_key = config_row["lz_key"].strip().lower()

# Name used as the secret scope when secrets are looked up in later cells.
keyvault_name = f"ingest{lz_key}-meta002-{env}"

In [0]:
# Fetch the service principal credentials from the secret scope.
# The keys are listed in the same order as the variables they are unpacked into.
_SP_SECRET_KEYS = (
    'SERVICE-PRINCIPLE-CLIENT-SECRET',
    'SERVICE-PRINCIPLE-TENANT-ID',
    'SERVICE-PRINCIPLE-CLIENT-ID',
)
client_secret, tenant_id, client_id = (
    dbutils.secrets.get(scope=keyvault_name, key=secret_key)
    for secret_key in _SP_SECRET_KEYS
)

In [0]:
## Parameterise the curated storage account and its containers
curated_container, silver_curated_container = "gold", "silver"
curated_storage_account = f"ingest{lz_key}curated{env}"

In [0]:
curated_storage_account = f"ingest{lz_key}curated{env}"
checkpoint_storage_account = f"ingest{lz_key}xcutting{env}"

## Configure OAuth (client credentials) for the curated and checkpoint
## storage accounts
storage_accounts = [curated_storage_account, checkpoint_storage_account]

# These two values are the same for every storage account, so build them once.
token_provider_class = "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"
token_endpoint = f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"

for storage_account in storage_accounts:
    # Per-account Spark settings, applied in the order they are listed.
    configs = dict([
        (f"fs.azure.account.auth.type.{storage_account}.dfs.core.windows.net", "OAuth"),
        (f"fs.azure.account.oauth.provider.type.{storage_account}.dfs.core.windows.net", token_provider_class),
        (f"fs.azure.account.oauth2.client.id.{storage_account}.dfs.core.windows.net", client_id),
        (f"fs.azure.account.oauth2.client.secret.{storage_account}.dfs.core.windows.net", client_secret),
        (f"fs.azure.account.oauth2.client.endpoint.{storage_account}.dfs.core.windows.net", token_endpoint),
    ])
    for key, val in configs.items():
        spark.conf.set(key, val)

In [0]:
# Sanity check: show the auth type registered for each storage account.
# 'MISSING' is printed when the setting has no value.
for storage_account in storage_accounts:
    key = f"fs.azure.account.auth.type.{storage_account}.dfs.core.windows.net"
    auth_type = spark.conf.get(key, 'MISSING')
    print(f"{key}: {auth_type}")

In [0]:
# The state to process is supplied through the "state" widget
# (default: "paymentPending").
dbutils.widgets.text("state", "paymentPending", "State to Process")
state = dbutils.widgets.get("state")
print(f"🔄 Processing state: {state}")

# Event Hub namespace and hub name for this environment / landing zone.
EH_NAMESPACE = f"ingest{lz_key}-integration-eventHubNamespace001-{env}"
EH_NAME = f"evh-active-pub-{env}-{lz_key}-uks-dlrm-01"  # Event Hub to be created in the UI

connection_string = dbutils.secrets.get(scope=keyvault_name, key="RootManageSharedAccessKey")

# The Event Hub is read through its Kafka endpoint using SASL PLAIN over SSL,
# with the connection string as the password.
bootstrap_servers = f"{EH_NAMESPACE}.servicebus.windows.net:9093"
jaas_config = f'kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username="$ConnectionString" password="{connection_string}";'

# NOTE(review): "consumer.group.id" does not appear to be an option the Spark
# Kafka source recognises (the documented option is "kafka.group.id") — confirm
# whether a per-state consumer group is actually being applied.
KAFKA_OPTIONS = {
    "kafka.bootstrap.servers": bootstrap_servers,
    "subscribe": EH_NAME,
    "consumer.group.id": state,
    "kafka.security.protocol": "SASL_SSL",
    "failOnDataLoss": "false",
    "startingOffsets": "latest",
    "kafka.sasl.mechanism": "PLAIN",
    "kafka.sasl.jaas.config": jaas_config,
}

In [0]:
# Paths specific to this state.
# Both paths are keyed on `state`, the value read from the "state" widget.
# (`current_state` is not defined anywhere in this notebook and raised a
# NameError on a fresh run.)
data_path = f"abfss://silver@ingest{lz_key}curated{env}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/{state}/publish_audit_db_eh/"
checkpoint_path = f"abfss://db-ack-checkpoint@ingest{lz_key}xcutting{env}.dfs.core.windows.net/{state}/ACK/"

print(f"📂 Data path: {data_path}")
print(f"📂 Checkpoint path: {checkpoint_path}")

# Keep schema exactly as it exists in Pub notebook.
# Every field is a nullable string, so the schema is built from the ordered
# list of field names.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "RunID",
        "CaseNo",
        "Filename",
        "State",
        "PublishingDateTime",
        "Status",
        "Error",
    )
])

# Open a streaming read on the Event Hub through the Kafka source.
kafka_reader = spark.readStream.format("kafka").options(**KAFKA_OPTIONS)
eventhubdf = kafka_reader.load()

# Parse each Kafka message: cast the value to a string, decode it as JSON
# using the schema above, then flatten the struct into top-level columns.
json_str_col = col("value").cast("string").alias("json_str")
json_obj_col = from_json(col("json_str"), schema).alias("json_obj")
parsed_df = eventhubdf.select(json_str_col).select(json_obj_col).select("json_obj.*")

# Stream the parsed rows into the Delta table at data_path (append mode),
# checkpointing to checkpoint_path.
delta_writer = parsed_df.writeStream.format("delta")
delta_writer = delta_writer.option("checkpointLocation", checkpoint_path)
delta_writer = delta_writer.outputMode("append")
query = delta_writer.start(data_path)

# Block for at most 15 seconds, then stop the streaming query.
query.awaitTermination(15)
query.stop()

# Read back the Delta table and keep only rows that have a Status.
audit_table = spark.read.format("delta").load(data_path)
df = audit_table.filter(col("Status").isNotNull())

display(df)

# Total number of rows per State value.
rows_per_state = df.groupBy("State").count()
display(rows_per_state)

In [0]:
# Report completion to the caller. `state` is the value read from the "state"
# widget; `current_state` is not defined in this notebook and raised a NameError.
dbutils.notebook.exit(f"{state} notebook completed successfully")