# ARM Acknowledgment 


**Autoloader set up**  
This notebook sets up an Autoloader job, run on a manual trigger, that collects acknowledgment (ack) messages from the ack Event Hub (read through its Kafka endpoint) and writes them to Delta.


In [0]:
import json
import time
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, LongType
from pyspark.sql.functions import col,from_json
import json

In [0]:
# Schema of one ack message: every field is read as a nullable string.
ack_schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ("file_name", "state", "status", "error_message", "timestamp")
    ]
)

## Set up configs

In [0]:
# Load the environment settings from the JSON config file on DBFS.
config = spark.read.option("multiline", "true").json("dbfs:/configs/config.json")

# Fetch the first row once: each call to first() runs its own Spark job,
# and first() returns None when the DataFrame has no rows.
config_row = config.first()
if config_row is None:
    raise ValueError(
        "dbfs:/configs/config.json contains no rows; cannot resolve 'env' and 'lz_key'"
    )

# Normalised (trimmed, lower-cased) values interpolated into resource names.
env = config_row["env"].strip().lower()
lz_key = config_row["lz_key"].strip().lower()

# Name passed as the secret scope when secrets are read later in the notebook.
keyvault_name = f"ingest{lz_key}-meta002-{env}"

In [0]:
# Read the service principal credentials from the secret scope, in the order
# client secret, tenant id, client id.
client_secret, tenant_id, client_id = (
    dbutils.secrets.get(scope=keyvault_name, key=secret_key)
    for secret_key in (
        "SERVICE-PRINCIPLE-CLIENT-SECRET",
        "SERVICE-PRINCIPLE-TENANT-ID",
        "SERVICE-PRINCIPLE-CLIENT-ID",
    )
)

In [0]:
# Connection string used as the SASL password when connecting to the Event Hub.
connection_string = dbutils.secrets.get(keyvault_name, "RootManageSharedAccessKey")

# Event Hub (Kafka topic) the ack messages are read from.
# NOTE: this Event Hub is to be created in the UI.
EH_NAME = f"evh-active-pub-{env}-{lz_key}-uks-dlrm-01"

# Event Hubs namespace that provides the Kafka bootstrap endpoint.
EH_NAMESPACE = f"ingest{lz_key}-integration-eventHubNamespace001-{env}"

In [0]:

# Kafka-source options for the ack Event Hub, grouped by purpose.
# NOTE(review): the streaming cell further down assigns KAFKA_OPTIONS again
# before any stream is started - confirm whether this definition is still needed.
KAFKA_OPTIONS = {
    # Endpoint and topic
    "kafka.bootstrap.servers": f"{EH_NAMESPACE}.servicebus.windows.net:9093",
    "subscribe": EH_NAME,
    # Authentication: SASL/PLAIN over TLS, with the connection string as password
    "kafka.security.protocol": "SASL_SSL",
    "kafka.sasl.mechanism": "PLAIN",
    "kafka.sasl.jaas.config": f'kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username="$ConnectionString" password="{connection_string}";',
    # Consumption behaviour
    # NOTE(review): the streaming cell uses "kafka.group.id"; confirm that
    # "consumer.group.id" is actually honoured by the Kafka source.
    "consumer.group.id": "active",
    "startingOffsets": "latest",  # alternative considered: "earliest"
    "failOnDataLoss": "false",
}

In [0]:
# Storage accounts touched by this notebook: one for the data, one for checkpoints.
curated_storage_account = f"ingest{lz_key}curated{env}"
checkpoint_storage_account = f"ingest{lz_key}xcutting{env}"

# Configure OAuth (service principal client credentials) for BOTH accounts.
storage_accounts = [curated_storage_account, checkpoint_storage_account]

for storage_account in storage_accounts:
    # Every setting key ends with the account's ADLS Gen2 (dfs) host name.
    host = f"{storage_account}.dfs.core.windows.net"
    configs = {
        f"fs.azure.account.auth.type.{host}": "OAuth",
        f"fs.azure.account.oauth.provider.type.{host}": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
        f"fs.azure.account.oauth2.client.id.{host}": client_id,
        f"fs.azure.account.oauth2.client.secret.{host}": client_secret,
        f"fs.azure.account.oauth2.client.endpoint.{host}": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
    }
    # Apply the settings to the current Spark session.
    for conf_key, conf_value in configs.items():
        spark.conf.set(conf_key, conf_value)

In [0]:
# NOTE: this whole cell is commented-out code; it is kept for reference only and does not run.
# from pyspark.sql.functions import col, from_json
# import time

# states = [
#     "paymentPending", 
#     "appealSubmitted", 
#     "awaitingRespondentEvidence(a)", 
#     "awaitingRespondentEvidence(b)", 
#     "caseUnderReview", 
#     "reasonForAppealSubmitted", 
#     "listing",
#     "PrepareForHearing",
#     "Decision",
#     "FTPA Submitted (a)",
#     "FTPA Submitted (b)",
#     "Decided (b)",
#     "Decided (a)",
#     "FTPA Decided",
#     "Ended",
#     "Remitted"
# ]

# ## Map each state to a Kafka partition (if needed downstream)
# state_partition_map = {state: i for i, state in enumerate(states)}

# for state in states:
#     print(f"Starting stream for state: {state}")
#     try:
#         data_path = f"abfss://silver@ingest{lz_key}curated{env}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/{state}/publish_audit_db_eh/"
#         checkpoint_path = f"abfss://db-ack-checkpoint@ingest{lz_key}xcutting{env}.dfs.core.windows.net/{state}/ACK/"

#         KAFKA_OPTIONS = {
#             "kafka.bootstrap.servers": f"{EH_NAMESPACE}.servicebus.windows.net:9093",
#             "subscribe": EH_NAME,
#             "kafka.group.id": state,   # consumer group tied to this state
#             "kafka.security.protocol": "SASL_SSL",
#             "failOnDataLoss": "false",
#             "startingOffsets": "latest",
#             "kafka.sasl.mechanism": "PLAIN",
#             "kafka.sasl.jaas.config": f'kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username="$ConnectionString" password="{connection_string}";'
#         }

#         # Start Kafka job
#         eventhubdf = spark.readStream.format("kafka")\
#             .options(**KAFKA_OPTIONS)\
#             .load()

#         parsed_df = (
#             eventhubdf
#             .select(col("value").cast("string").alias("json_str"))
#             .select(from_json(col("json_str"), ack_schema).alias("json_obj"))
#             .select("json_obj.*")
#         )

#         query = parsed_df.writeStream \
#             .format("delta") \
#             .option("checkpointLocation", checkpoint_path) \
#             .outputMode("append") \
#             .start(data_path)

#         # Wait some time for this state to ingest data
#         time.sleep(30)
#         query.stop()

#         df = spark.read.format("delta").load(data_path)
#         print(f"Completed writing for {state}, total rows: {df.count()}")
#         display(df)

#     except Exception as e:
#         print(f"Skipping state {state} due to error: {str(e)}")
#         continue


In [0]:
from pyspark.sql.functions import col, from_json
import time

# Appeal states; each one has its own data path and checkpoint path.
states = [
    "paymentPending",
    "appealSubmitted",
    "awaitingRespondentEvidence(a)",
    "awaitingRespondentEvidence(b)",
    "caseUnderReview",
    "reasonForAppealSubmitted",
    "listing",
    "PrepareForHearing",
    "Decision",
    "FTPA Submitted (a)",
    "FTPA Submitted (b)",
    "Decided (b)",
    "Decided (a)",
    "FTPA Decided",
    "Ended",
    "Remitted",
]

# Only these states are streamed; every other entry in `states` is skipped.
# NOTE(review): the parsed stream is not filtered by state, so adding more
# states here would write the same Event Hub messages to every state's path.
# Confirm the intended behaviour before widening this.
STATES_TO_INGEST = ("paymentPending",)

# Seconds each stream is left running before it is stopped.
STREAM_RUN_SECONDS = 30


def build_kafka_options(group_id):
    """Build the Spark Kafka-source options for reading the ack Event Hub.

    Args:
        group_id: Value passed as ``kafka.group.id``. The state name is used
            so that each state reads with its own consumer group.

    Returns:
        dict mapping option name to value, suitable for ``.options(**opts)``.
    """
    return {
        "kafka.bootstrap.servers": f"{EH_NAMESPACE}.servicebus.windows.net:9093",
        "subscribe": EH_NAME,
        "kafka.group.id": group_id,
        "kafka.security.protocol": "SASL_SSL",
        "failOnDataLoss": "false",
        "startingOffsets": "latest",
        "kafka.sasl.mechanism": "PLAIN",
        "kafka.sasl.jaas.config": f'kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username="$ConnectionString" password="{connection_string}";',
    }


# Paths used during this run, collected for later inspection.
state_paths = []
checkpoint_paths = []

for state in states:
    if state not in STATES_TO_INGEST:
        continue

    data_path = f"abfss://silver@ingest{lz_key}curated{env}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/{state}/publish_audit_db_eh/"
    checkpoint_path = f"abfss://db-ack-checkpoint@ingest{lz_key}xcutting{env}.dfs.core.windows.net/{state}/ACK/"

    state_paths.append(data_path)
    checkpoint_paths.append(checkpoint_path)

    # Built inside the loop because the options depend on `state`, which is
    # not defined before the loop starts (building the dict above the loop
    # raised NameError on a fresh kernel).
    KAFKA_OPTIONS = build_kafka_options(state)

    # Start the Kafka source that reads rows from the Event Hub.
    eventhubdf = (
        spark.readStream.format("kafka")
        .options(**KAFKA_OPTIONS)
        .load()
    )

    # Parse the message payload into the ack_schema columns.
    parsed_df = (
        eventhubdf
        # The Kafka `value` column is binary, so cast it to string (assuming UTF-8).
        .select(col("value").cast("string").alias("json_str"))
        .select(from_json(col("json_str"), ack_schema).alias("json_obj"))
        .select("json_obj.*")
    )

    # Append the parsed stream to this state's Delta location.
    query = (
        parsed_df.writeStream
        .format("delta")
        .option("checkpointLocation", checkpoint_path)
        .outputMode("append")
        .start(data_path)
    )

    try:
        # Waits up to STREAM_RUN_SECONDS. If the stream fails in that window the
        # failure is raised here rather than being silently ignored.
        query.awaitTermination(STREAM_RUN_SECONDS)
    finally:
        # Always stop the stream, even if the wait was interrupted or failed.
        query.stop()

    # Read back what has been written for this state and show it.
    # Optional check: add .filter(col("status").isNotNull()) to see only rows with a status.
    df = spark.read.format("delta").load(data_path)
    display(df)

In [0]:
# Exit the notebook, passing this message as the exit value.
completion_message = "Notebook completed successfully"
dbutils.notebook.exit(completion_message)

In [0]:
# Manual clean-up: recursively delete the entire APPEALS directory structure.
# NOTE(review): the path is hard-coded rather than built from lz_key/env -
# presumably so that this can only ever target the sandbox account; confirm.
appeals_root = "abfss://silver@ingest00curatedsbox.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/"
dbutils.fs.rm(appeals_root, True)

# Verify it's gone: listing the path succeeds only if it still exists.
try:
    dbutils.fs.ls(appeals_root)
except Exception as exc:
    if "FileNotFoundException" in str(exc):
        print("APPEALS directory successfully deleted - clean slate!")
    else:
        # Any other failure (authentication, network, ...) tells us nothing about
        # whether the directory exists, so do not report a successful deletion.
        print(f"Could not verify deletion of APPEALS directory: {exc}")
else:
    print("APPEALS directory still exists")