# ARM Acknowledgment 


**Autoloader set up**  
This notebook sets up an Autoloader job, run on a manual trigger, that collects acknowledgement (ack) messages from the ack Event Hub.


In [0]:
import time

# Pause for five minutes (5 * 60 seconds) before the rest of the notebook runs.
time.sleep(5 * 60)

In [0]:
import json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, LongType
from pyspark.sql.functions import col,from_json
import json

In [0]:
# Schema of one ack message as parsed from the JSON payload; every field is nullable.
ack_schema = (
    StructType()
    .add("filename", StringType(), True)
    .add("http_response", IntegerType(), True)
    .add("timestamp", TimestampType(), True)
    .add("http_message", StringType(), True)
)

In [0]:
# Load the environment configuration. The file is read as a multi-line JSON
# document; the "env" and "lz_key" fields are the ones used below.
config = spark.read.option("multiline", "true").json("dbfs:/configs/config.json")

# Fetch the first row a single time: each `first()` call is a separate Spark action,
# and it returns None for an empty DataFrame, which would otherwise surface as an
# opaque "'NoneType' object is not subscriptable" error further down.
config_row = config.first()
if config_row is None:
    raise ValueError("No configuration rows found in dbfs:/configs/config.json")

# Normalise both values (trim + lower-case) because they are embedded in resource names.
env_name = config_row["env"].strip().lower()
lz_key = config_row["lz_key"].strip().lower()

print(f"env_code: {lz_key}")  # This won't be redacted
print(f"env_name: {env_name}")  # This won't be redacted

# Name passed as the secret scope to dbutils.secrets.get in later cells.
KeyVault_name = f"ingest{lz_key}-meta002-{env_name}"
print(f"KeyVault_name: {KeyVault_name}") 

In [0]:

# Event Hub (used as the Kafka topic to subscribe to) that carries the ack messages.
EH_NAME = f"evh-td-ack-{lz_key}-uks-dlrm-01"

# Event Hubs namespace; used to build the Kafka bootstrap server address.
EH_NAMESPACE = f"ingest{lz_key}-integration-eventHubNamespace001-{env_name}"

In [0]:
## Connection settings that let the stream reader connect to the source Event Hub

# Earlier hard-coded lookup, kept for reference:
# connection_string = dbutils.secrets.get("ingest00-meta002-sbox", "evh-joh-ack-dev-uks-dlrm-01-key")

# Read the connection string from the secret scope whose name was derived from the config.
connection_string = dbutils.secrets.get(scope=KeyVault_name, key="RootManageSharedAccessKey")
# Encrypt the connection string using the EventHubsUtils.encrypt method
# encrypted_conn_str = sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(connection_string)


# eventhubdf = spark.readStream.format("eventhubs")\
#     .options(**ehConf)\
#         .load()

# eventhubdf.display()

In [0]:
# Address of the namespace's Kafka endpoint (port 9093).
kafka_bootstrap_servers = f"{EH_NAMESPACE}.servicebus.windows.net:9093"

# SASL/PLAIN login: the literal user name "$ConnectionString" with the connection
# string itself supplied as the password.
kafka_jaas_config = f'kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username="$ConnectionString" password="{connection_string}";'

# Options handed to the Spark "kafka" stream reader.
KAFKA_OPTIONS = {
    # Where to connect and what to read
    "kafka.bootstrap.servers": kafka_bootstrap_servers,
    "subscribe": EH_NAME,
    # Authentication
    "kafka.security.protocol": "SASL_SSL",
    "kafka.sasl.mechanism": "PLAIN",
    "kafka.sasl.jaas.config": kafka_jaas_config,
    # Read behaviour: start from the earliest offsets and do not fail on data loss
    "startingOffsets": "earliest",
    "failOnDataLoss": "false",
}

In [0]:

# Service principal credentials, fetched in the order client id, client secret, tenant id
client_id, client_secret, tenant_id = [
    dbutils.secrets.get(KeyVault_name, secret_name)
    for secret_name in (
        "SERVICE-PRINCIPLE-CLIENT-ID",
        "SERVICE-PRINCIPLE-CLIENT-SECRET",
        "SERVICE-PRINCIPLE-TENANT-ID",
    )
]

# Storage account names
curated_storage = f"ingest{lz_key}curated{env_name}"
checkpoint_storage = f"ingest{lz_key}xcutting{env_name}"


def _configure_oauth(storage_account):
    """Set the five ABFS OAuth (client-credentials) Spark settings for one storage account.

    Each setting key is suffixed with ``<storage_account>.dfs.core.windows.net`` so it
    applies to that account only. Uses the service principal values read above.
    """
    account_host = f"{storage_account}.dfs.core.windows.net"
    oauth_settings = [
        ("fs.azure.account.auth.type", "OAuth"),
        ("fs.azure.account.oauth.provider.type", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"),
        ("fs.azure.account.oauth2.client.id", client_id),
        ("fs.azure.account.oauth2.client.secret", client_secret),
        ("fs.azure.account.oauth2.client.endpoint", f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"),
    ]
    for setting_name, setting_value in oauth_settings:
        spark.conf.set(f"{setting_name}.{account_host}", setting_value)


# Spark config for curated storage (Delta table)
_configure_oauth(curated_storage)

# Spark config for checkpoint storage
_configure_oauth(checkpoint_storage)


In [0]:
# Delta table location: "silver" container of the curated storage account.
data_path = (
    f"abfss://silver@ingest{lz_key}curated{env_name}.dfs.core.windows.net"
    "/ARIADM/ARM/AUDIT/TD/td_ack_audit_table"
)

# Streaming checkpoint location: "db-ack-checkpoint" container of the xcutting storage account.
checkpoint_path = (
    f"abfss://db-ack-checkpoint@ingest{lz_key}xcutting{env_name}.dfs.core.windows.net"
    "/ARMTD/ACK/ack"
)

In [0]:
# Streaming DataFrame over the ack Event Hub, read with the Kafka source and the options built above.
eventhubdf = (
    spark.readStream
    .format("kafka")
    .options(**KAFKA_OPTIONS)
    .load()
)

In [0]:
# display(eventhubdf)

In [0]:
# The Kafka 'value' column is binary, so cast it to a string (assuming UTF-8 JSON text).
ack_json_df = eventhubdf.select(col("value").cast("string").alias("json_str"))

# Parse the JSON text into a struct that follows ack_schema.
ack_struct_df = ack_json_df.select(from_json(col("json_str"), ack_schema).alias("json_obj"))

# Flatten the struct so each schema field becomes a top-level column.
parsed_df = ack_struct_df.select("json_obj.*")


# parsed_df.display()

In [0]:
# Append the parsed ack messages to the Delta table.
#
# The previous version started the stream, slept for a fixed 30 seconds and then
# called stop(). That window is arbitrary: the query could be stopped before its
# first micro-batch committed (or part-way through a backlog), and a failed query
# was never surfaced, so the notebook still reported success.
#
# trigger(availableNow=True) processes everything available when the query starts
# and then terminates on its own; progress is tracked in the checkpoint, so the next
# manual run continues from where this one finished.
# NOTE(review): availableNow on the Kafka source needs Spark 3.3+ - confirm the
# cluster runtime supports it.
query = (
    parsed_df.writeStream
    .format("delta")
    .option("checkpointLocation", checkpoint_path)
    .outputMode("append")
    .trigger(availableNow=True)
    .start(data_path)
)

# Block until the run finishes; raises if the streaming query terminated with an error.
query.awaitTermination()

In [0]:
# Read the ack audit Delta table back from data_path and show its contents.
delta_reader = spark.read.format("delta")
df = delta_reader.load(data_path)
display(df)

In [0]:
# Exit the notebook, handing this message back as the notebook's exit value.
exit_message = "Notebook execution completed successfully."
dbutils.notebook.exit(exit_message)

## Appendix

In [0]:

# # Container and path for storing Delta table (in curated storage account)
# data_path = f"abfss://silver@ingest{lz_key}curated{env_name}.dfs.core.windows.net/ARIADM/ARM/AUDIT/TD/td_ack_audit"

# df = spark.read.format("delta").load(data_path)

# df.count()

# from pyspark.sql.functions import window

# display(
#     df.groupBy(window("timestamp", "1 minute"))
#       .count()
#       .orderBy(col("window").desc())
# )

In [0]:
# dbutils.fs.ls("/mnt/ingest00curatedsboxsilver/ARIADM/ARM/AUDIT/ARIATD")

In [0]:
# dbutils.fs.ls("/mnt/ingest00curatedsboxsilver/ARIADM/")

In [0]:
# dbutils.fs.ls("/mnt/ingest00curatedsboxsilver/ARIADM/ARM/AUDIT")

In [0]:
# dbutils.fs.rm("/mnt/autoLoaderSchema/ARMTD/ACK/ack", recurse=True)

In [0]:
# display(spark.read.format("delta").load("/mnt/ingest00curatedsboxsilver/ARIADM/ARM/AUDIT/ARIATD/td_ack_audit"))

In [0]:
# dbutils.fs.ls("/mnt/ingest00curatedsboxsilver/ARIADM/ARM/AUDIT/ARIATD/td_ack_audit/")

In [0]:
# spark.read.format("delta").load("/mnt/ingest00curatedsboxsilver/ARIADM/ARM/AUDIT/ARIATD/td_ack_audit").display()

In [0]:
# dbutils.fs.ls("/mnt/autoLoaderSchema/ARMTD/ACK/ack")

In [0]:
# dbutils.fs.rm("/mnt/autoLoaderSchema/ARMTD/ACK/ack", True)
# dbutils.fs.rm("/mnt/ingest00curatedsboxsilver/ARIADM/ARM/AUDIT/ARIATD/", True)

In [0]:
# # Define the paths to the tables
# audit_delta_path = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/AUDIT/JOH/joh_cr_audit_table"
# ack_path = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/AUDIT/JOH/joh_ack_audit"
# output_subdir_input_upload = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/response/JOH/input_upload"
# output_subdir_create_record_upload = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/response/JOH/create_record"
# output_subdir_upload_file_upload = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/response/JOH/upload_file"
# output_subdir_amalgamated_responses = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/response/JOH/amalgamated_responses"

# # Read the Delta table for joh_cr_audit_table
# df_audit = spark.read.format("delta").load(audit_delta_path)

# # Read the Delta table for joh_ack_audit
# df_ack = spark.read.format("delta").load(ack_path)

# # Read the response data for input_upload
# # df_input_upload = spark.read.format("parquet").load(output_subdir_input_upload)

# # # Read the response data for create_record_upload
# # df_create_record_upload = spark.read.format("parquet").load(output_subdir_create_record_upload)

# # # Read the response data for upload_file
# # df_upload_file_upload = spark.read.format("parquet").load(output_subdir_upload_file_upload)


# # Read the response data for df_amalgamated_responses
# # df_amalgamated_responses = spark.read.format("delta").load(output_subdir_amalgamated_responses)

# # Display the DataFrames using Databricks display
# display(df_audit)
# display(df_ack)
# # display(df_input_upload)
# # display(df_create_record_upload)
# # display(df_upload_file_upload)
# # display(df_amalgamated_responses)

In [0]:

# sas_token = dbutils.secrets.get(scope="ingest00-meta002-sbox", key="ARIAJR-SAS-TOKEN")
# storage_account_name = "a360c2x2555dz"
# container_name = "dropzone"
# sub_dir = "ARIAJR/response"

In [0]:
# # Define the paths to the tables
# audit_delta_path = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/AUDIT/JOH/joh_cr_audit_table"
# ack_path = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/AUDIT/JOH/joh_ack_audit"
# output_subdir_input_upload = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/response/JOH/input_upload"
# output_subdir_create_record_upload = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/response/JOH/create_record"
# output_subdir_upload_file_upload = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/response/JOH/upload_file"
# output_subdir_amalgamated_responses = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/response/JOH/amalgamated_responses"

# # Read the Delta table for joh_cr_audit_table
# df_audit = spark.read.format("delta").load(audit_delta_path)

# # Read the Delta table for joh_ack_audit
# df_ack = spark.read.format("delta").load(ack_path)

# # Read the response data for input_upload
# # df_input_upload = spark.read.format("parquet").load(output_subdir_input_upload)

# # # Read the response data for create_record_upload
# # df_create_record_upload = spark.read.format("parquet").load(output_subdir_create_record_upload)

# # # Read the response data for upload_file
# # df_upload_file_upload = spark.read.format("parquet").load(output_subdir_upload_file_upload)


# # Read the response data for df_amalgamated_responses
# # df_amalgamated_responses = spark.read.format("delta").load(output_subdir_amalgamated_responses)

# # Display the DataFrames using Databricks display
# display(df_audit)
# display(df_ack)
# # display(df_input_upload)
# # display(df_create_record_upload)
# # display(df_upload_file_upload)
# # display(df_amalgamated_responses)