In [0]:
from azure.storage.blob import BlobServiceClient
from pyspark.sql.functions import col, decode, split, element_at,udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, LongType
from pyspark.sql.functions import date_format, col, when, count, lit, concat, round as spark_round
import time
import logging

In [0]:
# Notebook-wide logger: INFO level, timestamped "<time> - <name> - <level> - <message>" lines.
logger = logging.getLogger("DatabricksWorkflow")
logger.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)

# hasHandlers() also looks at ancestor loggers, so the stream handler is only
# attached when nothing up the hierarchy would already emit these records
# (this also avoids stacking duplicate handlers when the cell is re-run).
if not logger.hasHandlers():
    logger.addHandler(handler)

In [0]:
# Load the workflow configuration JSON (multi-line JSON document on DBFS).
config_path = "dbfs:/configs/config.json"
try:
    config = spark.read.option("multiline", "true").json(config_path)
    logger.info(f"Successfully read config file from {config_path}")
except Exception as e:
    logger.error(f"Could not read config file at {config_path}: {e}", exc_info=True)
    # Chain the original exception so the root cause stays on the traceback.
    raise FileNotFoundError(f"Could not read config file at {config_path}: {e}") from e

# Extract environment and lz_key from the first row, normalised to
# trimmed lower-case because both are embedded in resource names below.
try:
    first_row = config.first()
    env = first_row["env"].strip().lower()
    lz_key = first_row["lz_key"].strip().lower()
    logger.info(f"Extracted configs: env={env}, lz_key={lz_key}")
except Exception as e:
    # Also reached when the config is empty (first() returns None) or a value is not a string.
    logger.error(f"Missing expected keys 'env' or 'lz_key' in config file: {e}", exc_info=True)
    raise KeyError(f"Missing expected keys 'env' or 'lz_key' in config file: {e}") from e

# Construct keyvault name. Both parts are plain strings at this point, so the
# formatting cannot fail and needs no error handling of its own.
keyvault_name = f"ingest{lz_key}-meta002-{env}"
logger.info(f"Constructed keyvault name: {keyvault_name}")


In [0]:
# Access the Service Principal secrets from Key Vault
def _get_service_principal_secret(secret_key):
    """Return the secret stored under `secret_key` in the `keyvault_name` scope.

    Args:
        secret_key: Name of the secret to read via `dbutils.secrets.get`.

    Returns:
        The value returned by `dbutils.secrets.get`.

    Raises:
        KeyError: If the lookup fails for any reason; the underlying exception
            is logged and chained as the cause.
    """
    try:
        secret_value = dbutils.secrets.get(scope=keyvault_name, key=secret_key)
    except Exception as e:
        logger.error(f"Could not retrieve '{secret_key}' from Key Vault '{keyvault_name}': {e}", exc_info=True)
        raise KeyError(f"Could not retrieve '{secret_key}' from Key Vault '{keyvault_name}': {e}") from e
    logger.info(f"Successfully retrieved {secret_key} from Key Vault")
    return secret_value


# Same retrieval order as before: client secret, tenant id, client id.
client_secret = _get_service_principal_secret('SERVICE-PRINCIPLE-CLIENT-SECRET')
tenant_id = _get_service_principal_secret('SERVICE-PRINCIPLE-TENANT-ID')
client_id = _get_service_principal_secret('SERVICE-PRINCIPLE-CLIENT-ID')

logger.info("✅ Successfully retrieved all Service Principal secrets from Key Vault")


In [0]:
# --- Parameterise containers ---
curated_storage_account = f"ingest{lz_key}curated{env}"
curated_container = "gold"
silver_curated_container = "silver"
checkpoint_storage_account = f"ingest{lz_key}xcutting{env}"

# --- Assign OAuth to storage accounts ---
storage_accounts = [curated_storage_account, checkpoint_storage_account]

for storage_account in storage_accounts:
    # Every setting is keyed by the account's ADLS Gen2 (dfs) endpoint host.
    account_host = f"{storage_account}.dfs.core.windows.net"
    configs = {
        f"fs.azure.account.auth.type.{account_host}": "OAuth",
        f"fs.azure.account.oauth.provider.type.{account_host}":
            "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
        f"fs.azure.account.oauth2.client.id.{account_host}": client_id,
        f"fs.azure.account.oauth2.client.secret.{account_host}": client_secret,
        f"fs.azure.account.oauth2.client.endpoint.{account_host}":
            f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
    }

    for key, val in configs.items():
        try:
            spark.conf.set(key, val)
        except Exception as e:
            # Log and raise once, naming the exact key that failed. Only the key
            # is reported, never the value, because some values are secrets.
            logger.error(f"Failed to set Spark config '{key}' for storage account '{storage_account}': {e}", exc_info=True)
            raise RuntimeError(f"Failed to set Spark config '{key}' for storage account '{storage_account}': {e}") from e

    logger.info(f"✅ Successfully configured OAuth for storage account: {storage_account}")


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, LongType

# Fixed schema applied to the response files read by Auto Loader.
# Every field is nullable; the (name, type) pairs keep the original field order.
expected_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("operation", StringType()),
        ("transaction_id", StringType()),
        ("relation_id", StringType()),
        ("a360_record_id", StringType()),
        ("process_time", TimestampType()),
        ("status", IntegerType()),
        ("input", StringType()),  # Contains nested JSON as a string
        ("exception_description", StringType()),
        ("error_status", StringType()),
        ("a360_file_id", StringType()),
        ("file_size", LongType()),
        ("s_md5", StringType()),
        ("s_sha256", StringType()),
        ("timestamp", TimestampType()),  # may be used as process_time
        ("filename", StringType()),
        ("submission_folder", StringType()),
        ("file_hash", StringType()),
    )
])

In [0]:
# ARM_segment is embedded in the SAS-token secret name and the source sub-directory;
# the sandbox environment ("sbox") uses its own segment.
if env == "sbox":
    ARM_segment = "SBDEV"
else:
    ARM_segment = "SB"

# ARIA_segment is embedded in the schema, checkpoint and output paths.
ARIA_segment = "SBAILS"

In [0]:
#### Set up Auto Loader job
from pyspark.sql.functions import input_file_name,regexp_extract,col,expr

# --- Source: response files in the blob "dropzone" container, accessed with a SAS token ---
sas_token = dbutils.secrets.get(scope=f"ingest{lz_key}-meta002-{env}", key=f"ARIA{ARM_segment}-SAS-TOKEN")

storage_account_name = "a360c2x2555dz"
container_name = "dropzone"
sub_dir = f"ARIA{ARM_segment}/response"
input_path = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/{sub_dir}"

# Register the SAS token for this container/account pair.
spark.conf.set(f"fs.azure.sas.{container_name}.{storage_account_name}.blob.core.windows.net", sas_token)

# NOTE(review): schema_location is reassigned in the checkpoint cell further
# down before the stream is built, so this mount-based value looks unused.
schema_location = f"/mnt/autoLoaderSchema/ARM{ARIA_segment}/response/read_stream"

# Regex matching file names that end with .rsp.
# NOTE(review): not referenced elsewhere in the visible notebook.
file_regex = ".*\\.rsp$"

# --- Sinks: Delta locations in the "silver" container of the curated account ---
output_container_name = "silver"
output_storage_account_name = f"ingest{lz_key}curated{env}"
_silver_root = f"abfss://{output_container_name}@{output_storage_account_name}.dfs.core.windows.net"

# All responses, whatever the operation.
output_subdir_amalgamated_responses = f"ARIADM/ARM/response/{ARIA_segment}/amalgamated_responses"
amalgamated_responses_path = f"{_silver_root}/{output_subdir_amalgamated_responses}"

# One location per operation type.
output_subdir_input_upload = f"ARIADM/ARM/response/{ARIA_segment}/input_upload"
input_upload_responses_path = f"{_silver_root}/{output_subdir_input_upload}"

output_subdir_create_record_upload = f"ARIADM/ARM/response/{ARIA_segment}/create_record"
create_record_responses_path = f"{_silver_root}/{output_subdir_create_record_upload}"

output_subdir_upload_file_upload = f"ARIADM/ARM/response/{ARIA_segment}/upload_file"
upload_file_responses_path = f"{_silver_root}/{output_subdir_upload_file_upload}"

In [0]:
# Checkpoint root for the response streams. It ends with "/", so the schema
# location is formed by appending "schema" directly.
_checkpoint_account = f"ingest{lz_key}xcutting{env}"
check_point_path = f"abfss://db-rsp-checkpoint@{_checkpoint_account}.dfs.core.windows.net/ARM{ARIA_segment}/RSP/"
schema_location = f"{check_point_path}schema"

In [0]:
## expected counts
# Derived from the acknowledgement audit table: distinct filenames whose
# http_response is 201, bucketed by a substring of the filename.
silver_path = f"abfss://silver@ingest{lz_key}curated{env}.dfs.core.windows.net/ARIADM/ARM/AUDIT/{ARIA_segment}/sbl_ack_audit_table"

filtered_df = (
    spark.read.format("delta")
    .load(silver_path)
    .filter(col("http_response") == 201)
    .select(col("filename"))
    .distinct()
)

# One count per filename token, evaluated in the order html, json, a360.
html_count, json_count, a360_count = (
    filtered_df.filter(col("filename").contains(token)).count()
    for token in ("html", "json", "a360")
)

# Expected responses per operation type.
expected_input_upload = a360_count
expected_created_records = html_count
expected_upload_file = html_count + json_count

logger.info(f"""
expected_created_records = {expected_created_records}
expected_input_upload = {expected_input_upload}
expected_upload_file = {expected_upload_file}
""")


#Stream Responses

In [0]:
### Run autoloader read stream

# Auto Loader reader: multi-line JSON files, fixed schema, no schema evolution.
rsp_stream_reader = (
    spark.readStream.format("cloudFiles")
    .schema(expected_schema)
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", schema_location)
    .option("multiline", "true")
    .option("cloudFiles.schemaEvolutionMode", "none")
    .option("checkpointLocation", f"{check_point_path}/rsp_readStream")
)

# Keep every parsed column, add the source file path and modification time
# from _metadata, and tag each row with a generated uuid.
# NOTE(review): each sink below starts its own query from `df`, so uuid() is
# presumably evaluated per query and the same source row may get different
# ids in different sink tables — confirm before joining on `id`.
df = (
    rsp_stream_reader.load(input_path)
    .select(
        "*",
        col("_metadata.file_path").alias("_file_path"),
        col("_metadata.file_modification_time").alias("_file_modification_time"),
    )
    .withColumn("id", expr("uuid()"))
)

In [0]:
# Split the response stream by operation type.
_operation = col("operation")

df_am = df.filter(_operation.isNotNull())            # every response that has an operation
df_cr = df.filter(_operation == "create_record")     # create_record responses
df_iu = df.filter(_operation == "input_upload")      # input_upload responses
df_uf = df.filter(_operation == "upload_new_file")   # upload_new_file responses

In [0]:
# Append every response with a non-null operation to the amalgamated Delta location.
df_complete = (
    df_am.writeStream
    .format("delta")
    .option("checkpointLocation", f"{check_point_path}/amalgamated")
    .outputMode("append")
    .option("mergeSchema", "true")
    .queryName("amalgamated")
    .start(amalgamated_responses_path)
)

In [0]:
### Input upload table

# Columns kept for input_upload responses.
_input_upload_columns = [
    "id", "operation", "timestamp", "status",
    "exception_description", "error_status",
    "filename", "submission_folder", "file_hash",
    "_file_path", "_file_modification_time",
]

# Append input_upload responses to their own Delta location.
df_input_upload_query = (
    df_iu.select(*_input_upload_columns)
    .writeStream
    .format("delta")
    .option("checkpointLocation", f"{check_point_path}/input_upload")
    .outputMode("append")
    .queryName("input_upload")
    .start(input_upload_responses_path)
)

In [0]:
## Create record table

# Columns kept for create_record responses.
_create_record_columns = [
    "id", "operation", "transaction_id", "relation_id", "a360_record_id",
    "process_time", "status", "input",
    "exception_description", "error_status",
    "_file_path", "_file_modification_time",
]

# Append create_record responses to their own Delta location.
df_create_record_query = (
    df_cr.select(*_create_record_columns)
    .writeStream
    .format("delta")
    .option("checkpointLocation", f"{check_point_path}/create_record")
    .outputMode("append")
    .queryName("create_record")
    .start(create_record_responses_path)
)

In [0]:
### upload file table

# Columns kept for upload_new_file responses.
_upload_file_columns = [
    "id", "operation", "transaction_id", "relation_id", "a360_record_id",
    "process_time", "status", "input",
    "exception_description", "error_status",
    "a360_file_id", "file_size", "s_md5", "s_sha256",
    "_file_path", "_file_modification_time",
]

# Append upload_new_file responses to their own Delta location.
# The checkpoint sits under "response/upload_file", unlike the sibling streams;
# it is kept as-is because moving a checkpoint would restart the stream's progress.
df_upload_file_query = (
    df_uf.select(*_upload_file_columns)
    .writeStream
    .format("delta")
    .option("checkpointLocation", f"{check_point_path}/response/upload_file")
    .outputMode("append")
    .queryName("upload_file")
    .start(upload_file_responses_path)
)

In [0]:
def get_current_counts():
    """Count the rows currently in the amalgamated Delta location, per operation.

    Returns:
        A 3-tuple ``(input_upload_count, create_record_count, upload_file_count)``
        holding the row counts for the operations ``input_upload``,
        ``create_record`` and ``upload_new_file`` respectively.
    """
    responses = spark.read.format("delta").load(amalgamated_responses_path)
    return tuple(
        responses.filter(col("operation") == operation_name).count()
        for operation_name in ("input_upload", "create_record", "upload_new_file")
    )

In [0]:
# Read the processed counts, retrying up to `max_attempt` times with `delay`
# seconds between attempts; the read fails until the amalgamated Delta
# location has been created by the stream.
max_attempt = 5
delay = 30 # seconds

# range is inclusive of max_attempt so the final attempt reaches the raise below.
for attempt in range(1, max_attempt + 1):
    try:
        input_upload_count, create_record_count, upload_file_count = get_current_counts()
        break  # success: stop retrying
    except Exception as e:
        if attempt < max_attempt:
            logger.warning(f"Attempt {attempt} failed: {e}. Retrying in {delay} seconds... ")
            time.sleep(delay)
        else:
            logger.error(f"Failed to get processed counts after {max_attempt} attempts: {e}")
            raise

In [0]:
# Log the name of every streaming query currently active on this Spark session.
for active_query in spark.streams.active:
    logger.info(active_query.name)

In [0]:
# Expose the amalgamated responses as a temp view and display every row.
df_amalgamated_output = spark.read.format("delta").load(amalgamated_responses_path)
df_amalgamated_output.createOrReplaceTempView("sbails_amalgamated_response_data")

_amalgamated_preview_sql = """
        SELECT 
        *
FROM sbails_amalgamated_response_data
"""
spark.sql(_amalgamated_preview_sql).display()

In [0]:
# Daily input_upload summary: successes (status = 1), failures (status != 1)
# and the success rate against the expected count from the audit table.
df_input_upload = spark.read.format("delta").load(input_upload_responses_path)
df_input_upload.createOrReplaceTempView("sbails_input_upload")

# - The percentage is spelled out and rounded to 2 decimals, matching the
#   create_record report, instead of relying on lateral column aliases.
# - NULLIF turns a zero expected count into NULL rather than a division by zero.
# - Rows are ordered by the newest timestamp in each group, because the
#   'ddMMMyyyy' text does not sort chronologically.
spark.sql(f"""
SELECT  DATE_FORMAT(timestamp, 'ddMMMyyyy') as date,
        operation,
        {expected_input_upload} as expected_input_upload,
        COUNT(CASE WHEN status = 1 THEN 1 END) as count_successful_input_upload,
        COUNT(CASE WHEN status != 1 THEN 1 END) as count_unsuccessful_input_upload,
        CONCAT(ROUND((COUNT(CASE WHEN status = 1 THEN 1 END) * 100.0 / NULLIF({expected_input_upload}, 0)), 2), '%') as `%_of_successful_input_upload`

FROM sbails_input_upload
GROUP BY date, operation
ORDER BY MAX(timestamp) DESC
""").display()

In [0]:
# Daily upload_new_file summary: successes (status = 1), failures (status != 1)
# and the success rate against the expected count from the audit table.
df_upload_file = spark.read.format("delta").load(upload_file_responses_path)
df_upload_file.createOrReplaceTempView("sbails_upload_file")

# - The percentage is spelled out and rounded to 2 decimals, matching the
#   create_record report, instead of relying on lateral column aliases.
# - NULLIF turns a zero expected count into NULL rather than a division by zero.
# - Rows are ordered by the newest process_time in each group, because the
#   'ddMMMyyyy' text does not sort chronologically.
spark.sql(f"""
SELECT  DATE_FORMAT(process_time, 'ddMMMyyyy') as date,
        operation,
        {expected_upload_file} as expected_upload_file,
        COUNT(CASE WHEN status = 1 THEN 1 END) as count_successful_upload_file,
        COUNT(CASE WHEN status != 1 THEN 1 END) as count_unsuccessful_upload_file,
        CONCAT(ROUND((COUNT(CASE WHEN status = 1 THEN 1 END) * 100.0 / NULLIF({expected_upload_file}, 0)), 2), '%') as `%_of_successful_upload_file`

FROM sbails_upload_file
GROUP BY date, operation
ORDER BY MAX(process_time) DESC
""").display()

In [0]:
# Daily create_record summary: successes (status = 1), failures (status != 1)
# and the success rate against the expected count from the audit table.
df_create_record = spark.read.format("delta").load(create_record_responses_path)
df_create_record.createOrReplaceTempView("sbails_create_record")

# - NULLIF turns a zero expected count into NULL rather than a division by zero.
# - Rows are ordered by the newest process_time in each group, because the
#   'ddMMMyyyy' text does not sort chronologically.
spark.sql(f"""
SELECT 
    DATE_FORMAT(process_time, 'ddMMMyyyy') as date,
    operation,
    {expected_created_records} as expected_created_records,
    COUNT(CASE WHEN status = 1 THEN 1 END) as count_of_successful_created_records,
    COUNT(CASE WHEN status != 1 THEN 1 END) as count_of_unsuccessful_created_records,
    CONCAT(ROUND((COUNT(CASE WHEN status = 1 THEN 1 END) * 100.0 / NULLIF({expected_created_records}, 0)), 2), '%') as `%_of_successful_created_records`
FROM sbails_create_record
GROUP BY date, operation
ORDER BY MAX(process_time) DESC
""").display()

In [0]:
# Poll the processed counts every 5 seconds and stop each per-operation stream
# once its count reaches the expected value; then stop the amalgamated stream.
# NOTE(review): there is no timeout — if a count never reaches its expected
# value the loop keeps running for as long as that stream stays active.
# NOTE(review): the counts come from the amalgamated location, not from each
# stream's own sink — confirm a stream cannot be stopped before it has written
# its own rows.
_per_operation_queries = [df_input_upload_query, df_create_record_query, df_upload_file_query]

while any(q.isActive for q in _per_operation_queries):
    input_upload_count, create_record_count, upload_file_count = get_current_counts()
    logger.info(f"""Current counts:\n
                    input_upload_count={input_upload_count}/{expected_input_upload}\n
                    create_record_count={create_record_count}/{expected_created_records}\n
                    upload_file_count={upload_file_count}/{expected_upload_file}""")
    # Only act on streams that are still running, so stop() and its log line
    # are not repeated on later iterations.
    if df_input_upload_query.isActive and input_upload_count >= expected_input_upload:
        df_input_upload_query.stop()
        logger.info("Stopping input_upload stream")
    if df_create_record_query.isActive and create_record_count >= expected_created_records:
        df_create_record_query.stop()
        logger.info("Stopping create_record stream")
    if df_upload_file_query.isActive and upload_file_count >= expected_upload_file:
        df_upload_file_query.stop()
        logger.info("Stopping upload_file stream")
    time.sleep(5)

# The per-operation streams are finished; stop the amalgamated stream directly.
if df_complete.isActive:
    df_complete.stop()
logger.info("All streams have been stopped")

In [0]:
# Re-read all four Delta outputs and log the final row count of each.
_delta_reader = spark.read.format("delta")

# Response data for input_upload
df_input_upload = _delta_reader.load(input_upload_responses_path)
# Response data for create_record
df_create_record_upload = _delta_reader.load(create_record_responses_path)
# Response data for upload_file
df_upload_file_upload = _delta_reader.load(upload_file_responses_path)
# Response data for the amalgamated responses
df_amalgamated_responses = _delta_reader.load(amalgamated_responses_path)

for _frame_label, _frame in (
    ("df_input_upload", df_input_upload),
    ("df_create_record_upload", df_create_record_upload),
    ("df_upload_file_upload", df_upload_file_upload),
    ("df_amalgamated_responses", df_amalgamated_responses),
):
    logger.info(f"Records in {_frame_label}: {_frame.count()}")

In [0]:
 # End the notebook run here, passing this message to dbutils.notebook.exit as the exit value.
 dbutils.notebook.exit("Notebook completed successfully")