In [0]:
import time

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, LongType
from pyspark.sql.functions import *


# Explicit schema applied to the response files read by Auto Loader.
# Every column is declared nullable; the per-operation tables written later
# in this notebook each select a different subset of these columns.
expected_schema = (
    StructType()
    # Columns selected by the create_record and upload_file tables
    .add("operation", StringType(), True)
    .add("transaction_id", StringType(), True)
    .add("relation_id", StringType(), True)
    .add("a360_record_id", StringType(), True)
    .add("process_time", TimestampType(), True)
    .add("status", IntegerType(), True)
    .add("input", StringType(), True)  # nested JSON kept as a raw string
    .add("exception_description", StringType(), True)
    .add("error_status", StringType(), True)
    # Additional columns selected only by the upload_file table
    .add("a360_file_id", StringType(), True)
    .add("file_size", LongType(), True)
    .add("s_md5", StringType(), True)
    .add("s_sha256", StringType(), True)
    # Additional columns selected only by the input_upload table
    .add("timestamp", TimestampType(), True)  # may be used as process_time
    .add("filename", StringType(), True)
    .add("submission_folder", StringType(), True)
    .add("file_hash", StringType(), True)
)
 

In [0]:
#### Auto Loader job configuration

# --- Source: response files in the drop zone container -----------------------
storage_account_name = "a360c2x2555dz"
container_name = "dropzone"
sub_dir = "ARIAFTA/response"
input_path = "wasbs://{}@{}.blob.core.windows.net/{}".format(
    container_name, storage_account_name, sub_dir
)

# Register the SAS token (fetched from the secret scope) for the source container.
sas_token = dbutils.secrets.get(scope="ingest00-meta002-sbox", key="ARIAFTA-SAS-TOKEN")
spark.conf.set(
    "fs.azure.sas.{}.{}.blob.core.windows.net".format(container_name, storage_account_name),
    sas_token,
)

# Location passed to Auto Loader as cloudFiles.schemaLocation.
schema_location = "/mnt/autoLoaderSchema/ARIAFTA/response/read_stream"

# Regex matching file names that end with .rsp.
# NOTE(review): file_regex is not referenced by any other visible cell, so the
# read stream is not actually restricted to .rsp files — confirm whether a
# filter was intended.
file_regex = ".*\\.rsp$"

# --- Destination: Delta tables in the silver container -----------------------
output_container_name = "silver"
output_storage_account_name = "ingest00curatedsbox"
output_root = "wasbs://{}@{}.blob.core.windows.net".format(
    output_container_name, output_storage_account_name
)

output_subdir_amalgamated_responses = "ARIADM/ARM/response/ARIAFTA/amalgamated_responses"
amalgamated_responses_path = output_root + "/" + output_subdir_amalgamated_responses

output_subdir_input_upload = "ARIADM/ARM/response/ARIAFTA/input_upload"
input_upload_responses_path = output_root + "/" + output_subdir_input_upload

output_subdir_create_record_upload = "ARIADM/ARM/response/ARIAFTA/create_record"
create_record_responses_path = output_root + "/" + output_subdir_create_record_upload

output_subdir_upload_file_upload = "ARIADM/ARM/response/ARIAFTA/upload_file"
upload_file_responses_path = output_root + "/" + output_subdir_upload_file_upload

In [0]:
### Auto Loader read stream
# Reads multi-line JSON from input_path with the explicit expected_schema and
# schema evolution switched off. Alongside the parsed columns, each row carries
# the source file path, the file modification time and a generated UUID.
#
# NOTE(review): uuid() is evaluated by every write stream that consumes `df`,
# so the `id` of a given record presumably differs between the output tables —
# confirm whether ids are expected to match across tables.
df = (
    spark.readStream.format("cloudFiles")
    .schema(expected_schema)
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", schema_location)  # schema tracking
    .option("cloudFiles.schemaEvolutionMode", "none")
    .option("multiline", "true")
    .load(input_path)
    .select(
        "*",
        col("_metadata.file_path").alias("_file_path"),
        col("_metadata.file_modification_time").alias("_file_modification_time"),
        expr("uuid()").alias("id"),  # appended last, one UUID per row
    )
)

# display(df)



In [0]:
### Write streams (4 in total) to the Silver container in blob storage

# Register the SAS token for the output container before any stream starts.
output_sas = dbutils.secrets.get(scope="ingest00-meta002-sbox", key="CURATED-SAS-TOKEN")
spark.conf.set(
    "fs.azure.sas.{}.{}.blob.core.windows.net".format(
        output_container_name, output_storage_account_name
    ),
    output_sas,
)

### Stream 1: the complete, unfiltered table
# `df_amalgamated` holds the handle returned by start(), not a DataFrame.
df_amalgamated = (
    df.writeStream.format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/mnt/autoLoaderSchema/ARIAFTA/response/amalgamated")
    .trigger(availableNow=True)
    .start(amalgamated_responses_path)
)

In [0]:
### Create Record table
# Rows whose operation is "create_record", written as a Delta table.
# `df_create_upload` holds the handle returned by start(), not a DataFrame.
#
# NOTE(review): the checkpoint lives under ARMAPPEALS although this notebook
# reads ARIAFTA responses (the amalgamated stream checkpoints under ARIAFTA).
# Confirm no other job uses this checkpoint directory; moving an existing
# checkpoint would cause already-ingested files to be processed again.
df_create_upload = (
    df.filter(col("operation") == "create_record")
    .select(
        "id",
        "operation",
        "transaction_id",
        "relation_id",
        "a360_record_id",
        "process_time",
        "status",
        "input",
        "exception_description",
        "error_status",
        "_file_path",
        "_file_modification_time",
    )
    .writeStream.format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/mnt/autoLoaderSchema/ARMAPPEALS/response/create_record")
    .trigger(availableNow=True)
    .start(create_record_responses_path)
)

In [0]:
### Upload File table
# Rows whose operation is "upload_new_file", including the file detail columns
# (a360_file_id, file_size, s_md5, s_sha256), written as a Delta table.
# `df_upload` holds the handle returned by start(), not a DataFrame.
#
# NOTE(review): the checkpoint lives under ARMAPPEALS although this notebook
# reads ARIAFTA responses — confirm no other job uses this checkpoint directory.
df_upload = (
    df.filter(col("operation") == "upload_new_file")
    .select(
        "id",
        "operation",
        "transaction_id",
        "relation_id",
        "a360_record_id",
        "process_time",
        "status",
        "input",
        "exception_description",
        "error_status",
        "a360_file_id",
        "file_size",
        "s_md5",
        "s_sha256",
        "_file_path",
        "_file_modification_time",
    )
    .writeStream.format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/mnt/autoLoaderSchema/ARMAPPEALS/response/upload_file")
    .trigger(availableNow=True)
    .start(upload_file_responses_path)
)

In [0]:
### Input Upload table
# Rows whose operation is "input_upload", written as a Delta table.
# `df_input_upload` holds the handle returned by start(), not a DataFrame.
#
# NOTE(review): the checkpoint lives under ARMAPPEALS although this notebook
# reads ARIAFTA responses — confirm no other job uses this checkpoint directory.
df_input_upload = (
    df.filter(col("operation") == "input_upload")
    .select(
        "id",
        "operation",
        "timestamp",
        "status",
        "exception_description",
        "error_status",
        "filename",
        "submission_folder",
        "file_hash",
        "_file_path",
        "_file_modification_time",
    )
    .writeStream.format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/mnt/autoLoaderSchema/ARMAPPEALS/response/input_upload")
    .trigger(availableNow=True)
    .start(input_upload_responses_path)
)

In [None]:
# Wait for ALL four availableNow streams to finish, allowing at most 15 minutes
# in total, then stop anything that is still running.
#
# spark.streams.awaitAnyTermination() returns as soon as the FIRST query
# terminates (or immediately if one already has), so using it here let the
# `finally` block stop the remaining streams while they could still be
# processing their backlog. Each query is therefore awaited individually
# against one shared deadline.
STREAM_TIMEOUT_SECONDS = 900

# Capture the StreamingQuery handles up front: `df_input_upload` is rebound to
# a batch DataFrame further down in this cell.
streaming_queries = {
    "amalgamated": df_amalgamated,
    "input_upload": df_input_upload,
    "create_record": df_create_upload,
    "upload_file": df_upload,
}

deadline = time.monotonic() + STREAM_TIMEOUT_SECONDS
try:
    for query in streaming_queries.values():
        remaining_seconds = deadline - time.monotonic()
        # awaitTermination() rejects non-positive timeouts, so only wait while
        # there is budget left; it raises if the query terminated with an error.
        if remaining_seconds > 0:
            query.awaitTermination(remaining_seconds)
finally:
    # Stop every stream; this also runs when a query failed.
    for query_name, query in streaming_queries.items():
        if query.isActive:
            print(f"Stream '{query_name}' did not finish within {STREAM_TIMEOUT_SECONDS}s; stopping it")
        query.stop()
    print("All streams have been stopped")


# Read the written Delta tables back as batch DataFrames and report row counts.
# NOTE: from here on `df_input_upload` is a DataFrame, no longer a stream
# handle, so this cell cannot be re-run on its own without re-running the
# write-stream cells first.
df_input_upload = spark.read.format("delta").load(input_upload_responses_path)

# Read the response data for create_record_upload
df_create_record_upload = spark.read.format("delta").load(create_record_responses_path)

# Read the response data for upload_file
df_upload_file_upload = spark.read.format("delta").load(upload_file_responses_path)

# Read the response data for df_amalgamated_responses
df_amalgamated_responses = spark.read.format("delta").load(amalgamated_responses_path)

print(f"Records in df_input_upload: {df_input_upload.count()}")
print(f"Records in df_create_record_upload: {df_create_record_upload.count()}")
print(f"Records in df_upload_file_upload: {df_upload_file_upload.count()}")
print(f"Records in df_amalgamated_responses: {df_amalgamated_responses.count()}")

In [0]:
# End the notebook run, passing a completion message as the exit value.
dbutils.notebook.exit("Notebook execution completed successfully.")