In [0]:
from azure.storage.blob import BlobServiceClient
from pyspark.sql.functions import col, decode, split, element_at,udf,input_file_name,regexp_extract,expr
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, LongType

In [0]:

# Explicit schema applied to the response JSON files read by Auto Loader.
# Columns are listed as (name, Spark type); every field is declared nullable.
_response_columns = [
    ("operation", StringType),
    ("transaction_id", StringType),
    ("relation_id", StringType),
    ("a360_record_id", StringType),
    ("process_time", TimestampType),
    ("status", IntegerType),
    ("input", StringType),  # Contains nested JSON as a string
    ("exception_description", StringType),
    ("error_status", StringType),
    ("a360_file_id", StringType),
    ("file_size", LongType),
    ("s_md5", StringType),
    ("s_sha256", StringType),
    ("timestamp", TimestampType),  # may be used as process_time
    ("filename", StringType),
    ("submission_folder", StringType),
    ("file_hash", StringType),
]

expected_schema = StructType(
    [
        StructField(column_name, column_type(), True)
        for column_name, column_type in _response_columns
    ]
)

In [0]:
#### Set up Auto Loader job

def _wasbs_url(container, account, sub_path):
    """Return the wasbs:// URL of `sub_path` inside `container` of storage account `account`."""
    return f"wasbs://{container}@{account}.blob.core.windows.net/{sub_path}"


# ---- Source: response files in the dropzone container ----
storage_account_name = "a360c2x2555dz"
container_name = "dropzone"
sub_dir = "ARIATD/response"
input_path = _wasbs_url(container_name, storage_account_name, sub_dir)

# Register the SAS token so Spark can read the source container over wasbs.
# NOTE(review): the secret key is "ARIAJR-SAS-TOKEN" while the path is under ARIATD —
# confirm this token is the intended one for this dropzone folder.
sas_token = dbutils.secrets.get(scope="ingest00-meta002-sbox", key="ARIAJR-SAS-TOKEN")
spark.conf.set(
    f"fs.azure.sas.{container_name}.{storage_account_name}.blob.core.windows.net",
    sas_token,
)

# schema_location = "/mnt/autoLoaderSchema/ARMJOH/response/read_stream"

# Regex matching file names that end with .rsp
# NOTE(review): not referenced by any of the cells visible in this notebook — confirm
# whether it was meant to be applied to the reader.
file_regex = ".*\\.rsp$"

# ---- Sink: Delta tables in the silver container of the curated account ----
output_container_name = "silver"
output_storage_account_name = "ingest00curatedsbox"

# One sub-directory per output table.
output_subdir_amalgamated_responses = "ARIADM/ARM/response/TD/amalgamated_responses"
output_subdir_input_upload = "ARIADM/ARM/response/TD/input_upload"
output_subdir_create_record_upload = "ARIADM/ARM/response/TD/create_record"
output_subdir_upload_file_upload = "ARIADM/ARM/response/TD/upload_file"

# Full wasbs:// target path for each output table.
amalgamated_responses_path = _wasbs_url(
    output_container_name, output_storage_account_name, output_subdir_amalgamated_responses
)
input_upload_responses_path = _wasbs_url(
    output_container_name, output_storage_account_name, output_subdir_input_upload
)
create_record_responses_path = _wasbs_url(
    output_container_name, output_storage_account_name, output_subdir_create_record_upload
)
upload_file_responses_path = _wasbs_url(
    output_container_name, output_storage_account_name, output_subdir_upload_file_upload
)

In [0]:
# Service principal credentials, read from the Databricks secret scope
_secret_scope = "ingest00-meta002-sbox"
client_id = dbutils.secrets.get(scope=_secret_scope, key="SERVICE-PRINCIPLE-CLIENT-ID")
client_secret = dbutils.secrets.get(scope=_secret_scope, key="SERVICE-PRINCIPLE-CLIENT-SECRET")
tenant_id = dbutils.secrets.get(scope=_secret_scope, key="SERVICE-PRINCIPLE-TENANT-ID")

# Storage account that holds the streaming checkpoints
checkpoint_storage = "ingest00xcuttingsbox"

# OAuth (client-credentials) settings, keyed by the Spark conf prefix that precedes
# "<account>.dfs.core.windows.net". Insertion order is the order they are applied in.
_oauth_settings = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": client_id,
    "fs.azure.account.oauth2.client.secret": client_secret,
    "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}

# Apply the same settings to the checkpoint account first, then the output account.
for _account in (checkpoint_storage, output_storage_account_name):
    for _conf_prefix, _conf_value in _oauth_settings.items():
        spark.conf.set(f"{_conf_prefix}.{_account}.dfs.core.windows.net", _conf_value)

# Root folder for the per-stream checkpoints.
# NOTE: this value ends with "/" and the streaming cells append "/<name>" to it,
# so the composed checkpoint URIs contain a double slash.
check_point_path = "abfss://db-rsp-checkpoint@ingest00xcuttingsbox.dfs.core.windows.net/ARMTD/RSP/"

# Location passed to Auto Loader as cloudFiles.schemaLocation.
schema_location = "abfss://db-rsp-checkpoint@ingest00xcuttingsbox.dfs.core.windows.net/ARMTD/RSP/schema"

In [0]:
### Run autoloader read stream

# Reader options for the Auto Loader ("cloudFiles") source.
autoloader_options = {
    "cloudFiles.format": "json",
    "cloudFiles.schemaLocation": schema_location,
    "multiline": "true",
    "cloudFiles.schemaEvolutionMode": "none",
    # NOTE(review): checkpointLocation is normally a writeStream option; it is kept
    # here unchanged — confirm whether the reader actually uses it.
    "checkpointLocation": f"{check_point_path}/rsp_readStream",
}

# Streaming source over the response folder, parsed with the fixed schema.
raw_responses = (
    spark.readStream.format("cloudFiles")
    .schema(expected_schema)
    .options(**autoloader_options)
    .load(input_path)
)

# Keep every parsed column, expose the source file path and modification time,
# and tag each row with a generated id.
# NOTE(review): every writeStream started from `df` is its own query, so uuid() is
# presumably evaluated separately per sink — confirm whether ids are expected to
# match between the output tables.
# .withColumn("file_name", regexp_extract(input_file_name(),"response\/(.*)",1))
df = raw_responses.select(
    "*",
    col("_metadata.file_path").alias("_file_path"),
    col("_metadata.file_modification_time").alias("_file_modification_time"),
).withColumn("id", expr("uuid()"))

In [0]:
# dbutils.fs.rm("/mnt/autoLoaderSchema/ARMTD/response", recurse=True)

In [0]:
### 4 write streams to Silver container in blob storage

# Register the SAS token for the output (silver) container.
output_sas = dbutils.secrets.get(scope="ingest00-meta002-sbox", key="CURATED-SAS-TOKEN")

spark.conf.set(
    f"fs.azure.sas.{output_container_name}.{output_storage_account_name}.blob.core.windows.net",
    output_sas,
)

### first save the complete table

# Append every row of the stream, unfiltered, to the amalgamated Delta table.
# `df_complete` holds the value returned by .start() (it is .stop()-ed later on).
df_complete = (
    df.writeStream
    .queryName("amalgamated")
    .format("delta")
    .option("checkpointLocation", f"{check_point_path}/amalgamated")
    .outputMode("append")
    # .trigger(availableNow=True)  -- left disabled
    .start(amalgamated_responses_path)
)

In [0]:
### Create Record table

# Columns kept for rows whose operation is "create_record".
create_record_columns = [
    "id",
    "operation",
    "transaction_id",
    "relation_id",
    "a360_record_id",
    "process_time",
    "status",
    "input",
    "exception_description",
    "error_status",
    "_file_path",
    "_file_modification_time",
]

# Project, filter on the operation, and append to the create_record Delta table.
df_create_upload = (
    df.select(*create_record_columns)
    .where(col("operation") == "create_record")
    .writeStream
    .queryName("create_record")
    .format("delta")
    .option("checkpointLocation", f"{check_point_path}/create_record")
    .outputMode("append")
    .start(create_record_responses_path)
)

In [0]:
### upload file table

# Columns kept for rows whose operation is "upload_new_file".
upload_file_columns = [
    "id",
    "operation",
    "transaction_id",
    "relation_id",
    "a360_record_id",
    "process_time",
    "status",
    "input",
    "exception_description",
    "error_status",
    "a360_file_id",
    "file_size",
    "s_md5",
    "s_sha256",
    "_file_path",
    "_file_modification_time",
]

# Project, filter on the operation, and append to the upload_file Delta table.
df_upload = (
    df.select(*upload_file_columns)
    .where(col("operation") == "upload_new_file")
    .writeStream
    .queryName("upload_new_file")
    .format("delta")
    .option("checkpointLocation", f"{check_point_path}/upload_file")
    .outputMode("append")
    .start(upload_file_responses_path)
)

In [0]:
### Input upload table

# Columns kept for rows whose operation is "input_upload".
input_upload_columns = [
    "id",
    "operation",
    "timestamp",
    "status",
    "exception_description",
    "error_status",
    "filename",
    "submission_folder",
    "file_hash",
    "_file_path",
    "_file_modification_time",
]

# Project, filter on the operation, and append to the input_upload Delta table.
df_input_upload = (
    df.select(*input_upload_columns)
    .where(col("operation") == "input_upload")
    .writeStream
    .queryName("input_upload")
    .format("delta")
    .option("checkpointLocation", f"{check_point_path}/input_upload")
    .outputMode("append")
    .start(input_upload_responses_path)
)

In [None]:
# Wait up to 15 minutes for any stream to terminate on its own, then stop all of them.

STREAM_WAIT_SECONDS = 900  # 15 minutes

# Handles returned by .start() in the write-stream cells, keyed by their queryName.
# Dict order is the order in which the streams are stopped.
streaming_queries = {
    "amalgamated": df_complete,
    "input_upload": df_input_upload,
    "create_record": df_create_upload,
    "upload_new_file": df_upload,
}

try:
    spark.streams.awaitAnyTermination(STREAM_WAIT_SECONDS)
finally:
    # Attempt to stop every stream even when one stop() call fails; otherwise the
    # first failure would leave the remaining streams running.
    stop_failures = {}
    for query_name, query in streaming_queries.items():
        try:
            query.stop()
        except Exception as stop_error:
            stop_failures[query_name] = stop_error
            print(f"Failed to stop stream '{query_name}': {stop_error}")

    # Still fail the cell if any stream could not be stopped, but only after
    # all of them have been attempted.
    if stop_failures:
        raise RuntimeError(
            f"Could not stop stream(s): {', '.join(stop_failures)}"
        )
    print("All streams have been stopped")


# Read the written Delta tables back to report their row counts.
# NOTE: `df_input_upload` is rebound here from the streaming handle to a batch
# DataFrame, so re-running this cell on its own will fail when it tries to stop it.
df_input_upload = spark.read.format("delta").load(input_upload_responses_path)

# Read the response data for create_record_upload
df_create_record_upload = spark.read.format("delta").load(create_record_responses_path)

# Read the response data for upload_file
df_upload_file_upload = spark.read.format("delta").load(upload_file_responses_path)

# Read the response data for df_amalgamated_responses
df_amalgamated_responses = spark.read.format("delta").load(amalgamated_responses_path)

print(f"Records in df_input_upload: {df_input_upload.count()}")
print(f"Records in df_create_record_upload: {df_create_record_upload.count()}")
print(f"Records in df_upload_file_upload: {df_upload_file_upload.count()}")
print(f"Records in df_amalgamated_responses: {df_amalgamated_responses.count()}")

In [0]:
# End the notebook run here, passing this message as the notebook's exit value.
# NOTE(review): the cells below (the "Appendix") look like ad-hoc inspection helpers —
# presumably they are not meant to run as part of the job; confirm.
dbutils.notebook.exit("Notebook execution completed successfully.")

## Appendix

In [0]:

# from pyspark.sql.functions import col, from_unixtime

# files_df = spark.createDataFrame(dbutils.fs.ls("/mnt/dropzoneariatd/ARIATD/submission/"))
# files_df = files_df.withColumn("modificationTime", from_unixtime(col("modificationTime") / 1000).cast("timestamp"))

# display(files_df.orderBy(col("modificationTime").desc()))

In [0]:
# from pyspark.sql.functions import col, from_unixtime

# files_df = spark.createDataFrame(dbutils.fs.ls("/mnt/dropzoneariatd/ARIATD/submission/"))
# files_df = files_df.withColumn("modificationTime", from_unixtime(col("modificationTime") / 1000).cast("timestamp"))

# display(files_df.orderBy(col("modificationTime").desc()))

In [0]:
# from pyspark.sql.functions import col, from_unixtime

# files_df = spark.createDataFrame(dbutils.fs.ls("/mnt/dropzoneariatd/ARIATD/response/"))
# files_df = files_df.withColumn("modificationTime", from_unixtime(col("modificationTime") / 1000).cast("timestamp"))

# display(files_df.orderBy(col("modificationTime").desc()))

In [0]:


# json_count = files_df.filter(col("path").endswith(".json")).count()
# html_count = files_df.filter(col("path").endswith(".html")).count()
# a360_count = files_df.filter(col("path").endswith(".a360")).count()

# display(json_count)
# display(html_count)
# display(a360_count)

In [0]:
# dbutils.fs.ls("/mnt/dropzoneariatd/ARIATD/response")

In [0]:
# dbutils.fs.rm("/mnt/autoLoaderSchema/ARMTD/response", True)

In [0]:
# dbutils.fs.ls("/mnt/ingest00curatedsboxsilver/ARIADM/ARM/response/APPEALS/")

In [0]:
# dbutils.fs.rm("/mnt/ingest00curatedsboxsilver/ARIADM/ARM/response/APPEALS/", True)

In [0]:
# display(dbutils.fs.ls("/mnt/dropzoneariatd/ARIATD/response/"))

In [0]:
# dbutils.secrets.list(scope="ingest00-meta002-sbox")

In [0]:

# from azure.storage.blob import ContainerClient

# sas_token = dbutils.secrets.get(scope="ingest00-meta002-sbox", key="ARIATD-SAS-TOKEN")
# storage_account_name = "a360c2x2555dz"
# container_name = "dropzone"
# sub_dir = "ARIATD/submission"
# account_url = f"https://{storage_account_name}.blob.core.windows.net"


In [0]:
# html_df = spark.sql("""
# select CaseNo, HTMLContent as content_str, substring_index(HTMLFileName, '/', -1) as file_path from ariadm_arm_td.gold_td_iris_with_html
# where CaseNo = 'FY/46144/2015'
# """)

# json_df = spark.sql("""
#  select CaseNo, JSONcollection  as content_str, substring_index(JSONFileName, '/', -1) as file_path from ariadm_arm_td.gold_td_iris_with_Json
#   where CaseNo = 'FY/46144/2015';
# """)

# a360_df = spark.sql("""
# select CaseNo, A360Content  as content_str, substring_index(A360FileName, '/', -1) as file_path from ariadm_arm_td.gold_td_iris_with_a360
#  where CaseNo = 'FY/46144/2015';
# """)

# display(html_df)
# display(json_df)
# display(a360_df)

In [0]:

# from datetime import datetime

# results = []

# df = a360_df


# # Callback generator to capture responses for each file upload
# def capture_response(result):
#     def hook(response):
#         http_response = response.http_response
#         result["http_response"] = http_response.status_code
#         result["http_message"] = getattr(http_response, "reason", "No reason provided")
#         result["timestamp"] = datetime.now().isoformat()
#     return hook

# try:
#     # Create a ContainerClient using a container-level SAS token
#     container_client = ContainerClient(
#         account_url=account_url,
#         container_name=container_name,
#         credential=sas_token
#     )
#     print("Container client created")

#     for row in df.collect():
#         message = row.content_str
#         file_path = row.file_path

#         file_name= f"{sub_dir}/{file_path}"

#         blob_client = container_client.get_blob_client(file_name)

#         result = {
#             "filename": file_name,
#             "http_response": None,
#             "timestamp": None,
#             "http_message": None
#         }

#         print("uploading message to blob storage",file_name)
#         blob_client.upload_blob(message,overwrite=True,raw_response_hook=capture_response(result))
#         print("upload complete")

#         results.append(result)


# except Exception as e:
#     print("Failed to connect to blob storage:", e)



In [0]:
# display(dbutils.fs.ls("/mnt/dropzoneariatd/ARIATD/submission/"))

In [0]:
# dbutils.fs.ls("/mnt/dropzoneariatd/ARIATD/response")


In [0]:
# dbutils.fs.ls("/mnt/dropzoneariatd/ARIATD/submission/")

In [0]:
# from pyspark.sql.functions import col, from_unixtime

# files_df = spark.createDataFrame(dbutils.fs.ls("/mnt/dropzoneariajr/ARIAJR/submission/"))
# files_df = files_df.withColumn("modificationTime", from_unixtime(col("modificationTime") / 1000).cast("timestamp"))

# display(files_df.orderBy(col("modificationTime").desc()))

In [0]:
# files_count = files_df.filter(col("path").like("%.json") | col("path").like("%.html") | col("path").like("%.a360")).count()
# display(files_count)

In [0]:
# from pyspark.sql.functions import col, from_unixtime

# files_df = spark.createDataFrame(dbutils.fs.ls("/mnt/dropzoneariajr/ARIAJR/response"))
# files_df = files_df.withColumn("modificationTime", from_unixtime(col("modificationTime") / 1000).cast("timestamp"))

# display(files_df.orderBy(col("modificationTime").desc()))

In [0]:
# Mounted (/mnt) locations of the TD audit tables.
audit_delta_path = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/AUDIT/TD/td_cr_audit_table"
ack_path = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/AUDIT/TD/td_ack_audit"

# Mounted locations of the response tables.
# NOTE: these rebind the output_subdir_* names that the set-up cell defined as
# sub-directories relative to the container; here they are full /mnt paths.
output_subdir_input_upload = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/response/TD/input_upload"
output_subdir_create_record_upload = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/response/TD/create_record"
output_subdir_upload_file_upload = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/response/TD/upload_file"
output_subdir_amalgamated_responses = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/response/TD/amalgamated_responses"

# Load the two audit Delta tables (td_cr_audit_table first, then td_ack_audit).
df_audit, df_ack = (
    spark.read.format("delta").load(table_path)
    for table_path in (audit_delta_path, ack_path)
)

# Disabled reads of the response tables, kept for ad-hoc inspection:
# df_input_upload = spark.read.format("parquet").load(output_subdir_input_upload)
# df_create_record_upload = spark.read.format("parquet").load(output_subdir_create_record_upload)
# df_upload_file_upload = spark.read.format("parquet").load(output_subdir_upload_file_upload)
# df_amalgamated_responses = spark.read.format("delta").load(output_subdir_amalgamated_responses)

# Render the audit tables with the Databricks display helper.
for audit_frame in (df_audit, df_ack):
    display(audit_frame)
# display(df_input_upload)
# display(df_create_record_upload)
# display(df_upload_file_upload)
# display(df_amalgamated_responses)