In [0]:
from azure.storage.blob import BlobServiceClient
from pyspark.sql.functions import col, decode, split, element_at,udf,input_file_name,regexp_extract,expr
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, LongType


In [0]:
import time

# time.sleep(600)

In [0]:

# Fixed schema applied to every response file. All columns are nullable; the
# column order below is the order of the fields in the resulting StructType.
expected_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("operation", StringType()),
        ("transaction_id", StringType()),
        ("relation_id", StringType()),
        ("a360_record_id", StringType()),
        ("process_time", TimestampType()),
        ("status", IntegerType()),
        ("input", StringType()),  # Contains nested JSON as a string
        ("exception_description", StringType()),
        ("error_status", StringType()),
        ("a360_file_id", StringType()),
        ("file_size", LongType()),
        ("s_md5", StringType()),
        ("s_sha256", StringType()),
        ("timestamp", TimestampType()),  # may be used as process_time
        ("filename", StringType()),
        ("submission_folder", StringType()),
        ("file_hash", StringType()),
    ]
])

In [None]:
# Load the environment settings from the shared config file.
config = spark.read.option("multiline", "true").json("dbfs:/configs/config.json")

# Fetch the first row once: every first() call runs a separate Spark action.
config_row = config.first()
if config_row is None:
    # first() returns None for an empty DataFrame; fail with a clear message
    # instead of a TypeError when indexing None.
    raise ValueError("dbfs:/configs/config.json contains no configuration record")

env_name = config_row["env"].strip().lower()
lz_key = config_row["lz_key"].strip().lower()

print(f"env_code: {lz_key}")  # This won't be redacted
print(f"env_name: {env_name}")  # This won't be redacted

# Used by later cells as the secret scope passed to dbutils.secrets.get.
KeyVault_name = f"ingest{lz_key}-meta002-{env_name}"
print(f"KeyVault_name: {KeyVault_name}")

In [0]:
#### Set up Auto Loader job

# --- Source: response files in the drop zone container (SAS authenticated) ---
storage_account_name = "a360c2x2555dz"
container_name = "dropzone"
sub_dir = "ARIAFTA/response"
sas_token = dbutils.secrets.get(scope=KeyVault_name, key="ARIAFTA-SAS-TOKEN")

# Register the SAS token for the source container before any read touches it.
spark.conf.set(
    f"fs.azure.sas.{container_name}.{storage_account_name}.blob.core.windows.net",
    sas_token,
)

input_path = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/{sub_dir}"

# Regex matching file names that end with .rsp
# (not referenced by any other visible cell of this notebook).
file_regex = ".*\\.rsp$"

# --- Destination: Delta tables under the silver container of the curated account ---
output_container_name = "silver"
output_storage_account_name = f"ingest{lz_key}curated{env_name}"

# Common URL prefix shared by all four output tables.
silver_root = f"wasbs://{output_container_name}@{output_storage_account_name}.blob.core.windows.net"

output_subdir_amalgamated_responses = "ARIADM/ARM/response/APPEALS/ARIAFTA/amalgamated_responses"
amalgamated_responses_path = f"{silver_root}/{output_subdir_amalgamated_responses}"

output_subdir_input_upload = "ARIADM/ARM/response/APPEALS/ARIAFTA/input_upload"
input_upload_responses_path = f"{silver_root}/{output_subdir_input_upload}"

output_subdir_create_record_upload = "ARIADM/ARM/response/APPEALS/ARIAFTA/create_record"
create_record_responses_path = f"{silver_root}/{output_subdir_create_record_upload}"

output_subdir_upload_file_upload = "ARIADM/ARM/response/APPEALS/ARIAFTA/upload_file"
upload_file_responses_path = f"{silver_root}/{output_subdir_upload_file_upload}"

In [0]:
# Service principal credentials, read from the secret scope named by KeyVault_name.
client_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-ID")
client_secret = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-SECRET")
tenant_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-TENANT-ID")

# Storage account that holds the streaming checkpoints and the Auto Loader schema.
checkpoint_storage = f"ingest{lz_key}xcutting{env_name}"

# Spark config for checkpoint storage: OAuth client-credentials access over ABFS.
spark.conf.set(f"fs.azure.account.auth.type.{checkpoint_storage}.dfs.core.windows.net", "OAuth")
spark.conf.set(f"fs.azure.account.oauth.provider.type.{checkpoint_storage}.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
spark.conf.set(f"fs.azure.account.oauth2.client.id.{checkpoint_storage}.dfs.core.windows.net", client_id)
spark.conf.set(f"fs.azure.account.oauth2.client.secret.{checkpoint_storage}.dfs.core.windows.net", client_secret)
spark.conf.set(f"fs.azure.account.oauth2.client.endpoint.{checkpoint_storage}.dfs.core.windows.net", f"https://login.microsoftonline.com/{tenant_id}/oauth2/token")

# Both locations are built from checkpoint_storage so they always point at the
# account the OAuth settings above were registered for. The schema location
# previously spelled the account as "inges{lz_key}0xcutting{env_name}", which
# does not match checkpoint_storage.
# The trailing slash on check_point_path is kept as-is: the write-stream
# checkpoint locations are derived from this exact string.
check_point_path = f"abfss://db-rsp-checkpoint@{checkpoint_storage}.dfs.core.windows.net/APPEALS/ARIAFTA/RSP/"

schema_location = f"abfss://db-rsp-checkpoint@{checkpoint_storage}.dfs.core.windows.net/APPEALS/ARIAFTA/RSP/schema"

In [0]:
### Run autoloader read stream

# Reader options for the cloudFiles (Auto Loader) source: multi-line JSON
# input with schema evolution disabled.
reader_options = {
    "cloudFiles.format": "json",
    "cloudFiles.schemaLocation": schema_location,
    "multiline": "true",
    "cloudFiles.schemaEvolutionMode": "none",
    "checkpointLocation": f"{check_point_path}/rsp_readStream",
}

response_stream = (
    spark.readStream
    .format("cloudFiles")
    .schema(expected_schema)
    .options(**reader_options)
    .load(input_path)
)

# Expose the source file path and modification time as top-level columns.
source_file_path = col("_metadata.file_path").alias("_file_path")
source_file_mtime = col("_metadata.file_modification_time").alias("_file_modification_time")

# Every row additionally receives a generated UUID in the "id" column.
df = (
    response_stream
    .select("*", source_file_path, source_file_mtime)
    .withColumn("id", expr("uuid()"))
)

In [0]:
# Split the response stream into one stream per value of the "operation" column.
operation = col("operation")

df_cr = df.where(operation == "create_record")
df_iu = df.where(operation == "input_upload")
df_uf = df.where(operation == "upload_new_file")

In [0]:
### 4 write streams to Silver container in blob storage

# Register the SAS token for the output (silver) container.
output_sas = dbutils.secrets.get(scope=KeyVault_name, key="CURATED-SAS-TOKEN")

spark.conf.set(
    f"fs.azure.sas.{output_container_name}.{output_storage_account_name}.blob.core.windows.net",
    output_sas,
)

### first save the complete table

# availableNow processes every file pending at start and then stops the query
# by itself. The previous fixed 60 second sleep followed by stop() could end
# the query before the backlog was written and hid any query failure.
# NOTE(review): trigger(availableNow=True) needs Spark 3.3+ (DBR 10.4 LTS+) - confirm the cluster runtime.
df_complete = (
    df.writeStream
    .format("delta")
    .option("checkpointLocation", f"{check_point_path}/amalgamated")
    .outputMode("append")
    .trigger(availableNow=True)
    .start(amalgamated_responses_path)
)

# Blocks until the backlog is written; raises if the query failed.
df_complete.awaitTermination()



In [0]:
### Input upload table

# Columns written to the input_upload Delta table.
input_upload_columns = [
    "id", "operation", "timestamp", "status", "exception_description",
    "error_status", "filename", "submission_folder", "file_hash",
    "_file_path", "_file_modification_time",
]

# availableNow processes every file pending at start and then stops the query
# by itself. The previous fixed 60 second sleep followed by stop() could end
# the query before the backlog was written and hid any query failure.
# NOTE(review): trigger(availableNow=True) needs Spark 3.3+ (DBR 10.4 LTS+) - confirm the cluster runtime.
df_input_upload = (
    df_iu.select(*input_upload_columns)
    .writeStream
    .format("delta")
    .option("checkpointLocation", f"{check_point_path}/input_upload")
    .outputMode("append")
    .trigger(availableNow=True)
    .start(input_upload_responses_path)
)

# Blocks until the backlog is written; raises if the query failed.
df_input_upload.awaitTermination()



In [0]:
### Create Record table

# Columns written to the create_record Delta table.
create_record_columns = [
    "id", "operation", "transaction_id", "relation_id", "a360_record_id",
    "process_time", "status", "input", "exception_description",
    "error_status", "_file_path", "_file_modification_time",
]

# availableNow processes every file pending at start and then stops the query
# by itself. The previous fixed 60 second sleep followed by stop() could end
# the query before the backlog was written and hid any query failure.
# NOTE(review): trigger(availableNow=True) needs Spark 3.3+ (DBR 10.4 LTS+) - confirm the cluster runtime.
df_create_upload = (
    df_cr.select(*create_record_columns)
    .writeStream
    .format("delta")
    .option("checkpointLocation", f"{check_point_path}/create_record")
    .outputMode("append")
    .trigger(availableNow=True)
    .start(create_record_responses_path)
)

# Blocks until the backlog is written; raises if the query failed.
df_create_upload.awaitTermination()

In [0]:
### upload file table

# Columns written to the upload_file Delta table.
upload_file_columns = [
    "id", "operation", "transaction_id", "relation_id", "a360_record_id",
    "process_time", "status", "input", "exception_description",
    "error_status", "a360_file_id", "file_size", "s_md5", "s_sha256",
    "_file_path", "_file_modification_time",
]

# availableNow processes every file pending at start and then stops the query
# by itself. The previous fixed 60 second sleep followed by stop() could end
# the query before the backlog was written and hid any query failure.
# NOTE(review): trigger(availableNow=True) needs Spark 3.3+ (DBR 10.4 LTS+) - confirm the cluster runtime.
# NOTE(review): the handle name df_create_upload is also assigned by the
# Create Record cell; it is kept here unchanged for compatibility - consider
# a dedicated name such as df_upload_file.
df_create_upload = (
    df_uf.select(*upload_file_columns)
    .writeStream
    .format("delta")
    .option("checkpointLocation", f"{check_point_path}/upload_file")
    .outputMode("append")
    .trigger(availableNow=True)
    .start(upload_file_responses_path)
)

# Blocks until the backlog is written; raises if the query failed.
df_create_upload.awaitTermination()

In [0]:
# Read the amalgamated Delta table back as a batch DataFrame and show it,
# as a check on what the streaming write produced.
df_amalgamated_test = spark.read.load(amalgamated_responses_path, format="delta")

df_amalgamated_test.display()

In [0]:
# End the notebook run here and hand the status message back to the caller.
exit_message = "Notebook completed successfully"
dbutils.notebook.exit(exit_message)

## Appendix

In [0]:
# dbutils.fs.ls("/mnt/dropzoneariafta/ARIAFTA/submission/")

In [0]:
# from pyspark.sql.functions import col, from_unixtime

# files_df = spark.createDataFrame(dbutils.fs.ls("/mnt/dropzoneariafta/ARIAFTA/submission/"))
# files_df = files_df.withColumn("modificationTime", from_unixtime(col("modificationTime") / 1000).cast("timestamp"))

# display(files_df.orderBy(col("modificationTime").desc()))

In [0]:
# from pyspark.sql.functions import col, from_unixtime

# files_df = spark.createDataFrame(dbutils.fs.ls("/mnt/dropzoneariafta/ARIAFTA/response/"))
# files_df = files_df.withColumn("modificationTime", from_unixtime(col("modificationTime") / 1000).cast("timestamp"))

# display(files_df.orderBy(col("modificationTime").desc()))

In [0]:
# from pyspark.sql.functions import col, from_unixtime

# files_df = spark.createDataFrame(dbutils.fs.ls("/mnt/dropzoneariafta/ARIAFTA/response/"))
# files_df = files_df.withColumn("modificationTime", from_unixtime(col("modificationTime") / 1000).cast("timestamp"))
# files_df = files_df.filter(col("path").contains("_0_"))

# display(files_df.orderBy(col("modificationTime").desc()))

In [0]:
# # Define the paths to the tables
# # audit_delta_path = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/AUDIT/APPEALS/ARIAFTA/apl_fta_cr_audit_table"
# # ack_path = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/AUDIT/APPEALS/ARIAFTA/fta_ack_audit"
# output_subdir_input_upload = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/response/APPEALS/ARIAFTA/input_upload"
# output_subdir_create_record_upload = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/response/APPEALS/ARIAFTA/create_record"
# output_subdir_upload_file_upload = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/response/APPEALS/ARIAFTA/upload_file"
# # output_subdir_amalgamated_responses = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/response/APPEALS/ARIAFTA/amalgamated_responses"

# # https://ingest00curatedsbox.blob.core.windows.net/silver/ARIADM/ARM/response/APPEALS/ARIAFTA/create_record/part-00000-27c33bc4-48d2-48fd-9949-e5a0a50c148f-c000.snappy.parquet

# # Read the Delta table for apl_fta_cr_audit_table
# # df_audit = spark.read.format("delta").load(audit_delta_path)

# # # Read the Delta table for fta_ack_audit
# # df_ack = spark.read.format("delta").load(ack_path)

# # Read the response data for input_upload
# df_input_upload = spark.read.format("delta").load(output_subdir_input_upload)

# # Read the response data for create_record_upload
# df_create_record_upload = spark.read.format("delta").load(output_subdir_create_record_upload)

# # Read the response data for upload_file
# df_upload_file_upload = spark.read.format("delta").load(output_subdir_upload_file_upload)


# # Read the response data for df_amalgamated_responses
# # df_amalgamated_responses = spark.read.format("delta").load(output_subdir_amalgamated_responses)

# # Display the DataFrames using Databricks display
# # display(df_audit)
# # display(df_ack)
# display(df_input_upload)
# display(df_create_record_upload)
# display(df_upload_file_upload)
# # display(df_amalgamated_responses)