In [0]:
from azure.storage.blob import BlobServiceClient
from pyspark.sql.functions import col, decode, split, element_at,udf,input_file_name,regexp_extract,expr
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, LongType


In [0]:
import time

# Block the notebook for ten minutes before any later cell runs.
# NOTE(review): the reason for this delay is not visible in this notebook —
# presumably it gives upstream response files time to land; confirm.
WAIT_SECONDS = 10 * 60
time.sleep(WAIT_SECONDS)

In [0]:

# Explicit schema applied to the JSON response documents read by the stream.
# Field order matters: it is the column order of the resulting DataFrame.
# Every field is declared nullable.
_response_fields = [
    ("operation", StringType()),
    ("transaction_id", StringType()),
    ("relation_id", StringType()),
    ("a360_record_id", StringType()),
    ("process_time", TimestampType()),
    ("status", IntegerType()),
    ("input", StringType()),  # Contains nested JSON as a string
    ("exception_description", StringType()),
    ("error_status", StringType()),
    ("a360_file_id", StringType()),
    ("file_size", LongType()),
    ("s_md5", StringType()),
    ("s_sha256", StringType()),
    ("timestamp", TimestampType()),  # may be used as process_time
    ("filename", StringType()),
    ("submission_folder", StringType()),
    ("file_hash", StringType()),
]

expected_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _response_fields]
)





In [0]:
#### Set up Auto Loader job

# --- Source: response files, read over wasbs using a SAS token ---
storage_account_name = "a360c2x2555dz"
container_name = "dropzone"
sub_dir = "ARIAJR/response"
sas_token = dbutils.secrets.get(scope="ingest00-meta002-sbox", key="ARIAJR-SAS-TOKEN")

input_path = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/{sub_dir}"

# Register the SAS token for the source container on the Spark session.
spark.conf.set(
    f"fs.azure.sas.{container_name}.{storage_account_name}.blob.core.windows.net",
    sas_token,
)

# Regex that matches file names ending with .rsp
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
file_regex = ".*\\.rsp$"

# --- Destination: one folder per output table, all under the same container ---
output_container_name = "silver"
output_storage_account_name = "ingest00curatedsbox"
_output_root = f"wasbs://{output_container_name}@{output_storage_account_name}.blob.core.windows.net"

output_subdir_amalgamated_responses = "ARIADM/ARM/response/JOH/amalgamated_responses"
output_subdir_input_upload = "ARIADM/ARM/response/JOH/input_upload"
output_subdir_create_record_upload = "ARIADM/ARM/response/JOH/create_record"
output_subdir_upload_file_upload = "ARIADM/ARM/response/JOH/upload_file"

amalgamated_responses_path = f"{_output_root}/{output_subdir_amalgamated_responses}"
input_upload_responses_path = f"{_output_root}/{output_subdir_input_upload}"
create_record_responses_path = f"{_output_root}/{output_subdir_create_record_upload}"
upload_file_responses_path = f"{_output_root}/{output_subdir_upload_file_upload}"

In [0]:
# Service principal credentials, fetched from the notebook's secret scope
_secret_scope = "ingest00-meta002-sbox"
client_id = dbutils.secrets.get(_secret_scope, "SERVICE-PRINCIPLE-CLIENT-ID")
client_secret = dbutils.secrets.get(_secret_scope, "SERVICE-PRINCIPLE-CLIENT-SECRET")
tenant_id = dbutils.secrets.get(_secret_scope, "SERVICE-PRINCIPLE-TENANT-ID")

# Storage account names
curated_storage = "ingest00curatedsbox"
checkpoint_storage = "ingest00xcuttingsbox"

# Spark config for checkpoint storage: each setting is keyed by
# "<setting>.<account>.dfs.core.windows.net". The dict keeps insertion order,
# so the settings are applied in the order listed.
_abfs_host = f"{checkpoint_storage}.dfs.core.windows.net"
_oauth_settings = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": client_id,
    "fs.azure.account.oauth2.client.secret": client_secret,
    "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}
for _setting, _value in _oauth_settings.items():
    spark.conf.set(f"{_setting}.{_abfs_host}", _value)

# Base location under which each write stream keeps its own checkpoint folder.
# NOTE(review): this value ends with "/" and the write cells append "/<name>",
# which yields "//" in the final URI — confirm that is harmless before
# changing it, since existing checkpoints live at the current location.
check_point_path = "abfss://db-ack-checkpoint@ingest00xcuttingsbox.dfs.core.windows.net/ARMJOH/RSP/"

# Passed to the reader as cloudFiles.schemaLocation.
schema_location = "abfss://db-ack-schema@ingest00xcuttingsbox.dfs.core.windows.net/ARMJOH/RSP/schema"

In [0]:
### Run autoloader read stream

# Reader options: JSON input, multiline parsing enabled, schema evolution mode "none".
_reader_options = {
    "cloudFiles.format": "json",
    "cloudFiles.schemaLocation": schema_location,
    "multiline": "true",
    "cloudFiles.schemaEvolutionMode": "none",
}

# Streaming source over input_path, parsed with the explicit schema.
_rsp_stream = (
    spark.readStream.format("cloudFiles")
    .schema(expected_schema)
    .options(**_reader_options)
    .load(input_path)
)

# Keep every parsed column, expose two file-metadata fields as ordinary
# columns, and append a generated "id" column.
# NOTE(review): uuid() is evaluated separately by every streaming query started
# from df, so the same source row will presumably receive a different id in
# each output table — confirm whether id is meant to link the tables.
df = (
    _rsp_stream
    .select(
        "*",
        col("_metadata.file_path").alias("_file_path"),
        col("_metadata.file_modification_time").alias("_file_modification_time"),
    )
    .withColumn("id", expr("uuid()"))
)
    

# display(df)

In [0]:
### 4 write streams to Silver container in blob storage

output_sas = dbutils.secrets.get(scope="ingest00-meta002-sbox", key="CURATED-SAS-TOKEN")

# Register the SAS token for the output container on the Spark session.
spark.conf.set(
    f"fs.azure.sas.{output_container_name}.{output_storage_account_name}.blob.core.windows.net",
    output_sas,
)

### first save the complete table
# Unfiltered stream: every column of df is appended to the amalgamated Delta
# location. df_complete holds whatever .start() returns for this query.
df_complete = (
    df.writeStream
    .format("delta")
    .option("checkpointLocation", f"{check_point_path}/amalgamated")
    .outputMode("append")
    .trigger(availableNow=True)
    .start(amalgamated_responses_path)
)

In [0]:
### Input upload table

# Columns kept for rows whose operation is "input_upload".
_input_upload_columns = [
    "id",
    "operation",
    "timestamp",
    "status",
    "exception_description",
    "error_status",
    "filename",
    "submission_folder",
    "file_hash",
    "_file_path",
    "_file_modification_time",
]

# Project, filter, then append to the input_upload Delta location using a
# dedicated checkpoint folder.
df_input_upload = (
    df.select(*_input_upload_columns)
    .filter(col("operation") == "input_upload")
    .writeStream
    .format("delta")
    .option("checkpointLocation", f"{check_point_path}/input_upload")
    .outputMode("append")
    .trigger(availableNow=True)
    .start(input_upload_responses_path)
)

In [0]:
### Create Record table

# Columns kept for rows whose operation is "create_record".
_create_record_columns = [
    "id",
    "operation",
    "transaction_id",
    "relation_id",
    "a360_record_id",
    "process_time",
    "status",
    "input",
    "exception_description",
    "error_status",
    "_file_path",
    "_file_modification_time",
]

# Project, filter, then append to the create_record Delta location using a
# dedicated checkpoint folder.
df_create_upload = (
    df.select(*_create_record_columns)
    .filter(col("operation") == "create_record")
    .writeStream
    .format("delta")
    .option("checkpointLocation", f"{check_point_path}/create_record")
    .outputMode("append")
    .trigger(availableNow=True)
    .start(create_record_responses_path)
)

In [0]:
### upload file table

# Columns kept for rows whose operation is "upload_new_file".
_upload_file_columns = [
    "id",
    "operation",
    "transaction_id",
    "relation_id",
    "a360_record_id",
    "process_time",
    "status",
    "input",
    "exception_description",
    "error_status",
    "a360_file_id",
    "file_size",
    "s_md5",
    "s_sha256",
    "_file_path",
    "_file_modification_time",
]

# Project, filter, then append to the upload_file Delta location using a
# dedicated checkpoint folder.
# The result is bound to its own name: this cell previously reassigned
# df_create_upload, which discarded the handle returned by .start() for the
# create_record stream started in the preceding cell.
df_upload_file = (
    df.select(*_upload_file_columns)
    .filter(col("operation") == "upload_new_file")
    .writeStream
    .format("delta")
    .option("checkpointLocation", f"{check_point_path}/upload_file")
    .outputMode("append")
    .trigger(availableNow=True)
    .start(upload_file_responses_path)
)