In [0]:
# Install the pinned shared_functions wheel from DBFS, then restart the Python
# process before the package is imported in the following cells.
!pip install /dbfs/FileStore/packages/shared_functions-0.5.2-py3-none-any.whl
dbutils.library.restartPython()

In [None]:
import shared_functions.paymentPending as PP
from shared_functions.DQRules import base_DQRules, build_rule_expression

In [0]:
import json
from datetime import datetime
from pyspark.sql.functions import *
import uk_postcodes_parsing
from pyspark.sql import functions as F
import os

In [0]:
# Read the environment settings. `first()` is a Spark action, so the row is
# fetched once and both values are taken from that single result.
config = spark.read.option("multiline", "true").json("dbfs:/configs/config.json")
config_row = config.first()
env_name = config_row["env"].strip().lower()
lz_key = config_row["lz_key"].strip().lower()

print(f"env_code: {lz_key}")  # This won't be redacted
print(f"env_name: {env_name}")  # This won't be redacted

# Secret scope name passed to dbutils.secrets.get in the cells below.
KeyVault_name = f"ingest{lz_key}-meta002-{env_name}"
print(f"KeyVault_name: {KeyVault_name}")

In [0]:
# Service principal credentials, read from the secret scope named by KeyVault_name
client_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-ID")
client_secret = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-SECRET")
tenant_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-TENANT-ID")

# Storage account names
curated_storage = f"ingest{lz_key}curated{env_name}"
checkpoint_storage = f"ingest{lz_key}xcutting{env_name}"
raw_storage = f"ingest{lz_key}raw{env_name}"
landing_storage = f"ingest{lz_key}landing{env_name}"
external_storage = f"ingest{lz_key}external{env_name}"

# Every storage account gets the same five OAuth (client-credentials) settings,
# so they are applied in one loop. Account order and key order are unchanged.
oauth_token_endpoint = f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"

for storage_account in (
    curated_storage,
    checkpoint_storage,
    raw_storage,
    landing_storage,
    external_storage,
):
    account_host = f"{storage_account}.dfs.core.windows.net"
    spark.conf.set(f"fs.azure.account.auth.type.{account_host}", "OAuth")
    spark.conf.set(f"fs.azure.account.oauth.provider.type.{account_host}", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
    spark.conf.set(f"fs.azure.account.oauth2.client.id.{account_host}", client_id)
    spark.conf.set(f"fs.azure.account.oauth2.client.secret.{account_host}", client_secret)
    spark.conf.set(f"fs.azure.account.oauth2.client.endpoint.{account_host}", oauth_token_endpoint)

In [0]:
# Target state this notebook processes, and the snake_case label used to build
# the DLT table names further down.
AppealState = "reasonsForAppealSubmitted"
output_name = "reasons_for_appeal_submitted"

# Setting variables for use in subsequent cells
# All three abfss paths point at the curated storage account (bronze / silver containers).
bronze_path = f"abfss://bronze@ingest{lz_key}curated{env_name}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/"
silver_path = f"abfss://silver@ingest{lz_key}curated{env_name}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/"
audit_path = f"abfss://silver@ingest{lz_key}curated{env_name}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/AUDIT/{AppealState}"
# Relative prefix (no scheme/container); used as the leading part of JSON_File_name.
gold_outputs = f"ARIADM/ACTIVE/CCD/APPEALS/{AppealState}"

# Print all variables
variables = {
    # "read_hive": read_hive,
    
    "bronze_path": bronze_path,
    "silver_path": silver_path,
    "audit_path": audit_path,
    "gold_outputs": gold_outputs,
    "key_vault": KeyVault_name,
    "AppealState": AppealState

}

display(variables)

In [0]:
import importlib
import shared_functions.paymentPending as PP
import shared_functions.appealSubmitted as APS
import shared_functions.AwaitingEvidenceRespondant_a as AERa
import shared_functions.AwaitingEvidenceRespondant_b as AERb
import shared_functions.reasonsForAppealSubmitted as RFPAS
import shared_functions.caseUnderReview as CUR
from shared_functions.DQRules import base_DQRules, build_rule_expression

In [0]:
from pyspark.sql.functions import col, lit

# Silver tables. Most are restricted to rows whose dv_targetState equals
# AppealState; silver_m2 and silver_c are loaded unfiltered in this cell.
silver_m1 = spark.table("ariadm_active_appeals.silver_appealcase_detail").filter(col("dv_targetState") == lit(AppealState)).distinct()
silver_m2 = spark.table("ariadm_active_appeals.silver_caseapplicant_detail")
silver_m3 = spark.table("ariadm_active_appeals.silver_status_detail").filter(col("dv_targetState") == lit(AppealState))
silver_m4 = spark.table("ariadm_active_appeals.silver_transaction_detail").filter(col("dv_targetState") == lit(AppealState))
silver_m5 = spark.table("ariadm_active_appeals.silver_link_detail").filter(col("dv_targetState") == lit(AppealState))
silver_m6 = spark.table("ariadm_active_appeals.silver_adjudicator_detail").filter(col("dv_targetState") == lit(AppealState))
silver_c = spark.table("ariadm_active_appeals.silver_appealcategory_detail")
silver_h = spark.table("ariadm_active_appeals.silver_history_detail").filter(col("dv_targetState") == lit(AppealState))

# Bronze lookup tables. Both remission names are kept because both spellings
# are referenced in this notebook; they point at the same de-duplicated frame.
bronze_remission_lookup_df = spark.table("ariadm_active_appeals.bronze_remissions").distinct()
bronze_remissions_lookup_df = bronze_remission_lookup_df

# NOTE(review): this cell used to assign bronze_countryFromAddress twice; the
# first assignment added an `lu_countryGovUkOocAdminJ` column but was overwritten
# by this plain read, so the plain read is what downstream cells received.
# Confirm that the lookup column is not required here.
bronze_countryFromAddress = spark.table("ariadm_active_appeals.bronze_countries_countryFromAddress")
bronze_HORef_cleansing = spark.table("ariadm_active_appeals.bronze_HORef_cleansing")
bronze_hearing_centres = spark.table("ariadm_active_appeals.bronze_hearing_centres")
bronze_derive_hearing_centres = spark.table("ariadm_active_appeals.bronze_derive_hearing_centres")

In [0]:
# Stand-alone run of RFPAS.appealType; rebinds the notebook-level df / df_audit.
df, df_audit = RFPAS.appealType(silver_m1)
# display(df)

In [0]:
# Stand-alone run of RFPAS.caseData; rebinds the notebook-level df / df_audit.
df,df_audit = RFPAS.caseData(silver_m1, silver_m2, silver_m3, silver_h, bronze_hearing_centres, bronze_derive_hearing_centres)
# display(df)

In [0]:
# Stand-alone run of RFPAS.flagsLabels; rebinds the notebook-level df / df_audit.
df, df_audit = RFPAS.flagsLabels(silver_m1, silver_m2, silver_c)
# display(df)

In [0]:
# Stand-alone run of RFPAS.legalRepDetails; rebinds the notebook-level df / df_audit.
df, df_audit = RFPAS.legalRepDetails(silver_m1)
# display(df)

In [0]:
# Stand-alone run of RFPAS.appellantDetails; rebinds the notebook-level df / df_audit.
df, df_audit = RFPAS.appellantDetails(silver_m1, silver_m2, silver_c, bronze_countryFromAddress,bronze_HORef_cleansing)
# display(df)

In [0]:
# Stand-alone run of RFPAS.homeOfficeDetails; rebinds the notebook-level df / df_audit.
df, df_audit = RFPAS.homeOfficeDetails(silver_m1, silver_m2, silver_c, bronze_HORef_cleansing)
# display(df)

In [0]:
# Stand-alone run of RFPAS.paymentType; rebinds the notebook-level df / df_audit.
df, df_audit = RFPAS.paymentType(silver_m1, silver_m4)
# display(df)

In [0]:
# Stand-alone run of RFPAS.partyID; rebinds the notebook-level df / df_audit.
df, df_audit = RFPAS.partyID(silver_m1, silver_m3, silver_c)
# display(df)

In [0]:
# Stand-alone run of RFPAS.remissionTypes; rebinds the notebook-level df / df_audit.
df, df_audit = RFPAS.remissionTypes(silver_m1, bronze_remission_lookup_df,silver_m4)
# display(df)

In [0]:
# Stand-alone run of RFPAS.sponsorDetails; rebinds the notebook-level df / df_audit.
df, df_audit = RFPAS.sponsorDetails(silver_m1, silver_c)
# display(df)

In [0]:
# Stand-alone run of RFPAS.hearingResponse; rebinds the notebook-level df / df_audit.
df, df_audit = RFPAS.hearingResponse(silver_m1, silver_m3, silver_m6)
# display(df)
# display(df_audit)

In [0]:
# Stand-alone run of RFPAS.general; rebinds the notebook-level df / df_audit.
df, df_audit = RFPAS.general(silver_m1, silver_m2, silver_m3, silver_h, bronze_hearing_centres, bronze_derive_hearing_centres)
# display(df)

In [0]:
# Stand-alone run of RFPAS.generalDefault. Unlike the other builders in this
# notebook its result is bound to a single name (no audit frame is unpacked).
df = RFPAS.generalDefault(silver_m1)
# display(df)

In [0]:
# Stand-alone run of RFPAS.documents; rebinds the notebook-level df / df_audit.
df, df_audit = RFPAS.documents(silver_m1)
# display(df)

In [0]:
# Stand-alone run of PP.caseState (from the paymentPending module) with the
# state passed as a literal; the literal has the same value as AppealState.
df, df_audit = PP.caseState(silver_m1,"reasonsForAppealSubmitted")
# display(df)

In [0]:
from pyspark.sql.types import StringType, StructType, ArrayType, MapType
from pyspark.sql.functions import col, lit, to_json, struct, concat, regexp_replace
from datetime import datetime

def mainPaymentPending(silver_segmentation, silver_m1, silver_m2, silver_m3, silver_c,silver_h, bronze_remissions, bronze_countryFromAddress, bronze_HORef_cleansing,bronze_hearing_centres,bronze_derive_hearing_centres, silver_m4=None, silver_m6=None):
    """Assemble the per-case JSON payload and the audit frame for reasonsForAppealSubmitted.

    Every RFPAS / PP builder is invoked once and its result is left-joined onto
    ``silver_segmentation`` by ``CaseNo``: once for the data frames and once for
    the audit frames.

    Args:
        silver_segmentation: Frame the joins start from; must contain ``CaseNo``.
        silver_m1, silver_m2, silver_m3, silver_c, silver_h: Silver frames
            forwarded to the builders.
        bronze_remissions, bronze_countryFromAddress, bronze_HORef_cleansing,
        bronze_hearing_centres, bronze_derive_hearing_centres: Bronze lookup
            frames forwarded to the builders.
        silver_m4: Optional frame forwarded to ``paymentType`` and
            ``remissionTypes``. When omitted, the notebook-level ``silver_m4``
            is used, which is what this function always did before the
            parameter existed.
        silver_m6: Optional frame forwarded to ``hearingResponse``. When
            omitted, the notebook-level ``silver_m6`` is used.

    Returns:
        tuple: ``(df_final, df_combined_audit)``. ``df_final`` is the joined data
        plus ``JSON_Content`` and ``JSON_File_name`` columns.

    Raises:
        NameError: If ``silver_m4`` / ``silver_m6`` is neither passed in nor
            defined at notebook level.
    """
    AppealState = "reasonsForAppealSubmitted"

    def _notebook_default(name, value):
        # Keeps the historical behaviour (reading a notebook global) for callers
        # that do not pass the frame explicitly.
        if value is not None:
            return value
        try:
            return globals()[name]
        except KeyError:
            raise NameError(f"name '{name}' is not defined") from None

    silver_m4 = _notebook_default("silver_m4", silver_m4)
    silver_m6 = _notebook_default("silver_m6", silver_m6)

    # Aggregate details
    AppealType_df, AppealType_df_audit = RFPAS.appealType(silver_m1)
    caseData_df, caseData_df_audit = RFPAS.caseData(silver_m1, silver_m2, silver_m3, silver_h, bronze_hearing_centres, bronze_derive_hearing_centres)
    flagsLabels_df, flagsLabels_df_audit = RFPAS.flagsLabels(silver_m1, silver_m2, silver_c)
    appellantDetails_df, appellantDetails_df_audit = RFPAS.appellantDetails(silver_m1, silver_m2, silver_c, bronze_countryFromAddress, bronze_HORef_cleansing)
    legalRepDetails_df, legalRepDetails_df_audit = RFPAS.legalRepDetails(silver_m1)
    partyID_df, partyID_df_audit = RFPAS.partyID(silver_m1, silver_m3, silver_c)
    payment_df, payment_df_audit = RFPAS.paymentType(silver_m1, silver_m4)
    homeOfficeDetails_df, homeOfficeDetails_df_audit = RFPAS.homeOfficeDetails(silver_m1, silver_m2, silver_c, bronze_HORef_cleansing)
    remissionTypes_df, remissionTypes_df_audit = RFPAS.remissionTypes(silver_m1, bronze_remissions, silver_m4)
    sponsorDetails_df, sponsorDetails_df_audit = RFPAS.sponsorDetails(silver_m1, silver_c)
    general_df, general_df_audit = RFPAS.general(silver_m1, silver_m2, silver_m3, silver_h, bronze_hearing_centres, bronze_derive_hearing_centres)
    generalDefault_df = RFPAS.generalDefault(silver_m1)
    documents_df, documents_df_audit = RFPAS.documents(silver_m1)
    caseState_df, caseState_df_audit = PP.caseState(silver_m1, AppealState)
    hearingResponse_df, hearingResponse_df_audit = RFPAS.hearingResponse(silver_m1, silver_m3, silver_m6)

    # Join order is kept exactly as before because it determines column order.
    data_parts = [
        AppealType_df,
        caseData_df,
        flagsLabels_df,
        appellantDetails_df,
        legalRepDetails_df,
        sponsorDetails_df,
        partyID_df,
        payment_df,
        remissionTypes_df,
        homeOfficeDetails_df,
        caseState_df,
        hearingResponse_df,
        general_df,
        generalDefault_df,
        documents_df,
    ]
    # generalDefault returns no audit frame, so it has no entry here.
    audit_parts = [
        AppealType_df_audit,
        caseData_df_audit,
        flagsLabels_df_audit,
        appellantDetails_df_audit,
        legalRepDetails_df_audit,
        sponsorDetails_df_audit,
        partyID_df_audit,
        payment_df_audit,
        remissionTypes_df_audit,
        homeOfficeDetails_df_audit,
        caseState_df_audit,
        hearingResponse_df_audit,
        general_df_audit,
        documents_df_audit,
    ]

    # Left-join every data frame onto the segmentation frame by CaseNo.
    df_combined = silver_segmentation
    for part in data_parts:
        df_combined = df_combined.join(part, on="CaseNo", how="left")

    # Same joins for the audit frames.
    df_combined_audit = silver_segmentation
    for part in audit_parts:
        df_combined_audit = df_combined_audit.join(part, on="CaseNo", how="left")

    # Timestamp taken when the function is called; becomes a folder in the file name.
    Datetime_name = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

    # JSON_Content packs every column except CaseNo into one JSON string.
    # NOTE(review): the original comment says null-valued columns are omitted;
    # that relies on to_json's default handling of null fields - confirm.
    # JSON_File_name replaces "/" in CaseNo with "_" to make it path-safe.
    df_final = df_combined.withColumn(
        "JSON_Content", to_json(struct(*df_combined.drop(col("CaseNo")).columns))
    ).withColumn(
        "JSON_File_name", concat(lit(f"{gold_outputs}/{Datetime_name}/JSON/APPEALS_"), regexp_replace(col("CaseNo"), "/", "_"), lit(".json"))
    )

    return df_final, df_combined_audit

########### Test ##########

# Rebinds several notebook-level frames before the trial run. Here silver_m2 and
# silver_c are filtered on dv_targetState, and bronze_countryFromAddress gains an
# lu_countryGovUkOocAdminJ column copied from countryGovUkOocAdminJ.
silver_m1 = spark.table("ariadm_active_appeals.silver_appealcase_detail").filter(col("dv_targetState") == lit(AppealState)).distinct()
silver_m2 =  spark.table("ariadm_active_appeals.silver_caseapplicant_detail").filter(col("dv_targetState") == lit(AppealState))
silver_m3 = spark.table("ariadm_active_appeals.silver_status_detail").filter(col("dv_targetState") == lit(AppealState))
silver_c = spark.table("ariadm_active_appeals.silver_appealcategory_detail").filter(col("dv_targetState") == lit(AppealState))
bronze_remissions = spark.table("ariadm_active_appeals.bronze_remissions").distinct()
silver_segmentation = spark.table("ariadm_active_appeals.stg_segmentation_states").filter(col("TargetState") == lit(AppealState))

bronze_countryFromAddress = spark.table("ariadm_active_appeals.bronze_countries_countryFromAddress").withColumn("lu_countryGovUkOocAdminJ",col("countryGovUkOocAdminJ"))

bronze_HORef_cleansing = spark.table("ariadm_active_appeals.bronze_HORef_cleansing")

# silver_h, bronze_hearing_centres and bronze_derive_hearing_centres are not
# assigned in this cell; they must already exist at notebook level.
df_final, df_audit = mainPaymentPending(silver_segmentation, silver_m1, silver_m2, silver_m3, silver_c,silver_h, bronze_remissions, bronze_countryFromAddress, bronze_HORef_cleansing,bronze_hearing_centres,bronze_derive_hearing_centres)

display(df_final)

### Function: Upload and Blob Client Connection Configuration

In [0]:
# Fetch the curated-storage credential for this environment from the secret scope.
secret = dbutils.secrets.get(KeyVault_name, f"CURATED-{env_name}-SAS-TOKEN")

In [0]:
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
import os

# Set up the BlobServiceClient with your connection string
# NOTE(review): `secret` is read from a key named "...-SAS-TOKEN" but is handed
# to from_connection_string - confirm the stored value is a full connection string.
connection_string = secret

blob_service_client = BlobServiceClient.from_connection_string(connection_string)

# Specify the container name
# container_client is the handle the upload function below writes through.
container_name = "gold"
container_client = blob_service_client.get_container_client(container_name)

In [0]:
def upload_to_blob(file_name, file_content):
    """Write one payload to the blob container and report the outcome as text.

    Args:
        file_name: Blob name (full path inside the container).
        file_content: Data to upload; an existing blob of that name is overwritten.

    Returns:
        str: ``"success"``, or ``"error: <message>"`` if any exception was raised.
        Failures are returned rather than raised.
    """
    try:
        target_blob = container_client.get_blob_client(f"{file_name}")
        target_blob.upload_blob(file_content, overwrite=True)
    except Exception as e:
        return f"error: {str(e)}"
    else:
        return "success"


# Spark UDF wrapper around upload_to_blob, applied column-wise by the DLT tables.
upload_udf = udf(upload_to_blob)

# df_with_upload_status = df_final.withColumn(
#     "Status", upload_udf(col("JSON_File_name"), col("JSON_Content"))
# )

# display(df_with_upload_status)


## Gold Outputs and Tracking DLT Table Creation

In [0]:
# Start from the shared baseline rules, then add the state-specific rule.
checks = base_DQRules()

# Passes when additionalInstructionsTribunalResponse is populated and CaseStatus
# is '26', or when it is NULL and CaseStatus is not '26'.
checks["valid_additionalInstructionsTribunalResponse"] = (
  "((additionalInstructionsTribunalResponse IS NOT NULL and CaseStatus = '26') OR (additionalInstructionsTribunalResponse IS NULL and CaseStatus != '26'))"
)

# Single expression built from all rules; evaluated later to fill `is_valid`.
dq_rules = build_rule_expression(checks)
dq_rules

In [0]:
import dlt
from pyspark.sql.functions import col, lit, expr
from pyspark.sql import Window

@dlt.table(
    name=f"stg_main_{output_name}_validation",
    comment="DLT table running mainPaymentPending to generate a JSON_Content column for CCD validation. Applies DLT expectations on CCD, adding is_valid to flag validation results.",
    path=f"{audit_path}/stg_main_{output_name}_validation"
)
@dlt.expect_all(checks)
def stg_main_reasonsForAppealSubmitted_validation():
    """Build the validation staging table: JSON payload, lookup columns and ``is_valid``.

    Inputs are read through ``dlt.read``. If any of those reads raises, every
    input is instead loaded from the ``ariadm_active_appeals`` schema via
    ``spark.table`` (the fallback is all-or-nothing). The result of
    ``mainPaymentPending`` is then joined with the columns the DQ rules refer to,
    and ``dq_rules`` is evaluated into ``is_valid``.

    Returns:
        DataFrame: the joined frame including ``JSON_Content``, ``JSON_File_name``
        and ``is_valid``.
    """
    try:
        silver_m1 = dlt.read("silver_appealcase_detail").filter(col("dv_targetState") == lit(AppealState)).distinct()
        bronze_appealtype_lookup_df = dlt.read("bronze_appealtype").distinct()
        bronze_hearing_centres_lookup_df = dlt.read("bronze_hearing_centres").distinct()
        silver_m2 = dlt.read("silver_caseapplicant_detail").filter(col("dv_targetState") == lit(AppealState))
        silver_m3 = dlt.read("silver_status_detail").filter(col("dv_targetState") == lit(AppealState))
        # NOTE(review): silver_m4 / silver_m6 read here are not forwarded to
        # mainPaymentPending below, which resolves those names at notebook level.
        silver_m4 = dlt.read("silver_transaction_detail").filter(col("dv_targetState") == lit(AppealState))
        silver_m5 = dlt.read("silver_link_detail").filter(col("dv_targetState") == lit(AppealState))
        silver_m6 = dlt.read("silver_adjudicator_detail").filter(col("dv_targetState") == lit(AppealState))
        silver_c = dlt.read("silver_appealcategory_detail").filter(col("dv_targetState") == lit(AppealState))
        silver_h = dlt.read("silver_history_detail").filter(col("dv_targetState") == lit(AppealState))
        bronze_countries_postal_lookup_df = dlt.read("bronze_countries_postal").distinct()
        bronze_remissions = dlt.read("bronze_remissions").distinct()
        bronze_countryFromAddress = dlt.read("bronze_countries_countryFromAddress")
        bronze_HORef_cleansing = dlt.read("bronze_HORef_cleansing")
        bronze_hearing_centres = dlt.read("bronze_hearing_centres")
        bronze_derive_hearing_centres = dlt.read("bronze_derive_hearing_centres")
        silver_segmentation = dlt.read("stg_segmentation_states").filter(col("TargetState") == lit(AppealState))

    except Exception:
        # Was a bare `except:`; narrowed so KeyboardInterrupt / SystemExit are
        # no longer swallowed. Any ordinary failure still triggers the fallback.
        silver_m1 = spark.table("ariadm_active_appeals.silver_appealcase_detail").filter(col("dv_targetState") == lit(AppealState)).distinct()
        bronze_appealtype_lookup_df = spark.table("ariadm_active_appeals.bronze_appealtype").distinct()
        bronze_hearing_centres_lookup_df = spark.table("ariadm_active_appeals.bronze_hearing_centres").distinct()
        silver_m2 = spark.table("ariadm_active_appeals.silver_caseapplicant_detail").filter(col("dv_targetState") == lit(AppealState))
        silver_m3 = spark.table("ariadm_active_appeals.silver_status_detail").filter(col("dv_targetState") == lit(AppealState))
        silver_m4 = spark.table("ariadm_active_appeals.silver_transaction_detail").filter(col("dv_targetState") == lit(AppealState))
        silver_m5 = spark.table("ariadm_active_appeals.silver_link_detail").filter(col("dv_targetState") == lit(AppealState))
        silver_m6 = spark.table("ariadm_active_appeals.silver_adjudicator_detail").filter(col("dv_targetState") == lit(AppealState))
        silver_c = spark.table("ariadm_active_appeals.silver_appealcategory_detail").filter(col("dv_targetState") == lit(AppealState))
        silver_h = spark.table("ariadm_active_appeals.silver_history_detail").filter(col("dv_targetState") == lit(AppealState))
        bronze_countries_postal_lookup_df = spark.table("ariadm_active_appeals.bronze_countries_postal").distinct()
        bronze_remissions = spark.table("ariadm_active_appeals.bronze_remissions").distinct()
        bronze_countryFromAddress = spark.table("ariadm_active_appeals.bronze_countries_countryFromAddress")
        bronze_HORef_cleansing = spark.table("ariadm_active_appeals.bronze_HORef_cleansing")
        bronze_hearing_centres = spark.table("ariadm_active_appeals.bronze_hearing_centres")
        bronze_derive_hearing_centres = spark.table("ariadm_active_appeals.bronze_derive_hearing_centres")
        silver_segmentation = spark.table("ariadm_active_appeals.stg_segmentation_states").filter(col("TargetState") == lit(AppealState))

    df_final,df_audit = mainPaymentPending(silver_segmentation, silver_m1, silver_m2, silver_m3, silver_c,silver_h, bronze_remissions, bronze_countryFromAddress, bronze_HORef_cleansing,bronze_hearing_centres,bronze_derive_hearing_centres)

    # Extra columns joined onto the payload frame before dq_rules is evaluated.
    valid_representation = silver_m1.select(col("CaseNo"), col("dv_representation"),col("dv_CCDAppealType"),col("CaseRep_Address5"), col("CaseRep_Postcode"),col("MainRespondentId"), col("lu_appealType"), col("HORef"),col("Sponsor_Authorisation"),col("Sponsor_Name"))
    # Only applicant rows with no Relationship value are kept.
    valid_appealant_address = silver_m2.select(col("CaseNo"), col("Appellant_Address1"), col("Appellant_Address2"),col("Appellant_Address3"),col("Appellant_Address4"), col("Appellant_Address5"), col("Appellant_Postcode"),col("Appellant_Email"),col("Appellant_Telephone"), col("FCONumber")).filter(col("Relationship").isNull())
    valid_country_list = bronze_countries_postal_lookup_df.select(col("countryGovUkOocAdminJ").alias("valid_countryGovUkOocAdminJ")).distinct()
    valid_catagoryid_list = silver_c.groupBy("CaseNo").agg(F.collect_list("CategoryId").alias("valid_categoryIdList"))
    valid_HORef_cleansing = bronze_HORef_cleansing.select( col("CaseNo"),coalesce(col("HORef"), col("FCONumber")).alias("lu_HORef"))
    valid_reasonDescription = silver_m1.alias("m1").join(bronze_remissions, on=["PaymentRemissionReason","PaymentRemissionRequested"], how="left").select("CaseNo", "ReasonDescription",col("remissionClaim").alias("lu_remissionClaim"),col("feeRemissionType").alias("lu_feeRemissionType"))

    # Latest status row per CaseNo (highest StatusId).
    # NOTE(review): valid_caseStatus is built but never joined; the join below
    # uses valid_statusId instead. Confirm which one is intended.
    window_spec = Window.partitionBy("CaseNo").orderBy(desc("StatusId"))
    valid_caseStatus = (silver_m3.select(col("CaseNo"),col("CaseStatus"), col("StatusId"), row_number().over(window_spec).alias("rn")
                                         ).filter(col("rn") == 1)  # keep only the latest row per CaseNo
    .drop("rn","StatusId"))
    # NOTE(review): valid_statusId is not reduced to one row per CaseNo, so if
    # silver_m3 holds several status rows for a case this join multiplies the
    # output rows - confirm this is intended.
    valid_statusId = silver_m3.select(col("CaseNo"), col("StatusId"), col('CaseStatus'), col('Outcome'))

    df_final = df_final.join(valid_representation, on="CaseNo", how="left"
                            ).join(valid_country_list, on=col("CaseRep_Address5") == col("valid_countryGovUkOocAdminJ"), how="left"
                            ).join(valid_catagoryid_list, on="CaseNo", how="left"
                            ).join(valid_appealant_address, on="CaseNo", how="left"
                            ).join(valid_HORef_cleansing, on="CaseNo", how="left"
                            ).join(valid_reasonDescription, on="CaseNo", how="left"
                            ).join(valid_statusId, on="CaseNo", how="left")

    # Evaluate the combined rule expression for each row.
    df_final = df_final.withColumn("is_valid", expr(dq_rules))

    return df_final

In [0]:
@dlt.table(
    name=f"stg_valid_{output_name}_records",
    comment="Delta Live Gold Table with JSON content.",
    path=f"{audit_path}/stg_valid_{output_name}_records"
)
def stg_valid_reasonsforAppealSubmitted_records():
    """Upload the JSON payload of every valid record and record the upload status.

    Reads the validation staging table, keeps rows where ``is_valid`` is true,
    and calls ``upload_udf`` once per remaining row.

    Returns:
        DataFrame: ``CaseNo``, ``JSON_content``, ``File_Name`` and ``Status``.
    """
    validated = dlt.read(f"stg_main_{output_name}_validation")

    # Valid rows only, spread over 64 partitions for the upload; payloads that
    # start with "Error" are skipped.
    uploadable = (
        validated
        .filter((col("is_valid") == True))
        .repartition(64)
        .filter(~col("JSON_content").like("Error%"))
    )

    uploaded = uploadable.withColumn(
        "Status", upload_udf(col("JSON_File_Name"), col("JSON_content"))
    )

    return uploaded.select(
        "CaseNo",
        "JSON_content",
        col("JSON_File_Name").alias("File_Name"),
        "Status",
    )


In [0]:
@dlt.table(
    name=f"stg_invalid_{output_name}_quarantine_records",
    comment="Quarantined records that failed data quality checks or JSON generation.",
    path=f"{audit_path}/stg_invalid_{output_name}_quarantine_records"
)
def stg_invalid_reasonsForAppealSubmitted_quarantine_records():
    """Upload the JSON payload of every record that did not pass validation.

    Reads the validation staging table, keeps rows where ``is_valid`` is not
    true (false or NULL), redirects the target file name from ``/JSON/`` to
    ``/INVALID_JSON/`` and calls ``upload_udf`` once per remaining row.

    Returns:
        DataFrame: ``CaseNo``, ``JSON_content``, ``File_Name`` and ``Status``.
    """
    df = dlt.read(f"stg_main_{output_name}_validation")

    # Null-safe comparison: with `is_valid != True` a NULL rule result compares
    # to NULL and the row is filtered out, so it would appear in neither the
    # valid table nor this quarantine table.
    df_filtered = df.filter(
        ~col("is_valid").eqNullSafe(lit(True))
    ).withColumn("JSON_File_Name", regexp_replace(col("JSON_File_Name"), "/JSON/", "/INVALID_JSON/"))

    # Repartition to optimize parallelism
    repartitioned_df = df_filtered.repartition(64)

    # Payloads that start with "Error" are skipped rather than uploaded.
    df_with_upload_status = repartitioned_df.filter(~col("JSON_content").like("Error%")).withColumn(
            "Status", upload_udf(col("JSON_File_Name"), col("JSON_content"))
        )

    return df_with_upload_status.select("CaseNo", "JSON_content",col("JSON_File_Name").alias("File_Name"),"Status")


In [0]:
import dlt
from pyspark.sql.functions import col, lit, expr

@dlt.table(
    name=f"apl_active_{output_name}_cr_audit_table",
    comment="DLT table Covers 4.2 Silver layer LLD requirements: Audits CCD attributes, input field values, derived values, and all columns for validation and traceability.",
    path=f"{audit_path}/apl_active_{output_name}_cr_audit_table"
)
def apl_active_reasonsForAppealSubmitted_cr_audit_table():
    """Build the audit table from the audit frame returned by ``mainPaymentPending``.

    Inputs are read through ``dlt.read``. If any of those reads raises, every
    input is instead loaded from the ``ariadm_active_appeals`` schema via
    ``spark.table`` (the fallback is all-or-nothing).

    Returns:
        DataFrame: the second element (audit frame) of ``mainPaymentPending``.
    """
    try:
        silver_m1 = dlt.read("silver_appealcase_detail").filter(col("dv_targetState") == lit(AppealState)).distinct()
        silver_m2 = dlt.read("silver_caseapplicant_detail").filter(col("dv_targetState") == lit(AppealState))
        bronze_appealtype_lookup_df = dlt.read("bronze_appealtype").distinct()
        bronze_hearing_centres_lookup_df = dlt.read("bronze_hearing_centres").distinct()
        silver_m3 = dlt.read("silver_status_detail").filter(col("dv_targetState") == lit(AppealState))
        # Unqualified dataset name, matching every other dlt.read call in this
        # notebook (this call previously used the schema-qualified name).
        silver_c = dlt.read("silver_appealcategory_detail").filter(col("dv_targetState") == lit(AppealState))
        silver_h = dlt.read("silver_history_detail").filter(col("dv_targetState") == lit(AppealState))
        bronze_remissions_lookup_df = dlt.read("bronze_remissions").distinct()
        bronze_countryFromAddress = dlt.read("bronze_countries_countryFromAddress")
        bronze_HORef_cleansing = dlt.read("bronze_HORef_cleansing")
        bronze_hearing_centres = dlt.read("bronze_hearing_centres")
        bronze_derive_hearing_centres = dlt.read("bronze_derive_hearing_centres")
        silver_segmentation = dlt.read("stg_segmentation_states").filter(col("TargetState") == lit(AppealState))

    except Exception:
        # Was a bare `except:`; narrowed so KeyboardInterrupt / SystemExit are
        # no longer swallowed. Any ordinary failure still triggers the fallback.
        silver_m1 = spark.table("ariadm_active_appeals.silver_appealcase_detail").filter(col("dv_targetState") == lit(AppealState)).distinct()
        silver_m2 = spark.table("ariadm_active_appeals.silver_caseapplicant_detail").filter(col("dv_targetState") == lit(AppealState))
        bronze_appealtype_lookup_df = spark.table("ariadm_active_appeals.bronze_appealtype").distinct()
        bronze_hearing_centres_lookup_df = spark.table("ariadm_active_appeals.bronze_hearing_centres").distinct()
        silver_m3 = spark.table("ariadm_active_appeals.silver_status_detail").filter(col("dv_targetState") == lit(AppealState))
        silver_c = spark.table("ariadm_active_appeals.silver_appealcategory_detail").filter(col("dv_targetState") == lit(AppealState))
        silver_h = spark.table("ariadm_active_appeals.silver_history_detail").filter(col("dv_targetState") == lit(AppealState))
        bronze_remissions_lookup_df = spark.table("ariadm_active_appeals.bronze_remissions").distinct()
        bronze_countryFromAddress = spark.table("ariadm_active_appeals.bronze_countries_countryFromAddress")
        bronze_HORef_cleansing = spark.table("ariadm_active_appeals.bronze_HORef_cleansing")
        bronze_hearing_centres = spark.table("ariadm_active_appeals.bronze_hearing_centres")
        bronze_derive_hearing_centres = spark.table("ariadm_active_appeals.bronze_derive_hearing_centres")
        silver_segmentation = spark.table("ariadm_active_appeals.stg_segmentation_states").filter(col("TargetState") == lit(AppealState))

    # Only the audit frame is kept; the JSON payload frame is discarded here.
    df_final,df_audit = mainPaymentPending(silver_segmentation, silver_m1, silver_m2, silver_m3, silver_c,silver_h, bronze_remissions_lookup_df, bronze_countryFromAddress, bronze_HORef_cleansing,bronze_hearing_centres,bronze_derive_hearing_centres)

    return df_audit

In [0]:
# End the notebook run and hand this status string back to the caller.
dbutils.notebook.exit("Notebook completed successfully")