# Active Appeals CCD MVP Payment Pending (Silver Layer)
<table style='float:left;'>
   <tbody>
      <tr>
         <td style='text-align: left;'><b>Name: </b></td>
         <td>SILVER_TO_GOLD__PAYMENT_PENDING_JSON</td>
      </tr>
      <tr>
         <td style='text-align: left;'><b>Description: </b></td>
         <td>Notebook dedicated to the paymentPending state; its logic is not shared with any other active appeal state.</td>
      </tr>
      <tr>
         <td style='text-align: left;'><b>First Created: </b></td>
         <td>July-2025</td>
      </tr>
      <tr>
         <th style='text-align: left;'><b>Changelog (JIRA ref/initials/date):</b></th>
         <th>Comments</th>
      </tr>
      <tr>
         <td style='text-align: left;'><a href="https://tools.hmcts.net/jira/browse/ARIADM-667">ARIADM-667</a>/NSA/JUL-2025</td>
         <td>Create Silver Staging tables: stg_main_payment_pending_validation, stg_valid_payment_pending_records, stg_invalid_payment_pending_quarantine_records</td>
      </tr>
      <tr>
         <td style='text-align: left;'><a href="https://tools.hmcts.net/jira/browse/ARIADM-668">ARIADM-668</a>/NSA/JUL-2025</td>
         <td>appealType 1:1 & defaults mappings</td>
      </tr>
      <tr>
         <td style='text-align: left;'><a href="https://tools.hmcts.net/jira/browse/ARIADM-670">ARIADM-670</a>/NSA/JUL-2025</td>
         <td>appealType logic mappings</td>
      </tr>
      <tr>
         <td style='text-align: left;'><a href="https://tools.hmcts.net/jira/browse/ARIADM-672">ARIADM-672</a>/NSA/JUL-2025</td>
         <td>caseData 1:1 & defaults mappings</td>
      </tr>
       <tr>
         <td style='text-align: left;'><a href="https://tools.hmcts.net/jira/browse/ARIADM-669">ARIADM-669</a>/NSA/JUL-2025</td>
         <td>appealType 1:1 - Data quality & constraint checks implementation</td>
      </tr>
       <tr>
         <td style='text-align: left;'><a href="https://tools.hmcts.net/jira/browse/ARIADM-671">ARIADM-671</a>/NSA/JUL-2025</td>
         <td>appealType logic - Data quality & constraint checks implementation</td>
      </tr>
      <tr>
         <td style='text-align: left;'><a href="https://tools.hmcts.net/jira/browse/ARIADM-707">ARIADM-707</a>/NSA/JUL-2025</td>
         <td>caseData logic mappings - Hearing Centre</td>
      </tr>
      <tr>
         <td style='text-align: left;'><a href="https://tools.hmcts.net/jira/browse/ARIADM-674">ARIADM-674</a>/NSA/JUL-2025</td>
         <td>caseData logic mappings</td>
      </tr>
      <tr>
         <td style='text-align: left;'><a href="https://tools.hmcts.net/jira/browse/ARIADM-673">ARIADM-673</a>/NSA/JUL-2025</td>
         <td>caseData 1:1 - Data quality & constraint checks implementation</td>
      </tr>
      <tr>
         <td style='text-align: left;'><a href="https://tools.hmcts.net/jira/browse/ARIADM-675">ARIADM-675</a>/NSA/JUL-2025</td>
         <td>caseData logic mappings - Data quality & constraint checks implementation</td>
      </tr>
      <tr>
         <td style='text-align: left;'><a href="https://tools.hmcts.net/jira/browse/ARIADM-707">ARIADM-707</a>/NSA/JUL-2025</td>
         <td>caseData logic mappings - Hearing Centre</td>
      </tr>
      <tr>
         <td style='text-align: left;'><a href="https://tools.hmcts.net/jira/browse/ARIADM-708">ARIADM-708</a>/NSA/JUL-2025</td>
         <td>caseData logic mappings - Hearing Centre - Data quality & constraint checks implementation</td>
      </tr>
   </tbody>
</table>

### Import packages

In [0]:
import dlt
import json
# from pyspark.sql.functions import when, col,coalesce, current_timestamp, lit, date_format
from pyspark.sql.functions import *

In [0]:
# Load the environment descriptor and fetch its single row once: every
# DataFrame.first() call is a separate Spark action, so the row is read a
# single time and reused for both fields.
config = spark.read.option("multiline", "true").json("dbfs:/configs/config.json")
config_row = config.first()
if config_row is None:
    # Fail with a clear message instead of "'NoneType' object is not subscriptable".
    raise ValueError("dbfs:/configs/config.json contains no configuration record")

# Both values are normalised (trimmed, lower-cased) because they are embedded
# in the storage account and Key Vault names built below.
env_name = config_row["env"].strip().lower()
lz_key = config_row["lz_key"].strip().lower()

print(f"env_code: {lz_key}")  # This won't be redacted
print(f"env_name: {env_name}")  # This won't be redacted

# Scope name passed to dbutils.secrets.get(...) by the cells that follow.
KeyVault_name = f"ingest{lz_key}-meta002-{env_name}"
print(f"KeyVault_name: {KeyVault_name}")

In [0]:
# Service principal credentials
client_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-ID")
client_secret = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-SECRET")
tenant_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-TENANT-ID")

# Storage account names
curated_storage = f"ingest{lz_key}curated{env_name}"
checkpoint_storage = f"ingest{lz_key}xcutting{env_name}"
raw_storage = f"ingest{lz_key}raw{env_name}"
landing_storage = f"ingest{lz_key}landing{env_name}"
external_storage = f"ingest{lz_key}external{env_name}"

# The token endpoint is the same for every account, so it is built once.
oauth_token_endpoint = f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"

# Apply the same OAuth (client-credentials) settings to each storage account.
# This replaces five copy-pasted groups of spark.conf.set calls; the keys and
# values written are unchanged.
for storage_account in (
    curated_storage,      # curated storage (Delta tables)
    checkpoint_storage,   # checkpoint storage
    raw_storage,          # raw storage
    landing_storage,      # landing storage
    external_storage,     # external storage
):
    account_host = f"{storage_account}.dfs.core.windows.net"
    spark.conf.set(f"fs.azure.account.auth.type.{account_host}", "OAuth")
    spark.conf.set(f"fs.azure.account.oauth.provider.type.{account_host}", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
    spark.conf.set(f"fs.azure.account.oauth2.client.id.{account_host}", client_id)
    spark.conf.set(f"fs.azure.account.oauth2.client.secret.{account_host}", client_secret)
    spark.conf.set(f"fs.azure.account.oauth2.client.endpoint.{account_host}", oauth_token_endpoint)

## Set Variables

In [0]:
def _appeals_layer_path(container):
    """Return the ABFSS root of the active CCD appeals data inside the given curated-account container."""
    return f"abfss://{container}@ingest{lz_key}curated{env_name}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS"


# Variables used by the subsequent cells
bronze_path = _appeals_layer_path("bronze")
silver_path = _appeals_layer_path("silver")
gold_path = _appeals_layer_path("gold")

# Prefix of the JSON file names generated for the gold container
gold_outputs = "ARIADM/CCD/APPEALS"

# Target state handled by this notebook; used to filter dv_targetState
AppealState = "paymentPending"

# Show the effective settings in the cell output
variables = dict(
    bronze_path=bronze_path,
    silver_path=silver_path,
    gold_path=gold_path,
    gold_outputs=gold_outputs,
    key_vault=KeyVault_name,
    AppealState=AppealState,
)

display(variables)

## PaymentPending: Silver DLT staging table for gold transformation

## JSON Schemas

In [0]:
from pyspark.sql.functions import col, from_json, trim, regexp_replace
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

def _code_label_entry():
    """Build a fresh struct type with nullable string fields `code` and `label`."""
    return (
        StructType()
        .add("code", StringType(), True)
        .add("label", StringType(), True)
    )


# Schema of the JSON structure held in caseManagementCategory:
# a selected `value` entry plus the `list_items` array of candidate entries,
# each entry being a {code, label} pair.
caseManagementCategory_json_schema = (
    StructType()
    .add("value", _code_label_entry(), True)
    .add("list_items", ArrayType(_code_label_entry()), True)
)

## Transformation Functions

In [0]:
from pyspark.sql.functions import col, when, lit

# AppealType grouping
# Note: AppealType and other columns have been mapped with logic in m1 silver.
# Additional Notes - All these mappings use logic held in the tab 'APPENDIX-AppealType'
# Using a case statement - refer to appendix for logic.
def appealType(silver_m1):
    """Project the appealType group of CCD fields from the silver appeal-case rows.

    Every output field is NULL unless the row has a non-null ``lu_appealType``
    and a ``dv_representation`` matching the field's rule:

    * 'LR' or 'AIP': appealType, hmctsCaseCategory, appealReferenceNumber,
      appealTypeDescription, ccdReferenceNumberForDisplay (empty string)
    * 'LR' only: caseManagementCategory
    * 'AIP' only: isAppealReferenceNumberAvailable ("Yes")

    Args:
        silver_m1: DataFrame providing CaseNo, dv_representation and the lu_* lookup columns.

    Returns:
        DataFrame with CaseNo followed by the seven fields listed above.
    """
    representation = col("dv_representation")
    has_appeal_type = col("lu_appealType").isNotNull()

    lr_or_aip = representation.isin('LR', 'AIP') & has_appeal_type
    lr_only = representation.isin('LR') & has_appeal_type
    aip_only = representation.isin('AIP') & has_appeal_type

    def _value_or_null(predicate, value, field_name):
        # CASE WHEN predicate THEN value ELSE NULL END AS field_name
        return when(predicate, value).otherwise(lit(None)).alias(field_name)

    projection = [
        col("CaseNo"),
        _value_or_null(lr_or_aip, col("lu_appealType"), "appealType"),
        _value_or_null(lr_or_aip, col("lu_hmctsCaseCategory"), "hmctsCaseCategory"),
        _value_or_null(lr_or_aip, col("CaseNo"), "appealReferenceNumber"),
        _value_or_null(lr_or_aip, col("lu_appealTypeDescription"), "appealTypeDescription"),
        _value_or_null(lr_only, col("lu_caseManagementCategory"), "caseManagementCategory"),
        _value_or_null(aip_only, lit("Yes"), "isAppealReferenceNumberAvailable"),
        _value_or_null(lr_or_aip, lit(""), "ccdReferenceNumberForDisplay"),
    ]

    return silver_m1.select(*projection)

# AppealState = "paymentPending"
# silver_m1 = spark.table("ariadm_active_appeals.silver_appealcase_detail").filter(col("dv_targetState") == lit(AppealState)).distinct()
# df = appealType(silver_m1)
# display(df)

In [0]:
from pyspark.sql.functions import collect_list, struct, when, lit, col, max as spark_max, date_format, row_number
from pyspark.sql.window import Window

# caseData grouping
def caseData(silver_m1, silver_m3, silver_m5):
    """Build the caseData group of CCD fields for each appeal case.

    Args:
        silver_m1: appeal-case rows (CaseNo, dv_representation, lu_*/dv_* columns,
            OutOfTimeIssue, DateLodged, DateAppealReceived).
        silver_m3: status rows (CaseNo, StatusId, Outcome).
        silver_m5: accepted for signature compatibility; not read by this function.

    Returns:
        DataFrame with CaseNo and the caseData fields. Apart from
        appellantsRepresentation and appealWasNotSubmittedReason (which carry
        their own rules), every field is NULL unless dv_representation is
        'LR' or 'AIP' and lu_appealType is not null.
    """
    timestamp_pattern = "yyyy-MM-dd'T'HH:mm:ss'Z'"

    # Rank status rows per case, highest StatusId first, and keep the top row
    # only when it carries an Outcome.
    newest_status_first = Window.partitionBy("CaseNo").orderBy(col("StatusId").desc())
    latest_status_with_outcome = (
        silver_m3
        .withColumn("row_num", row_number().over(newest_status_first))
        .filter((col("row_num") == 1) & (col("Outcome").isNotNull()))
        .select(
            col("CaseNo"),
            lit("Yes").alias("recordedOutOfTimeDecision"), col("Outcome")
        )
    )

    has_appeal_type = col("lu_appealType").isNotNull()
    conditions = (col("dv_representation").isin('LR', 'AIP')) & has_appeal_type
    lr_case = (col("m1.dv_representation") == "LR") & has_appeal_type
    aip_case = (col("m1.dv_representation") == "AIP") & has_appeal_type
    flagged_out_of_time = col("OutOfTimeIssue") == 1

    # Derived columns, added in this order on top of the joined frame.
    derived_columns = {
        "appellantsRepresentation": when(lr_case, "No").when(aip_case, "Yes").otherwise(None),
        "submissionOutOfTime": when(flagged_out_of_time, lit("Yes")).otherwise(lit("No")),
        "adminDeclaration1": lit(["hasDeclared"]),
        "appealWasNotSubmittedReason": when(lr_case, "This is an ARIA Migrated Case.").otherwise(None),
        "applicationOutOfTimeExplanation": when(flagged_out_of_time, "This is a migrated ARIA case. Please refer to the documents.").otherwise(None),
        "appealSubmissionDate": date_format(col("m1.DateLodged"), timestamp_pattern),
        "appealSubmissionInternalDate": date_format(col("m1.DateLodged"), timestamp_pattern),
        "tribunalReceivedDate": date_format(col("m1.DateAppealReceived"), timestamp_pattern),
    }

    enriched = silver_m1.alias("m1").join(
        latest_status_with_outcome.alias("m3"),
        on="CaseNo",
        how="left"
    )
    for column_name, expression in derived_columns.items():
        enriched = enriched.withColumn(column_name, expression)

    def _if_in_scope(value, field_name):
        # Emit the value only for in-scope cases; NULL otherwise.
        return when(conditions, value).otherwise(None).alias(field_name)

    return enriched.select(
        "CaseNo",
        col("appellantsRepresentation"),
        _if_in_scope(col("submissionOutOfTime"), "submissionOutOfTime"),
        _if_in_scope(col("m3.recordedOutOfTimeDecision"), "recordedOutOfTimeDecision"),
        _if_in_scope(col("applicationOutOfTimeExplanation"), "applicationOutOfTimeExplanation"),
        _if_in_scope(col("lu_hearingCentre"), "hearingCentre"),
        _if_in_scope(col("lu_staffLocation"), "staffLocation"),
        _if_in_scope(col("lu_caseManagementLocation"), "caseManagementLocation"),
        _if_in_scope(col("dv_hearingCentreDynamicList"), "hearingCentreDynamicList"),
        _if_in_scope(col("dv_caseManagementLocationRefData"), "caseManagementLocationRefData"),
        _if_in_scope(col("lu_selectedHearingCentreRefData"), "selectedHearingCentreRefData"),
        col("appealWasNotSubmittedReason"),
        _if_in_scope(col("adminDeclaration1"), "adminDeclaration1"),
        _if_in_scope(col("appealSubmissionDate"), "appealSubmissionDate"),
        _if_in_scope(col("appealSubmissionInternalDate"), "appealSubmissionInternalDate"),
        _if_in_scope(col("tribunalReceivedDate"), "tribunalReceivedDate"),
        _if_in_scope(lit([]).cast("array<int>"), "caseLinks"),
        _if_in_scope(lit("No"), "hasOtherAppeals"),
    )

# AppealState = "paymentPending"
# silver_m5 = spark.table("ariadm_active_appeals.silver_link_detail").filter(col("dv_targetState") == lit(AppealState))
# silver_m1 = spark.table("ariadm_active_appeals.silver_appealcase_detail").filter(col("dv_targetState") == lit(AppealState)).distinct()
# silver_m3 = spark.table("ariadm_active_appeals.silver_status_detail").filter(col("dv_targetState") == lit(AppealState))
# df = caseData(silver_m1, silver_m3, silver_m5)

In [0]:
# display(df)

In [0]:
from pyspark.sql.types import StringType, StructType, ArrayType, MapType

def mainPaymentPending(silver_m1, silver_m3,silver_m5):
    """Assemble the per-case CCD payload for the paymentPending state.

    Combines the appealType and caseData field groups on CaseNo, then adds:

    * ``JSON_Content``   - JSON document built from every column except CaseNo
      (the original author notes that null-valued columns are omitted).
    * ``JSON_File_name`` - ``<gold_outputs>/JSON/APPEALS_<CaseNo>.json`` with
      every "/" in CaseNo replaced by "_".

    Args:
        silver_m1: appeal-case rows, passed to appealType() and caseData().
        silver_m3: status rows, passed to caseData().
        silver_m5: link rows, passed to caseData().

    Returns:
        DataFrame holding the combined fields plus the two columns above.
    """
    appeal_type_fields = appealType(silver_m1)
    case_data_fields = caseData(silver_m1, silver_m3, silver_m5)

    # One row per appealType row, extended with its caseData fields
    per_case = appeal_type_fields.join(case_data_fields, on="CaseNo", how="left")

    # CaseNo identifies the file; it is not part of the JSON body
    payload_columns = per_case.drop(col("CaseNo")).columns
    json_document = to_json(struct(*payload_columns))

    blob_name = concat(
        lit(f"{gold_outputs}/JSON/APPEALS_"),
        regexp_replace(col("CaseNo"), "/", "_"),
        lit(".json"),
    )

    with_json = per_case.withColumn("JSON_Content", json_document)
    return with_json.withColumn("JSON_File_name", blob_name)

# AppealState = "paymentPending"
# silver_m1 = spark.table("ariadm_active_appeals.silver_appealcase_detail").filter(col("dv_targetState") == lit(AppealState)).distinct()
# silver_m3 = spark.table("ariadm_active_appeals.silver_status_detail").filter(col("dv_targetState") == lit(AppealState))
# silver_m5 = spark.table("ariadm_active_appeals.silver_link_detail").filter(col("dv_targetState") == lit(AppealState))

# df_final = mainPaymentPending(silver_m1, silver_m3,silver_m5)
# display(df_final.select("*"))

### Upload Function and Blob Client Connection Configuration

In [0]:
# Secret used as the Blob Storage connection string by the next cell.
# NOTE(review): the secret name hard-codes "sbox", whereas every other resource
# name in this notebook is derived from env_name — confirm this secret exists
# under this exact name in non-sandbox environments.
secret = dbutils.secrets.get(KeyVault_name, "CURATED-sbox-SAS-TOKEN")

In [0]:
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
import os

# The secret fetched above is used verbatim as the connection string
connection_string = secret

# Container that receives the generated files
container_name = "gold"

# Account-level client first, then the container-scoped client used by the upload function
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client(container_name)

In [0]:
# Upload generated file content to Azure Blob Storage
def upload_to_blob(file_name, file_content):
    """Write ``file_content`` to the blob ``file_name`` in the module-level container.

    An existing blob with the same name is overwritten. Failures are reported
    through the return value rather than raised, so one failing row does not
    abort the caller.

    Args:
        file_name: blob name (path inside the container).
        file_content: data to upload.

    Returns:
        "success" when the upload completed, otherwise "error: <message>".
    """
    try:
        target_blob = container_client.get_blob_client(f"{file_name}")
        target_blob.upload_blob(file_content, overwrite=True)
    except Exception as upload_error:
        return "error: " + str(upload_error)
    else:
        return "success"

# Register the upload function as a UDF
# It is applied as upload_udf(<file name column>, <content column>); the resulting
# column holds upload_to_blob's status text ("success" or "error: ...").
# No return type is passed to udf() here.
upload_udf = udf(upload_to_blob)

# df_with_upload_status = df_final.withColumn(
#     "Status", upload_udf(col("JSON_File_name"), col("JSON_Content"))
# )

# display(df_with_upload_status)


## Gold Outputs and Tracking DLT Table Creation

![DQValidation.png](./Images/DQValidation.png "DQValidation.png")

In [0]:
# Data quality rules: rule name -> Spark SQL boolean expression.
# The dictionary is passed to @dlt.expect_all (one expectation per rule) and is
# also combined into `dq_rules`, the single expression behind the is_valid flag.
checks = {}


checks["valid_appealtype_exists"] = "(AppealType IN ('refusalOfHumanRights', 'refusalOfEu', 'deprivation', 'protection', 'revocationOfProtection', 'euSettlementScheme'))"
checks["valid_hmctsCaseCategory_exists"] = "(hmctsCaseCategory IS NOT NULL)"
checks["valid_appealTypeDescription_exists"] = "(appealTypeDescription IS NOT NULL)"
# Null Values as accepted values as where Representation = AIP
# checks["valid_caseManagementCategory_exists"] = """
#     ((valid_caseManagementCategory IS NULL AND valid_representation = 'AIP') OR 
#     (valid_caseManagementCategory IS NOT NULL AND valid_representation = 'LR'))
# """

# When a value is selected, its code/label must be one of the offered list items.
checks["valid_caseManagementCategory_code_in_list_items"] = """
  caseManagementCategory.value.code IS NULL OR
  ARRAY_CONTAINS(
    TRANSFORM(caseManagementCategory.list_items, x -> x.code),
    caseManagementCategory.value.code
  )
"""

checks["valid_caseManagementCategory_label_in_list_items"] = """
  caseManagementCategory.value.label IS NULL OR
  ARRAY_CONTAINS(
    TRANSFORM(caseManagementCategory.list_items, x -> x.label),
    caseManagementCategory.value.label
  )
"""



checks["valid_appealReferenceNumber_exists"] = "(appealReferenceNumber IS NOT NULL)"
checks["valid_isappealreferencenumberavailable_exists"] = """
    ((isappealreferencenumberavailable = 'Yes' AND valid_representation = 'AIP') OR     
    (isappealreferencenumberavailable IS NULL AND valid_representation = 'LR'))
"""
checks["valid_caseLinks_exists"] = "(size(caseLinks) = 0)"
checks["valid_hasOtherAppeals"] = "(hasOtherAppeals = 'No')"


# CaseData # ARIADM-675
checks["valid_appellantsRepresentation"] = "(appellantsRepresentation IS NOT NULL AND appellantsRepresentation IN ('Yes', 'No'))"
checks["valid_submissionOutOfTime"] = "(submissionOutOfTime IS NOT NULL AND submissionOutOfTime IN ('Yes', 'No'))"
checks["valid_recordedOutOfTimeDecision"] = "(recordedOutOfTimeDecision IS NULL OR recordedOutOfTimeDecision IN ('Yes', 'No'))"
# applicationOutOfTimeExplanation is free text, not a Yes/No flag: caseData()
# emits either NULL or the fixed migration sentence below. The previous
# IN ('Yes', 'No') rule rejected every out-of-time case.
checks["valid_applicationOutOfTimeExplanation"] = "(applicationOutOfTimeExplanation IS NULL OR applicationOutOfTimeExplanation = 'This is a migrated ARIA case. Please refer to the documents.')"

#CaseData Part02
# checks["valid_hearingCentre"] = """
#     (hearingCentre IN ('taylorHouse', 'newport', 'newcastle', 'manchester', 'hattonCross', 
#     'glasgow', 'bradford', 'birmingham', 'arnhemHouse', 'crownHouse', 'harmondsworth', 
#     'yarlsWood', 'remoteHearing', 'decisionWithoutHearing'))
# """
# checks["valid_staffLocation_exists"] = "(staffLocation IS NOT NULL)"
# checks["valid_caseManagementLocation_exists"] = "(valid_caseManagementLocation IS NOT NULL)"

# DynamicList does not seem to be applicable
# checks["valid_hearingCentreDynamicList_exists"] = "(valid_hearingCentreDynamicList IS NOT NULL)"
# checks["valid_caseManagementLocationRefData_exists"] = "(valid_caseManagementLocationRefData IS NOT NULL)"
# checks["valid_selectedHearingCentreRefData_exists"] = "(selectedHearingCentreRefData IS NOT NULL)"
# checks["valid_applicationOutOfTimeExplanation"] = "(applicationOutOfTimeExplanation IS NOT NULL)"

# Timestamp fields must look like 2025-07-01T09:30:00Z.
# \d is a regular expression (regex) metacharacter that matches any single digit from 0 to 9.
# The pattern is a raw Python string so the backslashes reach Spark SQL unchanged
# (and Python does not warn about the invalid "\d" escape).
iso_timestamp_regex = r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$"
for date_field in ("appealSubmissionDate", "appealSubmissionInternalDate", "tribunalReceivedDate"):
    checks[f"valid_{date_field}_format"] = f"{date_field} RLIKE r'{iso_timestamp_regex}'"


# checks["ccd_reference_number_for_display_exists"] = "(ccdReferenceNumberForDisplay IS NOT NULL)"

# Combine all checks into a single expression used to quarantine records.
# Each rule is wrapped in its own parentheses before joining: several rules
# contain a top-level OR, and because AND binds tighter than OR an unwrapped
# join would split the conjunction into alternatives and let rows skip rules.
dq_rules = "({0})".format(" AND ".join(f"({rule.strip()})" for rule in checks.values()))

In [0]:
# print(dq_rules)

In [0]:
# %sql
# SELECT 
#   CaseNo,
#   appealSubmissionDate,
#   submissionOutOfTime,
#   valid_caseManagementCategory,
#   caseManagementCategory
# FROM ariadm_active_appeals.stg_main_payment_pending_validation
# WHERE AppealType IN (
#     'refusalOfHumanRights', 'refusalOfEu', 'deprivation', 'protection', 
#     'revocationOfProtection', 'euSettlementScheme'
# )
# AND appealReferenceNumber IS NOT NULL
# AND hmctsCaseCategory IS NOT NULL
# AND appealTypeDescription IS NOT NULL
# AND (
#     (valid_caseManagementCategory IS NULL AND valid_representation = 'AIP')
#     OR (valid_caseManagementCategory IS NOT NULL AND valid_representation = 'LR')
# )
# AND (
#     (isappealreferencenumberavailable = 'Yes' AND valid_representation = 'AIP') 
#     OR (isappealreferencenumberavailable IS NULL AND valid_representation = 'LR')
# )
# AND size(caseLinks) = 0
# AND hasOtherAppeals = 'No'
# AND (submissionOutOfTime IS NOT NULL AND submissionOutOfTime IN ('Yes', 'No'))
# AND (recordedOutOfTimeDecision IS NULL OR recordedOutOfTimeDecision IN ('Yes', 'No'))
# AND appealSubmissionDate RLIKE r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$'
# AND appellantsRepresentation IS NOT NULL AND appellantsRepresentation IN ('Yes', 'No')
# AND (applicationOutOfTimeExplanation IS NULL OR applicationOutOfTimeExplanation IN ('Yes', 'No'))
# AND (
#   caseManagementCategory.value.code IS NULL OR
#   ARRAY_CONTAINS(
#     TRANSFORM(caseManagementCategory.list_items, x -> x.code),
#     caseManagementCategory.value.code
#   )
# )
# -- AND hearingCentre IN (
# --     'taylorHouse', 'newport', 'newcastle', 'manchester', 'hattonCross', 
# --     'glasgow', 'bradford', 'birmingham', 'arnhemHouse', 'crownHouse', 
# --     'harmondsworth', 'yarlsWood', 'remoteHearing', 'decisionWithoutHearing'
# -- )
# -- AND staffLocation IS NOT NULL
# -- AND valid_caseManagementLocation IS NOT NULL
# -- AND valid_hearingCentreDynamicList IS NOT NULL
# -- AND valid_caseManagementLocationRefData IS NOT NULL
# -- AND selectedHearingCentreRefData IS NOT NULL
# AND appealSubmissionInternalDate RLIKE r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$'
# AND tribunalReceivedDate RLIKE r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$'

In [0]:
# %sql
# select CaseNo, caseManagementCategory,JSON_Content,valid_representation  from ariadm_active_appeals.stg_main_payment_pending_validation

In [0]:
import dlt
from pyspark.sql.functions import col, lit, expr

@dlt.table(
    name="stg_main_payment_pending_validation",
    comment="DLT table running mainPaymentPending to generate a JSON_Content column for CCD validation. Applies DLT expectations on CCD, adding is_valid to flag validation results.",
    path=f"{silver_path}/stg_main_payment_pending_validation"
)
@dlt.expect_all(checks)
def stg_main_payment_pending_validation():
    """Build the paymentPending payload per case and flag it against the data quality rules.

    Sources are read through dlt.read(); if that raises, all of them are read
    from the ariadm_active_appeals schema instead. The silver sources are
    restricted to rows whose dv_targetState equals AppealState.

    Returns:
        The mainPaymentPending() output extended with the lookup columns
        valid_caseManagementCategory, valid_representation and valid_CaseNo,
        plus the boolean is_valid produced by evaluating dq_rules.
    """
    try:
        silver_m1 = dlt.read("silver_appealcase_detail").filter(col("dv_targetState") == lit(AppealState)).distinct()
        bronze_appealtype_lookup_df = dlt.read("bronze_appealtype").distinct()
        bronze_hearing_centres_lookup_df = dlt.read("bronze_hearing_centres").distinct()
        silver_m3 = dlt.read("silver_status_detail").filter(col("dv_targetState") == lit(AppealState))
        silver_m5 = dlt.read("silver_link_detail").filter(col("dv_targetState") == lit(AppealState))
        # Not read for paymentPending at present: stg_representation,
        # silver_caseapplicant_detail (m2), silver_transaction_detail (m4),
        # silver_adjudicator_detail (m6), silver_appealcategory_detail (m7),
        # silver_documentsreceived_detail (m8), silver_history_detail (m9).
    except Exception:
        # `except Exception` instead of a bare `except:` so KeyboardInterrupt and
        # SystemExit are no longer swallowed by the fallback.
        silver_m1 = spark.table("ariadm_active_appeals.silver_appealcase_detail").filter(col("dv_targetState") == lit(AppealState)).distinct()
        bronze_appealtype_lookup_df = spark.table("ariadm_active_appeals.bronze_appealtype").distinct()
        bronze_hearing_centres_lookup_df = spark.table("ariadm_active_appeals.bronze_hearing_centres").distinct()
        silver_m3 = spark.table("ariadm_active_appeals.silver_status_detail").filter(col("dv_targetState") == lit(AppealState))
        silver_m5 = spark.table("ariadm_active_appeals.silver_link_detail").filter(col("dv_targetState") == lit(AppealState))

    df_final = mainPaymentPending(silver_m1, silver_m3, silver_m5)

    # Lookup projections are de-duplicated AFTER the column selection. The
    # sources are distinct only across all their columns, so projecting one or
    # two columns can repeat values, and each repeat would duplicate the matching
    # case row in the left joins below.
    valid_case_management_categories = (
        bronze_appealtype_lookup_df
        .select(col("caseManagementCategory").alias("valid_caseManagementCategory"))
        .distinct()
    )
    case_representations = (
        silver_m1
        .select(col("dv_representation").alias("valid_representation"), col("CaseNo").alias("valid_CaseNo"))
        .distinct()
    )

    # Join with valid CaseNo and AppealType references
    df_joined = (
        df_final.alias("final")
        .join(
            valid_case_management_categories.alias("at_cmc"),
            col("final.caseManagementCategory") == col("at_cmc.valid_caseManagementCategory"),
            "left"
        )
        # .join(
        #     bronze_hearing_centres_lookup_df.select(col("caseManagementLocation").alias("valid_caseManagementLocation")).alias("hc_cml"),
        #     col("final.caseManagementLocation") == col("hc_cml.valid_caseManagementLocation"),
        #     "left"
        # )
        # .join(
        #     silver_m1.select(col("dv_hearingCentreDynamicList").alias("valid_hearingCentreDynamicList")).alias("m1_hcd"),
        #     col("final.hearingCentreDynamicList") == col("m1_hcd.valid_hearingCentreDynamicList"),
        #     "left"
        # )
        # .join(
        #     silver_m1.select(col("dv_caseManagementLocationRefData").alias("valid_caseManagementLocationRefData")).alias("m1_cmlr"),
        #     col("final.caseManagementLocationRefData") == col("m1_cmlr.valid_caseManagementLocationRefData"),
        #     "left"
        # )
        # .join(
        #     bronze_hearing_centres_lookup_df.select(col("selectedHearingCentreRefData").alias("valid_selectedHearingCentreRefData")).alias("hc_shcrd"),
        #     col("final.selectedHearingCentreRefData") == col("hc_shcrd.valid_selectedHearingCentreRefData"),
        #     "left"
        # )
        .join(
            case_representations.alias("m1"),
            col("final.CaseNo") == col("m1.valid_CaseNo"),
            "left"
        )
    )

    # Single pass/fail flag consumed by the valid and quarantine tables
    df_final = df_joined.withColumn("is_valid", expr(dq_rules))

    return df_final

In [0]:
@dlt.table(
    name="stg_valid_payment_pending_records",
    comment="Delta Live Gold Table with JSON content.",
    path=f"{silver_path}/stg_valid_payment_pending_records"
)
def stg_valid_payment_pending_records():
    """
    Upload the JSON of every record that passed validation and record the outcome.

    Reads stg_main_payment_pending_validation, keeps rows whose is_valid is true
    and whose JSON content does not start with "Error", uploads each through
    upload_udf, and returns CaseNo, JSON_content, File_Name and the upload Status.
    """
    validated = dlt.read("stg_main_payment_pending_validation")

    passed_checks = validated.filter(col("is_valid") == True)

    # Spread the rows over 64 partitions before the per-row upload
    spread_out = passed_checks.repartition(64)

    uploadable = spread_out.filter(~col("JSON_content").like("Error%"))
    uploaded = uploadable.withColumn(
        "Status", upload_udf(col("JSON_File_Name"), col("JSON_content"))
    )

    return uploaded.select(
        "CaseNo",
        "JSON_content",
        col("JSON_File_Name").alias("File_Name"),
        "Status",
    )


In [0]:
@dlt.table(
    name="stg_invalid_payment_pending_quarantine_records",
    comment="Quarantined records that failed data quality checks or JSON generation.",
    path=f"{silver_path}/stg_invalid_payment_pending_quarantine_records"
)
def stg_invalid_payment_pending_quarantine_records():
    """
    Upload the JSON of every record that did not pass validation to the INVALID_JSON folder.

    Reads stg_main_payment_pending_validation, keeps rows whose is_valid is false
    or NULL, rewrites "/JSON/" to "/INVALID_JSON/" in the file name, uploads each
    row through upload_udf, and returns CaseNo, JSON_content, File_Name and the
    upload Status.
    """
    df = dlt.read("stg_main_payment_pending_validation")

    # is_valid is NULL when a rule evaluates to NULL and no rule is false (for
    # example RLIKE applied to a NULL date). `is_valid != True` is itself NULL
    # for those rows, so they were dropped here as well as from the valid table.
    # NULL is treated as "not valid" so every record lands in exactly one table.
    failed_validation = col("is_valid").isNull() | (col("is_valid") != True)

    df_filtered = df.filter(failed_validation).withColumn(
        "JSON_File_Name", regexp_replace(col("JSON_File_Name"), "/JSON/", "/INVALID_JSON/")
    )

    # Repartition to optimize parallelism
    repartitioned_df = df_filtered.repartition(64)

    df_with_upload_status = repartitioned_df.filter(~col("JSON_content").like("Error%")).withColumn(
            "Status", upload_udf(col("JSON_File_Name"), col("JSON_content"))
        )

    return df_with_upload_status.select("CaseNo", "JSON_content", col("JSON_File_Name").alias("File_Name"), "Status")


In [0]:
# End the notebook run here with a completion message; the Appendix cells below are scratch/validation cells.
# NOTE(review): presumably this keeps a full run from executing the appendix — confirm against Databricks behaviour.
dbutils.notebook.exit("Notebook completed successfully")

## Appendix

### Validation

In [0]:
%sql
-- Appendix check: list every row of the valid payment-pending staging table.
select * from ariadm_active_appeals.stg_valid_payment_pending_records

In [0]:
# %sql
# SHOW TABLES IN ariadm_active_appeals
# -- LIKE 'stg_%'

In [0]:
# %sql
# select * from ariadm_active_appeals.stg_main_payment_pending_validation

In [0]:
# ## Validation Framework

# from pyspark.sql.types import *

# def validate_AppealType():
  
#   test_df = AppealType(silver_m1)

#   expected_types = {
#     "appealReferenceNumber": StringType(),
#     "isAppealReferenceNumberAvailable": StringType()

#   }
      
#   nested_types = test_df.schema["appealType"].dataType

#   for field in nested_types.elementType.fields:
#     field_name = field.name
#     input_type = field.dataType
#     # print(f"Field: {field_name}, incoming type {input_type}")

#     expected_type = expected_types[field_name]

#     assert input_type == expected_type, f"Expected type {expected_type} but got {input_type}"
#     print(f"Successfully validated type {expected_type} for field {field_name}")


In [0]:
# from pyspark.sql.types import StructType, StructField, StringType, BooleanType, TimestampType, IntegerType




In [0]:
# from datetime import datetime

# def validate_AppealType_content():
#     schema = StructType([
#         StructField("CaseNo", StringType(), True),
#         StructField("CasePrefix", StringType(), True),
#         StructField("OutOfTimeIssue", BooleanType(), True),
#         StructField("DateLodged", TimestampType(), True),
#         StructField("DateAppealReceived", TimestampType(), True),
#         StructField("CentreId", IntegerType(), True),
#         StructField("NationalityId", IntegerType(), True),
#         StructField("AppealTypeId", IntegerType(), True),
#         StructField("DeportationDate", TimestampType(), True),
#         StructField("RemovalDate", TimestampType(), True),
#         StructField("VisitVisaType", IntegerType(), True),
#         StructField("DateOfApplicationDecision", TimestampType(), True),
#         StructField("HORef", StringType(), True),
#         StructField("InCamera", BooleanType(), True),
#         StructField("CourtPreference", IntegerType(), True),
#         StructField("LanguageId", IntegerType(), True),
#         StructField("Interpreter", IntegerType(), True),
#         StructField("RepresentativeId", IntegerType(), True),
#         StructField("CaseRepName", StringType(), True)
#     ])
    
#     data = [
#         ("12345", "CP", True, datetime(2025, 6, 24, 0, 0, 0), datetime(2025, 6, 24, 0, 0, 0), 1, 1, 1, datetime(2025, 6, 24, 0, 0, 0), datetime(2025, 6, 24, 0, 0, 0), 1, datetime(2025, 6, 24, 0, 0, 0), "HO123", True, 1, 1, 1, 1, "RepName")
#     ]
    
#     df = spark.createDataFrame(data, schema)

#     appealtype_df = AppealType(df)
#     appealtype_df.display()

#     ### Some assert Content Test e.g appealReferenceNumber is in format first 3 characters are letters followed by/ followed by 3 numbers etc

# validate_AppealType_content()

In [0]:
# import json

# first_row = df_final.select("JSON_Content").first()
# json_content = first_row["JSON_Content"]
# parsed_json = json.loads(json_content)

# def validate_json_types(parsed_json):
#     expected_types = {
#         "appealReferenceNumber": str,
#         "appealType": list,
#         "Transactions": list
#     }
    
#     inner_expected_types = {
#         "Transactions": {
#             "transactionId": str,
#             "amount": float,
#             "date": str
#         },
#         "appealType": {
#             "appealReferenceNumber": str,
#             "isAppealReferenceNumberAvailable": str
#         }
#     }
    
#     for key, expected_type in expected_types.items():
#         if key in parsed_json:
#             if not isinstance(parsed_json[key], expected_type):
#                 print(f"Validation failed: Key '{key}' has incorrect type. Expected {expected_type}, got {type(parsed_json[key])}")
#                 raise TypeError(f"Key '{key}' has incorrect type. Expected {expected_type}, got {type(parsed_json[key])}")
#             else:
#                 print(f"Validation passed: Key '{key}' has correct type {expected_type}")
#                 if key in inner_expected_types:
#                     for item in parsed_json[key]:
#                         for inner_key, inner_expected_type in inner_expected_types[key].items():
#                             if inner_key in item:
#                                 if not isinstance(item[inner_key], inner_expected_type):
#                                     print(f"Validation failed: Key '{inner_key}' in '{key}' has incorrect type. Expected {inner_expected_type}, got {type(item[inner_key])}")
#                                     raise TypeError(f"Key '{inner_key}' in '{key}' has incorrect type. Expected {inner_expected_type}, got {type(item[inner_key])}")
#                                 else:
#                                     print(f"Validation passed: Key '{inner_key}' in '{key}' has correct type {inner_expected_type}")
#                                 if inner_key == "isAppealReferenceNumberAvailable" and item[inner_key] not in ["YES", "no"]:
#                                     print(f"Validation failed: Key '{inner_key}' in '{key}' has invalid value. Expected 'YES' or 'no', got {item[inner_key]}")
#                                     raise ValueError(f"Key '{inner_key}' in '{key}' has invalid value. Expected 'YES' or 'no', got {item[inner_key]}")

# validate_json_types(parsed_json)
# print(json.dumps(parsed_json, indent=4))

### Analysis

In [0]:
# df_final = main_paymentPending()
# display(df_final.select("*"))

In [0]:
# import json

# first_row = df_final.filter(df_final["CaseNo"] == "HU/00035/2017").select("JSON_Content").first()
# json_content = first_row["JSON_Content"]
# parsed_json = json.loads(json_content)
# display(parsed_json)

In [0]:
# %sql
# -- TODO: this query was left incomplete (no table after FROM); name the table before re-enabling the cell
# select * from

In [0]:
# from pyspark.sql.functions import col, lit, when, coalesce, collect_list, struct

# AppealState = "paymentPending"  # Define AppealState variable

# silver_m1 = spark.table("ariadm_active_appeals.silver_appealcase_detail").filter(col("TargetState") == lit(AppealState)).distinct()

# df = silver_m1.select(
#     col("CaseNo"),
#     when(
#         (col("representation") == 'LR') | (col("representation") == 'AIP'),
#         coalesce(col("appealType"), lit(""))
#     ).otherwise("").alias("appealType"),
#     when(
#         (col("representation") == 'LR') | (col("representation") == 'AIP'),
#         coalesce(col("hmctsCaseCategory"), lit(""))
#     ).otherwise("").alias("hmctsCaseCategory"),
#     col("CaseNo").alias("appealReferenceNumber"),
#     when(
#         (col("representation") == 'LR') | (col("representation") == 'AIP'),
#         coalesce(col("appealTypeDescription"), lit(""))
#     ).otherwise("").alias("appealTypeDescription"),
#     when(
#         (col("representation") == 'LR'),
#         col("caseManagementCategory")
#     ).otherwise(lit(None)).alias("caseManagementCategory"),
#     lit("YES").alias("isAppealReferenceNumberAvailable"),
#     lit("").alias("ccdReferenceNumberForDisplay")
# )

# df = df.groupBy(col("CaseNo")).agg(
#     collect_list(
#         struct(
#             'appealType', 'appealReferenceNumber', 'hmctsCaseCategory', 'appealTypeDescription', 'caseManagementCategory', 'isAppealReferenceNumberAvailable','ccdReferenceNumberForDisplay'
#         )
#     ).alias("appealType")
#     )

# # .withColumn("caseManagementCategory", 
# #     expr("""
# #     struct(
# #         struct(
# #             caseManagementCategory as code,
# #             caseManagementCategory as label
# #         ) as value,
# #         array(
# #             struct(
# #                 caseManagementCategory as code,
# #                 caseManagementCategory as label
# #             )
# #         ) as list_items
# #     )""")

# display(df)

In [0]:
# from pyspark.sql.functions import col, count

# # Reading tables into DataFrames and labeling as M1 to M9
# M1 = spark.table("ariadm_active_appeals.silver_appealcase_detail").distinct()
# M2 = spark.table("ariadm_active_appeals.silver_caseapplicant_detail")
# M3 = spark.table("ariadm_active_appeals.silver_status_detail")
# M4 = spark.table("ariadm_active_appeals.silver_transaction_detail")
# M5 = spark.table("ariadm_active_appeals.silver_link_detail")
# M6 = spark.table("ariadm_active_appeals.silver_adjudicator_detail")
# M7 = spark.table("ariadm_active_appeals.silver_appealcategory_detail")
# M8 = spark.table("ariadm_active_appeals.silver_documentsreceived_detail")
# M9 = spark.table("ariadm_active_appeals.silver_history_detail")

# # Function to check for duplicates
# def check_duplicates(df, table_name):
#     duplicates = df.groupBy("caseno").agg(count("*").alias("count")).filter(col("count") > 1)
#     if duplicates.count() > 0:
#         displayHTML(f"<span style='color:red;'>&#x274C; Table {table_name} has duplicates.</span>")
#     else:
#         displayHTML(f"<span style='color:green;'>&#x2705; Table {table_name} has no duplicates.</span>")

# # Check for duplicates in each table
# check_duplicates(M1, "silver_appealcase_detail")
# check_duplicates(M2, "silver_caseapplicant_detail")
# check_duplicates(M3, "silver_status_detail")
# check_duplicates(M4, "silver_transaction_detail")
# check_duplicates(M5, "silver_link_detail")
# check_duplicates(M6, "silver_adjudicator_detail")
# check_duplicates(M7, "silver_appealcategory_detail")
# check_duplicates(M8, "silver_documentsreceived_detail")
# check_duplicates(M9, "silver_history_detail")