In [0]:
# Load config and set up environment variables.
#
# Reads the environment name and landing-zone key from the JSON config on DBFS, fetches the
# service-principal credentials from the secret scope, registers OAuth settings for every
# storage account on the Spark session, and defines the base paths used by later cells.
state_under_test = "paymentPending"
from pyspark.sql import functions as F


config = spark.read.option("multiline", "true").json("dbfs:/configs/config.json")

# Fetch the first row once; each call to .first() would otherwise run its own Spark job.
config_row = config.first()
if config_row is None:
    raise ValueError("dbfs:/configs/config.json contains no rows; cannot resolve 'env' / 'lz_key'")

env_name = config_row["env"].strip().lower()
lz_key = config_row["lz_key"].strip().lower()

KeyVault_name = f"ingest{lz_key}-meta002-{env_name}"

# Service principal credentials.
# NOTE: the secret names are lookup keys; do not "correct" their spelling here without
# renaming the secrets themselves.
client_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-ID")
client_secret = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-SECRET")
tenant_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-TENANT-ID")

# Storage account names
curated_storage = f"ingest{lz_key}curated{env_name}"
checkpoint_storage = f"ingest{lz_key}xcutting{env_name}"
raw_storage = f"ingest{lz_key}raw{env_name}"
landing_storage = f"ingest{lz_key}landing{env_name}"
external_storage = f"ingest{lz_key}external{env_name}"


def _configure_oauth(storage_account):
    """Register the service-principal OAuth settings for one storage account.

    Sets the five `fs.azure.account.*` keys for `<storage_account>.dfs.core.windows.net`
    on the active Spark session, using the client id, client secret and tenant id read above.
    """
    host = f"{storage_account}.dfs.core.windows.net"
    spark.conf.set(f"fs.azure.account.auth.type.{host}", "OAuth")
    spark.conf.set(f"fs.azure.account.oauth.provider.type.{host}", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
    spark.conf.set(f"fs.azure.account.oauth2.client.id.{host}", client_id)
    spark.conf.set(f"fs.azure.account.oauth2.client.secret.{host}", client_secret)
    spark.conf.set(f"fs.azure.account.oauth2.client.endpoint.{host}", f"https://login.microsoftonline.com/{tenant_id}/oauth2/token")


# Same settings for every account: curated (Delta tables), checkpoint, raw, landing, external.
for _storage_account in (
    curated_storage,
    checkpoint_storage,
    raw_storage,
    landing_storage,
    external_storage,
):
    _configure_oauth(_storage_account)

# Base paths for use in subsequent cells (all live in the curated storage account).
bronze_path = f"abfss://bronze@{curated_storage}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/"
silver_path = f"abfss://silver@{curated_storage}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/"
audit_path = f"abfss://silver@{curated_storage}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/AUDIT/{state_under_test}"
gold_path = f"abfss://gold@{curated_storage}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/{state_under_test}"
 
 
# Print all variables
# variables = {
#     # "read_hive": read_hive,
    
#     "bronze_path": bronze_path,
#     "silver_path": silver_path,
#     "audit_path": audit_path,
#     "gold_path": gold_path,
#     "key_vault": KeyVault_name,
#     "AppealState": state_under_test
 
# }
 
# display(variables)

import json

# Get the latest JSON folder under the gold path.
# The listing is the only step here that can fail at runtime, so it is the part that is guarded;
# the error is reported and re-raised because nothing below can work without the folder.
try:
    gold_listing = dbutils.fs.ls(f"{gold_path}/")
    if not gold_listing:
        raise FileNotFoundError(f"No output folders found under {gold_path}/")
    # The last entry of the listing is treated as the latest folder.
    # NOTE(review): this relies on the order in which dbutils.fs.ls returns entries - confirm.
    json_location = gold_listing[-1]
    latest_json_location = json_location.name
    # Listing the chosen folder fails fast if it cannot be read.
    dbutils.fs.ls(f"{gold_path}/{latest_json_location}")
except Exception as e:
    print(f"Error during fetch: {str(e)}")
    raise

# Set paths (plain string formatting; the later read of each path is what touches storage)
json_path = f"{gold_path}/{latest_json_location}/JSON/"
# json_path = f"{gold_path}/{latest_json_location}/INVALID_JSON/"
M1_silver = f"{silver_path}/silver_appealcase_detail"
M1_bronze = f"{bronze_path}/bronze_appealcase_crep_rep_floc_cspon_cfs"
M2_silver = f"{silver_path}/silver_caseapplicant_detail"
M3_silver = f"{silver_path}/silver_status_detail"
C = f"{silver_path}/silver_appealcategory_detail"
bhc = f"{bronze_path}/bronze_hearing_centres"
bat = f"{bronze_path}/bronze_appealtype"
docsr = f"{bronze_path}/bronze_documentsreceived"
apl_audit = f"{audit_path}/apl_active_payment_pending_cr_audit_table/"
sh = f"{silver_path}/silver_history_detail"

# Create and load DataFrames
def _read_delta(location):
    """Return a DataFrame read in Delta format from `location`."""
    return spark.read.format("delta").load(location)


json_data = spark.read.format("json").load(json_path)

# Each name below holds a path string on entry and is rebound to the DataFrame read from it.
M1_silver = _read_delta(M1_silver)
M1_bronze = _read_delta(M1_bronze)
M2_silver = _read_delta(M2_silver)
M3_silver = _read_delta(M3_silver)
C = _read_delta(C)
bhc = _read_delta(bhc)
bat = _read_delta(bat)
docsr = _read_delta(docsr)
apl_audit = _read_delta(apl_audit)
# `sh` keeps its path string; the DataFrame goes to a separate name.
sh_audit = _read_delta(sh)

#Can be removed later, added to allow developing of code in this notebook to begin with before moving to func files
from pyspark.sql.functions import (
    col, when, lit, array, struct, collect_list, 
    max as spark_max, date_format, row_number, expr, 
    size, udf, coalesce, concat_ws, concat, trim, year, split, datediff,
    collect_set, current_timestamp,transform, first, array_contains
)

In [0]:
###############################
#UPDATE BRONZE DATA SCRIPT FOR PAYMENT PENDING.
#
#NOTE: The code below updates bronze data that would not pass the DQ expectation checks because of
#issues in the data that will be resolved before go-live; the updates are needed to get all the data
#through the checks and sent to CCD in the meantime
###############################
# NOTE(review): the wildcard import below brings every public name from pyspark.sql.functions
# into the notebook namespace. That module defines functions such as `max` (which the setup
# cell imports explicitly as `spark_max`), so the Python built-ins of the same name are
# shadowed for every cell that runs after this one.
from pyspark.sql.functions import *
# DeltaTable is used by the cells below to read and MERGE into the bronze history table.
from delta.tables import DeltaTable

######################
#TO FIX PP DATA FROM FIRST STAGING DATA CUT (now superseded by the new data cut)
######################
# BirthDate / appellantDateOfBirth

# bronze_table = DeltaTable.forName(spark,"ariadm_active_appeals.bronze_appealcase_caseappellant_appellant")

# display(bronze_table.toDF().filter(col("CaseNo").isin("HU/00278/2025", "HU/00455/2025", "HU/00472/2025" )).select("CaseNo", "BirthDate"))

# bronze_table.update(
#     condition=col("CaseNo").isin("HU/00278/2025", "HU/00455/2025", "HU/00472/2025"),
#     set={"BirthDate": lit("2000-02-01T00:00:00Z")}
# )

# display(bronze_table.toDF().filter(col("CaseNo").isin("HU/00278/2025", "HU/00455/2025", "HU/00472/2025" )).select("CaseNo", "BirthDate"))


# #################
# #valid_appellantNationalitiesDescription_not_null
# #and
# #valid_appellantNationalities_not_null
# #NationalityId
# #Where No mapping required for 201/203

# bronze_table = DeltaTable.forName(spark, "ariadm_active_appeals.bronze_appealcase_crep_rep_floc_cspon_cfs")

# display(bronze_table.toDF().filter(col("CaseNo").isin("HU/00302/2025", "HU/00569/2025", "HU/00586/2025","HU/00560/2025" )).select("CaseNo", "NationalityId"))

# bronze_table.update(
#     condition=col("CaseNo").isin("HU/00302/2025", "HU/00569/2025", "HU/00586/2025","HU/00560/2025"),
#     set={"NationalityId": lit("41")}
# )

# display(bronze_table.toDF().filter(col("CaseNo").isin("HU/00302/2025", "HU/00569/2025", "HU/00586/2025","HU/00560/2025" )).select("CaseNo", "NationalityId"))

# #################
# #valid_oocAddressLine1 valid_oocAddressLine2
# #changing null values to actual values

# bronze_table = DeltaTable.forName(spark, "ariadm_active_appeals.bronze_appealcase_crep_rep_floc_cspon_cfs")
# display(bronze_table.toDF().filter(col("CaseNo").isin("HU/00185/2025", "HU/02151/2024")).select("CaseNo", "CaseRep_Address1", "CaseRep_Address2", "CaseRep_Address3", "CaseRep_Address4" , "CaseRep_Address5", "CaseRep_Postcode"))

# bronze_table.update(
#     condition=col("CaseNo").isin("HU/00185/2025", "HU/02151/2024"),
#     set={"CaseRep_Address1": lit("925 Lisa Plains Apt. 642X"),
#          "CaseRep_Address2" : lit("Hill SquareX"),
#         "CaseRep_Address3" : lit("LynchhavenX"),
#         "CaseRep_Address4" : lit("AustraliaX"),
#         "CaseRep_Address5" : lit("NLX"),
#         # "CaseRep_Postcode" : lit("Hill SquareX"),
         
#          }
# )


# display(bronze_table.toDF().filter(col("CaseNo").isin("HU/00185/2025", "HU/02151/2024")).select("CaseNo", "CaseRep_Address1", "CaseRep_Address2", "CaseRep_Address3", "CaseRep_Address4" , "CaseRep_Address5", "CaseRep_Postcode"))


# #################
# #valid_oocAddressLine1 valid_oocAddressLine2
# #change in ooc4 ReunionX to GuamX

# bronze_table = DeltaTable.forName(spark, "ariadm_active_appeals.bronze_appealcase_crep_rep_floc_cspon_cfs")
# display(bronze_table.toDF().filter(col("CaseNo").isin("HU/02191/2024", "HU/01475/2024")).select("CaseNo", "CaseRep_Address1", "CaseRep_Address2", "CaseRep_Address3", "CaseRep_Address4" , "CaseRep_Address5", "CaseRep_Postcode"))

# bronze_table.update(
#     condition=col("CaseNo").isin("HU/01475/2024", "HU/02191/2024"),
#     set={
#         "CaseRep_Address4" : lit("AustraliaX")                 
#          }
# )


# display(bronze_table.toDF().filter(col("CaseNo").isin("HU/02191/2024", "HU/01475/2024")).select("CaseNo", "CaseRep_Address1", "CaseRep_Address2", "CaseRep_Address3", "CaseRep_Address4" , "CaseRep_Address5", "CaseRep_Postcode"))



# ################################

# cases_to_update = ['EA/02806/2023',
# 'HU/00575/2025',
# 'HU/00581/2025',
# 'HU/00447/2025',
# 'HU/00574/2023',
# 'HU/00591/2025',
# 'EA/00588/2025',
# 'EA/00551/2025',
# 'HU/00304/2025',
# 'EA/00495/2025',
# 'EA/00560/2025',
# 'EA/06826/2022',
# 'EA/00490/2025',
# 'EA/00554/2025',
# 'EA/01778/2024',
# 'HU/00562/2025',
# 'EA/00552/2025',
# 'HU/00511/2025',
# 'EA/00483/2025',
# 'EA/09676/2022',
# 'EA/00493/2025',
# 'HU/00224/2025',
# 'EA/00496/2025',
# 'EA/00538/2025',
# 'EA/00558/2025',
# 'EA/08372/2022',
# 'HU/00822/2024',
# 'HU/00574/2025',
# 'EA/02065/2024',
# 'HU/00442/2025',
# 'EA/00586/2025',
# 'HU/00590/2025',
# 'HU/00569/2025',
# 'EA/00557/2025',
# 'HU/02346/2024',
# 'EA/00562/2025',
# 'HU/00573/2025',
# 'HU/00571/2025',
# 'EA/01893/2023',
# 'EA/00584/2025',
# 'HU/00579/2025',
# 'HU/00555/2025',
# 'HU/00583/2025',
# 'EA/00591/2025',
# 'EA/00556/2025',
# 'EA/00497/2025',
# 'HU/01972/2023',
# 'EA/00437/2025',
# 'HU/00577/2025',
# 'EA/00585/2025',
# 'HU/00252/2025',
# 'HU/00557/2025',
# 'EA/00485/2025',
# 'HU/00563/2025',
# 'HU/00278/2025',
# 'EA/00559/2025',
# 'EA/00553/2025',
# 'HU/00578/2025',
# 'HU/00445/2025',
# 'HU/00325/2025',
# 'EA/00555/2025',
# 'HU/00572/2025',
# 'HU/00582/2025',
# 'EA/00587/2025',
# 'HU/00638/2024',
# 'HU/00453/2025',
# "EA/00289/2025"
# ]


# bronze_table = DeltaTable.forName(spark, "ariadm_active_appeals.bronze_appealcase_caseappellant_appellant")

# display(bronze_table.toDF().filter(col("CaseNo").isin(cases_to_update)).select("CaseNo", "Appellant_Address4"))

# bronze_table.update(
#     condition=col("CaseNo").isin(cases_to_update),
#     set={
#         "Appellant_Address4" : lit("AustraliaX")                 
#          }
# )


# display(bronze_table.toDF().filter(col("CaseNo").isin(cases_to_update)).select("CaseNo", "Appellant_Address4"))

##############################
#PP Data After new data cut 13/2/2026
##############################

#GreeceX

# "EA/06826/2022", "EA/08372/2022", "EA/09676/2022", "HU/02151/2024"

# bronze_table = DeltaTable.forName(spark, "ariadm_active_appeals.bronze_appealcase_caseappellant_appellant")
# display(bronze_table.toDF().filter(col("CaseNo").isin("EA/06826/2022", "EA/08372/2022", "EA/09676/2022", "HU/02151/2024")).select("CaseNo", "Appellant_Address4"))

# # bronze_table.update(
# #     condition=col("CaseNo").isin("EA/06826/2022", "EA/08372/2022", "EA/09676/2022", "HU/02151/2024"),
# #     set={
# #         "Appellant_Address4" : lit("AustraliaX")                 
# #          }
# # )


# display(bronze_table.toDF().filter(col("CaseNo").isin("EA/06826/2022", "EA/08372/2022", "EA/09676/2022", "HU/02151/2024")).select("CaseNo", "Appellant_Address4"))


# HearingCentre related

The history table used below needs changing from the silver table to the bronze one: `hive_metastore.ariadm_active_appeals.bronze_history`

In [0]:
#################
# hearingCentre
#
# Working set for the hearing-centre fixes: bronze history rows of HistType 6, joined to
# their appeal case on CaseNo and limited to the CentreIds listed below.

history_table = DeltaTable.forName(spark, "ariadm_active_appeals.bronze_history").toDF()
appeal_case = DeltaTable.forName(
    spark, "ariadm_active_appeals.bronze_appealcase_crep_rep_floc_cspon_cfs"
).toDF()

# CentreIds in scope (13 appeared twice in the original list; once is enough for isin).
centre_ids_in_scope = [517, 406, 522, 13, 296, 55, 101, 77, 476, 79, 37]

same_case = appeal_case.CaseNo == history_table.CaseNo
in_scope = appeal_case.CentreId.isin(centre_ids_in_scope) & (history_table.HistType == 6)

hearingCentre_table = history_table.join(appeal_case, same_case, "inner").filter(in_scope)

hearingCentre_table.display()

In [0]:
# Mark the joined table for caching, then count it; the count is the action that runs the query.
first_count = hearingCentre_table.cache().count()
print(f"First count: {first_count}")

## Req 1
(1) If there are 1 or fewer entries in the history table and CentreId IN (77,476,13,79,37), set the Comment to be the same as the Hearing Centre description

77	Arnhem House, xxxxxxxxxx, xxxxxx

476	Arnhem House (Exceptions)

13	Loughborough

79	North Shields (Kings Court)

37	Not known at this time

(this means we will exercise the assignHearingCentre logic for these cases)

_Avi interpret: where in the hearingCentre table we have CaseNos that only have one HistoryId and a matching CentreId of (77,476,13,79,37), map their Comment accordingly to the specified description_

In [0]:
print(f"First count: {first_count}")

# Req 1 scope: rows whose appeal-case CentreId is one of the five ids below.
req1_centre_ids = [77, 476, 13, 79, 37]
req1_rows = hearingCentre_table.where(appeal_case.CentreId.isin(req1_centre_ids))

# CaseNo is selected through history_table so that the history side of the join is used.
hearingCentre_table1 = req1_rows.select(
    history_table.CaseNo, "HistoryId", "Comment", "CentreId"
)

second_count = hearingCentre_table1.count()
print(f"Second count: {second_count}")

hearingCentre_table1.display()

In [0]:
# Number of Req 1 rows per CaseNo
counts_df = hearingCentre_table1.groupBy("CaseNo").count()

# CaseNos that occur exactly once in the Req 1 rows
single_entry_cases = counts_df.where(F.col("count") == 1).select("CaseNo")

# Keep only the Req 1 rows belonging to those CaseNos (inner join on CaseNo)
filtered_table = hearingCentre_table1.join(single_entry_cases, on="CaseNo", how="inner")

filtered_table.display()

In [0]:
from pyspark.sql import functions as F

# Req 1: replace Comment with the description that belongs to the row's CentreId.
# Rows with any other CentreId keep their existing Comment.
centre_id = F.col("CentreId")
req1_comment = (
    F.when(centre_id == 77, "Arnhem House, xxxxxxxxxx, xxxxxx")
    .when(centre_id == 476, "Arnhem House (Exceptions), xxxxxxxxx, xxxxxx")
    .when(centre_id == 13, "Loughborough, xxxxxxxxx, xxxxxx")
    .when(centre_id == 79, "North Shields (Kings Court), xxxxxxxxx, xxxxxx")
    .when(centre_id == 37, "Not known at this time, xxxxxxxxx, xxxxxx")
    .otherwise(F.col("Comment"))
)
final_mapped_table = filtered_table.withColumn("Comment", req1_comment)

display(final_mapped_table)

# Distinct CaseNos touched by Req 1, collected to the driver for the later check cell.
req1_case_rows = final_mapped_table.select("CaseNo").distinct().collect()
cases_to_update = [case_row["CaseNo"] for case_row in req1_case_rows]
print(f"Ready to update {len(cases_to_update)} cases.")

In [0]:
# Write the Req 1 comments back to bronze_history with a Delta MERGE.
delta_history = DeltaTable.forName(spark, "ariadm_active_appeals.bronze_history")

req1_merge = delta_history.alias("target").merge(
    source=final_mapped_table.alias("source"),
    condition="target.HistoryId = source.HistoryId",  # rows are paired on HistoryId
)

# Only a matched-update clause is defined: Comment on matched target rows takes the source value.
req1_merge.whenMatchedUpdate(set={"Comment": "source.Comment"}).execute()

In [0]:
# Show the bronze_history rows of the Req 1 cases so the seeded comments can be checked by eye.
req1_case_filter = col("CaseNo").isin(cases_to_update)

delta_history.toDF().where(req1_case_filter).orderBy("HistType").display()

## Req 2
(2) IF CentreId IN (77,476,13,79,37) AND there are >1 rows:

Set to any of:

Alloa Sheriff Court

Belfast

Belfast - Laganside

Birmingham IAC (Priory Courts)

Birmingham Magistrates Court (VLC)

Bradford

Glasgow (Eagle Building)

Glasgow (Tribunals Centre)

Harmondsworth

Hatton Cross

Hatton Cross (Fast Track)

Hendon Magistrates Court (HX)

Nottingham Justice Centre

Tameside Magistrates Court

Taylor House

Taylor House (Field House)

Taylor House (HX)

Yarl's Wood

ZZ(DNU)Birmingham IAC Sheldon Court

Arnhem House 

Arnhem House (Exceptions)

Loughborough

North Shields (Kings Court)

_Avi interpret: where in the hearingCentre table we have CaseNos that have more than one HistoryId and a matching CentreId of (77,476,13,79,37), map their Comments to any of the specified fields_

In [0]:
print(f"First count: {first_count}")

# Req 2 scope: the same five CentreIds as Req 1, this time keeping HistType as well.
req2_centre_ids = [77, 476, 13, 79, 37]
req2_rows = hearingCentre_table.where(appeal_case.CentreId.isin(req2_centre_ids))

# CaseNo is selected through history_table so that the history side of the join is used.
hearingCentre_table2 = req2_rows.select(
    history_table.CaseNo, "HistoryId", "Comment", "CentreId", "HistType"
)

second_count = hearingCentre_table2.count()
print(f"Second count: {second_count}")

hearingCentre_table2.display()

In [0]:
# Number of Req 2 rows per CaseNo
counts_df = hearingCentre_table2.groupBy("CaseNo").count()

# CaseNos that occur MORE than once in the Req 2 rows.
# (The variable keeps its historical name even though it now holds multi-entry cases.)
single_entry_cases = counts_df.where(F.col("count") > 1).select("CaseNo")

# Keep only the Req 2 rows belonging to those CaseNos (inner join on CaseNo)
filtered_table_2 = hearingCentre_table2.join(single_entry_cases, on="CaseNo", how="inner")

filtered_table_2.display()

In [0]:
from pyspark.sql import functions as F

# Hearing-centre names that Req 2 allows as a Comment value.
# "Tameside Magistrates Court" was previously misspelled "Magistartes"; the spelling now matches
# the one used in the Req 3 list. NOTE(review): confirm against the hearing-centre reference data.
courts = [
    "Alloa Sheriff Court", "Belfast", "Belfast - Laganside",
    "Birmingham IAC (Priory Courts)", "Birmingham Magistrates Court (VLC)",
    "Bradford", "Glasgow (Eagle Building)", "Glasgow (Tribunals Centre)",
    "Harmondsworth", "Hatton Cross", "Hatton Cross (Fast Track)",
    "Hendon Magistrates Court (HX)", "Nottingham Justice Centre",
    "Tameside Magistrates Court", "Taylor House", "Taylor House (Field House)",
    "Taylor House (HX)", "Yarl's Wood", "ZZ(DNU)Birmingham IAC Sheldon Court",
    "Arnhem House", "Arnhem House (Exceptions)", "Loughborough",
    "North Shields (Kings Court)"
]

# Array column holding every candidate name, so one can be picked per row by index.
court_array = F.array([F.lit(c) for c in courts])

# For the Req 2 CentreIds, replace Comment with an entry of court_array chosen by a seeded
# random index; any other row keeps its Comment.
# NOTE(review): the pick is evaluated each time this DataFrame is computed, so the values
# displayed here may not be the exact ones a later action writes - confirm whether that matters.
final_mapped_table_2 = filtered_table_2.withColumn(
    "Comment",
    F.when(
        F.col("CentreId").isin(77, 476, 13, 79, 37),
        court_array[(F.rand(seed=5) * len(courts)).cast("int")]
    ).otherwise(F.col("Comment"))
)

final_mapped_table_2.display()

# Distinct CaseNos, so the printed number counts cases rather than history rows
# (consistent with how cases_to_update is built for Req 1).
cases_to_update_2 = [row[0] for row in final_mapped_table_2.select("CaseNo").distinct().collect()]
print(f"Ready to update {len(cases_to_update_2)} cases.")

In [0]:
# Write the Req 2 comments back to bronze_history with a Delta MERGE.
delta_history = DeltaTable.forName(spark, "ariadm_active_appeals.bronze_history")

req2_source = final_mapped_table_2.alias("source")
req2_match = "target.HistoryId = source.HistoryId"  # rows are paired on HistoryId

(
    delta_history.alias("target")
    .merge(req2_source, req2_match)
    # Matched target rows take their Comment from the mapped table.
    .whenMatchedUpdate(set={"Comment": "source.Comment"})
    .execute()
)

In [0]:
# Spot-check one case in bronze_history after the Req 2 merge.

# Alternative: show every Req 2 case instead of a single one.
# delta_history.toDF().filter(
#   col("CaseNo").isin(cases_to_update_2)
# ).orderBy("HistType").display()

is_hist_type_6 = col("HistType") == 6
is_sample_case = col("CaseNo") == "AA/03752/2006"

delta_history.toDF().where(is_hist_type_6 & is_sample_case).display()


## Req 3
(3) If CentreId IN (517,406,522,13,296,55,101):

For these cases we want to set the Comment at MAX(HistoryId) as 'Castle Park Storage', 'Field House','Field House (TH)', 'UT (IAC) Cardiff CJC',

'UT (IAC) Hearing in Field House',

'UT (IAC) Hearing in Man CJC'

(same as what the CentreId is)



AND THEN for previous comments we set to any of:



Alloa Sheriff Court, xxxxxxxxxx

Belfast, xxxxxxxxxx

Belfast - Laganside

Birmingham IAC (Priory Courts)

Birmingham Magistrates Court (VLC)

Bradford

Glasgow (Eagle Building)

Glasgow (Tribunals Centre)

Harmondsworth

Hatton Cross

Hatton Cross (Fast Track)

Hendon Magistrates Court (HX)

Nottingham Justice Centre

Tameside Magistrates Court

Taylor House

Taylor House (Field House)

Taylor House (HX)

Yarl's Wood

ZZ(DNU)Birmingham IAC Sheldon Court

Arnhem House

Arnhem House (Exceptions)

Loughborough

North Shields (Kings Court)

_Avi's interpretation: where CentreId is in (517,406,522,13,296,55,101), I must seed the Comment of the row with the largest HistoryId. The values to seed are given by the mappings in the reference-data tab of the mapping document. For the remaining rows of the same CaseNo, whose HistoryIds are lower than the maximum, I pick a value at random from the long list above and seed that as the Comment._

In [0]:
print(f"First count: {first_count}")

# Req 3 scope: rows whose appeal-case CentreId is one of the seven ids below.
req3_centre_ids = [517, 406, 522, 13, 296, 55, 101]
req3_rows = hearingCentre_table.where(appeal_case.CentreId.isin(req3_centre_ids))

# CaseNo is selected through history_table so that the history side of the join is used.
hearingCentre_table3 = req3_rows.select(
    history_table.CaseNo, "HistoryId", "Comment", "CentreId"
)

second_count = hearingCentre_table3.count()
print(f"Second count: {second_count}")

hearingCentre_table3.display()

In [0]:
from pyspark.sql import functions as F

# Highest HistoryId per CaseNo among the Req 3 rows
max_history_df = (
    hearingCentre_table3
    .groupBy("CaseNo")
    .agg(F.max("HistoryId").alias("MaxHistoryId"))
)

# Attach that maximum to every Req 3 row of the same CaseNo
enriched_table = hearingCentre_table3.join(max_history_df, on="CaseNo", how="inner")

enriched_table.display()

In [0]:
# Descriptions used for every row that is NOT the case's highest HistoryId.
courts = [
    "Alloa Sheriff Court, xxxxxxxxx, xxxxxx",
    "Belfast, xxxxxxxxx, xxxxxx",
    "Birmingham IAC (Priory Courts), xxxxxxxxx, xxxxxx",
    "Bradford, xxxxxxxxx, xxxxxx",
    "Glasgow (Eagle Building), xxxxxxxxx, xxxxxx",
    "Harmondsworth, xxxxxxxxx, xxxxxx",
    "Hatton Cross, xxxxxxxxx, xxxxxx",
    "Nottingham Justice Centre, xxxxxxxxx, xxxxxx",
    "Taylor House, xxxxxxxxx, xxxxxx",
    "Yarl's Wood, xxxxxxxxx, xxxxxx"
]
court_array = F.array(*(F.lit(court_name) for court_name in courts))

centre_id = F.col("CentreId")

# Comment for the row holding the case's highest HistoryId, chosen by CentreId.
# There is no fallback branch, so an unlisted CentreId would yield null here.
latest_row_comment = (
    F.when(centre_id == 517, "UT (IAC) Hearing in Man CJC, xxxxxxxxx, xxxxxx")
    .when(centre_id == 406, "UT (IAC) Hearing in Field House, xxxxxxxxx, xxxxxx")
    .when(centre_id == 522, "UT (IAC) Cardiff CJC, xxxxxxxxx, xxxxxx")
    .when(centre_id == 13, "Loughborough, xxxxxxxxx, xxxxxx")
    .when(centre_id == 296, "Field House (TH), xxxxxxxxx, xxxxxx")
    .when(centre_id == 55, "Field House, xxxxxxxxx, xxxxxx")
    .when(centre_id == 101, "Castle Park Storage, xxxxxxxxx, xxxxxx")
)

# Comment for the older rows: an entry of court_array picked by a seeded random index.
random_court = court_array[(F.rand(seed=5) * len(courts)).cast("int")]

is_latest_row = F.col("HistoryId") == F.col("MaxHistoryId")

# Apply the mapping, then drop the helper column that was only needed for the comparison.
final_mapped_table_3 = (
    enriched_table
    .withColumn("Comment", F.when(is_latest_row, latest_row_comment).otherwise(random_court))
    .drop("MaxHistoryId")
)

display(final_mapped_table_3.orderBy("CaseNo", "HistoryId"))

In [0]:
# Spot-check the Req 3 mapping for one randomly chosen case of a given CentreId.
# Change this to whichever ID you want to check (517, 406, 522, 13, 296, 55, 101).
target_id = 101

# Pick one CaseNo at random among the cases with that CentreId.
sample_rows = (
    final_mapped_table_3.filter(F.col("CentreId") == target_id)
    .select("CaseNo")
    .distinct()
    .orderBy(F.rand())
    .limit(1)
    .collect()
)

if sample_rows:
    sample_case = sample_rows[0][0]
    print(f"Checking results for {target_id} - Random CaseNo: {sample_case}")

    # Display the mapped history rows of that specific case
    final_mapped_table_3.filter(F.col("CaseNo") == sample_case) \
               .select("CaseNo", "HistoryId", "CentreId", "Comment") \
               .orderBy("HistoryId") \
               .display()
else:
    # Previously an empty result raised IndexError and stopped the cell before the
    # case list below was built.
    sample_case = None
    print(f"No cases found for CentreId {target_id}; nothing to spot-check.")

# Distinct CaseNos touched by Req 3. The message now reports this list; it used to print the
# length of the Req 2 list (cases_to_update_2) by mistake.
cases_to_update_3 = [row[0] for row in final_mapped_table_3.select("CaseNo").distinct().collect()]
print(f"Ready to update {len(cases_to_update_3)} cases.")

In [0]:
# Write the Req 3 comments back to bronze_history with a Delta MERGE.
delta_history = DeltaTable.forName(spark, "ariadm_active_appeals.bronze_history")

# Target and source rows are paired on HistoryId; matched rows get the mapped Comment.
comment_update = {"Comment": "source.Comment"}

req3_merge = delta_history.alias("target").merge(
    final_mapped_table_3.alias("source"),
    "target.HistoryId = source.HistoryId",
)
req3_merge.whenMatchedUpdate(set=comment_update).execute()

In [0]:
# Spot-check one case in bronze_history after the Req 3 merge.

# Alternative: show the whole table ordered by HistType.
# delta_history.toDF().orderBy("HistType").display()

req3_sample_filter = (col("HistType") == 6) & (col("CaseNo") == "EA/06652/2020")

delta_history.toDF().where(req3_sample_filter).display()

In [0]:
# Show the silver_history_detail rows of HistType 6 whose Comment contains a comma.
delta_silver_history = DeltaTable.forName(spark, "ariadm_active_appeals.silver_history_detail")

has_comma_comment = (col("HistType") == 6) & col("Comment").contains(",")

delta_silver_history.toDF().where(has_comma_comment).display()