In [0]:
from functools import reduce
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, coalesce, greatest, lit, explode, date_add, current_date, count
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DateType,
    ArrayType,
)
from docx import Document
from docx.shared import Inches

# Root mount path; every bronze table path and reference CSV path used in this
# notebook is built by appending to this prefix.
bronze_mnt = "/mnt/bronze/ARIADM/ACTIVE/APPEALS"

In [0]:
# Paths of the bronze outputs under test (`*_path`) and of the reference CSVs
# they are compared against (`csv_*_path`), all located under `bronze_mnt`.
#
# These are plain string assignments, so they are deliberately not wrapped in
# try/except: the only realistic failure is `bronze_mnt` being undefined, and
# that error should surface immediately rather than leave later cells to fail
# on missing path variables.
csv_results_dir = f"{bronze_mnt}/APPEALS_LLD_SQL_RESULTS"

m1_path = f"{bronze_mnt}/bronze_appealcase_crep_rep_floc_cspon_cfs"
csv_m1_path = f"{csv_results_dir}/M1.csv"
m2_path = f"{bronze_mnt}/bronze_appealcase_caseappellant_appellant"
csv_m2_path = f"{csv_results_dir}/M2.csv"
m3_path = f"{bronze_mnt}/bronze_status_htype_clist_list_ltype_court_lsitting_adj"
csv_m3_path = f"{csv_results_dir}/M3.csv"
m4_path = f"{bronze_mnt}/bronze_appealcase_transaction_transactiontype"
csv_m4_path = f"{csv_results_dir}/M4.csv"
m5_path = f"{bronze_mnt}/bronze_appealcase_link_linkdetail"
csv_m5_path = f"{csv_results_dir}/M5.csv"
m6_path = f"{bronze_mnt}/bronze_caseadjudicator_adjudicator"
csv_m6_path = f"{csv_results_dir}/M6.csv"
c_path = f"{bronze_mnt}/bronze_appealcategory"
csv_c_path = f"{csv_results_dir}/C.csv"
d_path = f"{bronze_mnt}/bronze_documentsreceived"
csv_d_path = f"{csv_results_dir}/D.csv"
h_path = f"{bronze_mnt}/bronze_history"
csv_h_path = f"{csv_results_dir}/H.csv"

print("Paths printed successfully")

Paths printed successfully


# Schema comparison

In this test, the columns of each bronze output are read in and compared against a manually defined list of expected columns, taken from the Active Appeals LLD. The test confirms that every expected column is present in the bronze output, i.e. that the bronze transformations conform to the expected schema. Columns that appear in the output but not in the expected list are not flagged.

In [0]:
# Expected column names for each bronze output, in the order they were
# originally listed. The schema test checks that each name is present in the
# corresponding output, and the per-column content check iterates over them.

expected_m1_columns = [
    # Case-level columns
    "CaseNo", "CasePrefix", "OutOfTimeIssue", "DateLodged", "DateAppealReceived",
    "CentreId", "NationalityId", "AppealTypeId", "DeportationDate", "RemovalDate",
    "VisitVisaType", "DateOfApplicationDecision", "HORef", "InCamera",
    "CourtPreference", "LanguageId", "Interpreter",
    # CaseRep* columns
    "RepresentativeId", "CaseRepName",
    "CaseRepAddress1", "CaseRepAddress2", "CaseRepAddress3", "CaseRepAddress4", "CaseRepAddress5",
    "CaseRepPostcode", "Contact", "CaseRepEmail", "FileSpecificEmail",
    # Rep* columns
    "RepName",
    "RepAddress1", "RepAddress2", "RepAddress3", "RepAddress4", "RepAddress5",
    "RepPostcode", "RepEmail",
    # Sponsor* columns
    "SponsorName", "SponsorForenames",
    "SponsorAddress1", "SponsorAddress2", "SponsorAddress3", "SponsorAddress4", "SponsorAddress5",
    "SponsorPostcode", "SponsorEmail", "SponsorTelephone", "SponsorAuthorisation",
    # Respondent, payment remission and fee columns
    "MainRespondentId", "DeptId",
    "PaymentRemissionRequested", "PaymentRemissionReason",
    "PaymentRemissionGranted", "PaymentRemissionReasonNote",
    "LSCReference", "ASFReferenceNo", "DateCorrectFeeReceived",
]

expected_m2_columns = [
    "CaseNo", "AppellantName", "AppellantForenames", "BirthDate",
    "AppellantEmail", "AppellantTelephone",
    "AppellantAddress1", "AppellantAddress2", "AppellantAddress3",
    "AppellantAddress4", "AppellantAddress5",
    "AppellantPostcode", "AppellantCountryId", "FCONumber",
]

expected_m3_columns = [
    # Status-level columns
    "StatusId", "CaseNo", "CaseStatus", "Outcome", "HearingDate", "CentreId",
    "DecisionDate", "Party", "DateReceived", "OutOfTime", "DecisionReserved",
    # Adj* columns
    "AdjudicatorId", "AdjSurname", "AdjForenames", "AdjTitle",
    "DateOfService", "AdditionalLanguageId",
    # Hearing / listing columns
    "HearingCentre", "CourtName", "ListName", "ListType", "HearingType",
    "StartTime", "TimeEstimate",
    # Judge{1,2,3}FT* columns
    "Judge1FTSurname", "Judge1FTForenames", "Judge1FTTitle",
    "Judge2FTSurname", "Judge2FTForenames", "Judge2FTTitle",
    "Judge3FTSurname", "Judge3FTForenames", "Judge3FTTitle",
    # CourtClerk* columns
    "CourtClerkSurname", "CourtClerkForenames", "CourtClerkTitle",
    "Notes",
]

expected_m4_columns = [
    "CaseNo", "TransactionId", "TransactionTypeId", "ReferringTransactionId",
    "Amount", "TransactionDate", "Status",
    "SumBalance", "SumTotalFee", "SumTotalPay",
]

expected_m5_columns = ["CaseNo", "LinkNo", "ReasonLinkId"]

expected_m6_columns = ["CaseNo", "Required", "JudgeSurname", "JudgeForenames", "JudgeTitle"]

expected_c_columns = ["CaseNo", "CategoryId"]

expected_d_columns = ["CaseNo", "ReceivedDocumentId", "DateReceived"]

expected_h_columns = ["HistoryId", "CaseNo", "HistType", "Comment"]

In [0]:
def _check_schema(label, path, expected_columns):
    """Check that a bronze output contains every expected column.

    Args:
        label: Short name of the output used in messages (e.g. "M1", "C").
        path: Path of the bronze output, loaded with the "delta" format.
        expected_columns: Column names that must be present in the output.

    Returns:
        tuple: ``(df, actual_columns, passed, diff)`` where ``df`` is the loaded
        DataFrame (``None`` if loading failed), ``actual_columns`` is its list
        of column names (empty if loading failed), ``passed`` is True when no
        expected column is missing, and ``diff`` is a description of the
        failure ("" when the test passed).
    """
    try:
        df = spark.read.format("delta").load(path)
    except Exception as e:
        # A table that cannot be loaded fails its own test only; the remaining
        # tables are still checked and every result variable stays defined.
        diff = f"Error during fetch: {str(e)}"
        print(diff)
        return None, [], False, diff

    actual_columns = df.columns
    missing = set(expected_columns) - set(actual_columns)
    if missing:
        # Sorted so the message is stable from run to run.
        diff = f"Missing columns in {label} output: {sorted(missing)}"
        print(diff)
        return df, actual_columns, False, diff

    print(f"{label} test passed")
    return df, actual_columns, True, ""


# One independent check per bronze output. The `*_test` and `*_diff` names are
# read later when the report document is built.
df_m1, actual_m1_columns, m1_test, m1_diff = _check_schema("M1", m1_path, expected_m1_columns)
df_m2, actual_m2_columns, m2_test, m2_diff = _check_schema("M2", m2_path, expected_m2_columns)
df_m3, actual_m3_columns, m3_test, m3_diff = _check_schema("M3", m3_path, expected_m3_columns)
df_m4, actual_m4_columns, m4_test, m4_diff = _check_schema("M4", m4_path, expected_m4_columns)
df_m5, actual_m5_columns, m5_test, m5_diff = _check_schema("M5", m5_path, expected_m5_columns)
df_m6, actual_m6_columns, m6_test, m6_diff = _check_schema("M6", m6_path, expected_m6_columns)
df_c, actual_c_columns, c_test, c_diff = _check_schema("C", c_path, expected_c_columns)
df_d, actual_d_columns, d_test, d_diff = _check_schema("D", d_path, expected_d_columns)
df_h, actual_h_columns, h_test, h_diff = _check_schema("H", h_path, expected_h_columns)

M1 test passed
M2 test passed
M3 test passed
M4 test passed
M5 test passed
M6 test passed
C test passed
D test passed
H test passed



# `validate content`

 Inputs: 
- `csv_path` : path to Mx CSV of ARIA_NLE_BAK data to compare against Mx generated bronze parquet file
- `mx_path` : path to Mx generated bronze parquet file to compare against Mx CSV of ARIA_NLE_BAK data
- `identifier_column` : manually specified column of data to perform joins around

Outputs:
- A summary of the `identifier_column` values that do not match between the parquet and CSV:
- If `extra_in_csv` > 0 : list of `identifier_column` values contained in the CSV that are not in the parquet
- If `missing_in_csv` > 0 : list of `identifier_column` values contained in the parquet that are not in the CSV


This function is to compare the counts of data in the CSV and parquet files. We are testing that the parquet file contains the same key content from the original CSVs. The CSVs that the parquets are being tested against contain data straight from the ARIA_NLE_BAK database. Therefore, by checking that all of the content in the parquet is present in the CSV file, the transformation is tested to ensure it does not drop all data and any records that are dropped by the transformation are recorded.

| Join Type      | Description                                                                 |
| -------------- | --------------------------------------------------------------------------- |
| **inner**      | Only rows with matching keys in both dataframes                             |
| **left**       | All rows from the left dataframe, and matching rows from the right          |
| **right**      | All rows from the right dataframe, and matching rows from the left          |
| **full**       | All rows from both dataframes, with `null` where there's no match           |
| **left\_semi** | Returns rows from the left dataframe where a match **exists** in the right  |
| **left\_anti** | Returns rows from the left dataframe where **no match** exists in the right |

 Left anti joins are used because we want to return counts and lists of entries that are in the left dataframe but not in the right dataframe. 

 For example, the left anti join of CSV and parquet will return everything that is in the CSV but not in the parquet (`extra_in_csv`).

 And the left anti join of parquet and CSV will return everything that is in the parquet but not in the CSV (`missing_in_csv`).

In [0]:
from pyspark.sql.functions import col
from pyspark.sql.functions import trim, lower


def validate_content(csv_path, mx_path, identifier_column):
    """Compare the distinct values of one column between a reference CSV and a bronze output.

    The CSV is read without a header and its columns are renamed positionally
    to the bronze output's column names, so the CSV must have the same number
    of columns, in the same order, as the bronze output.

    Args:
        csv_path: Path of the reference CSV.
        mx_path: Path of the bronze output (loaded with the "delta" format).
        identifier_column: Name of the column whose distinct values are compared.

    Returns:
        str: A summary of the comparison. It contains the phrase
        "match the CSV exactly" when both sides hold the same distinct values.
        Otherwise it lists the values that are in the CSV but missing from the
        parquet (`extra_in_csv`) and/or in the parquet but missing from the CSV
        (`missing_in_csv`). If the comparison itself fails, the error message
        is printed and returned, so callers always receive a string.
    """
    try:
        # Load CSV and parquet
        df_csv_raw = spark.read.option("header", "false").csv(csv_path)
        df_parquet = spark.read.format("delta").load(mx_path)

        # Get column names from parquet and assign to CSV
        df_csv = df_csv_raw.toDF(*df_parquet.columns)

        # Extract distinct identifier values
        parquet_ids = df_parquet.select(identifier_column).distinct()
        csv_ids = df_csv.select(identifier_column).distinct()

        def _only_in_left(left, right):
            # Null-safe equality: a NULL present on both sides counts as a
            # match instead of being reported as a difference.
            # NOTE(review): the CSV is read without a schema, so this relies on
            # Spark's implicit casts when the bronze column is not a string -
            # confirm the formats line up for date/numeric columns.
            condition = left[identifier_column].eqNullSafe(right[identifier_column])
            return left.join(right, condition, "left_anti")

        # CSV -> parquet: values the transformation lost.
        extra_list = [row[identifier_column] for row in _only_in_left(csv_ids, parquet_ids).collect()]
        # parquet -> CSV: values that do not come from the reference data.
        missing_list = [row[identifier_column] for row in _only_in_left(parquet_ids, csv_ids).collect()]

        # Each count is a Spark action, so compute it once.
        counts = (
            f"Distinct {identifier_column}s in parquet: {parquet_ids.count()} \n"
            f"Distinct {identifier_column}s in CSV: {csv_ids.count()} \n"
        )

        # Summary
        if not extra_list and not missing_list:
            return f"All {identifier_column}s in the parquet output match the CSV exactly. \n" + counts

        details = ""
        if extra_list:
            details += (
                f"{len(extra_list)} {identifier_column}(s) exist in CSV and are missing from parquet.\n"
                f"Extra {identifier_column}s: {extra_list} \n"
            )
        if missing_list:
            details += (
                f"{len(missing_list)} {identifier_column}(s) exist in parquet and are missing from CSV.\n"
                f"Missing {identifier_column}s: {missing_list} \n"
            )
        return details + counts

    except Exception as e:
        message = f"Error during {identifier_column} validation: {str(e)}"
        print(message)
        return message


In [0]:
# Key-level content check for every bronze output: (reference CSV, bronze path,
# key column). Results are unpacked in the same order as the rows below.
_content_checks = [
    (csv_m1_path, m1_path, "CaseNo"),
    (csv_m2_path, m2_path, "CaseNo"),
    (csv_m3_path, m3_path, "StatusId"),
    (csv_m4_path, m4_path, "CaseNo"),
    (csv_m5_path, m5_path, "CaseNo"),
    (csv_m6_path, m6_path, "CaseNo"),
    (csv_c_path, c_path, "CaseNo"),
    (csv_d_path, d_path, "CaseNo"),
    (csv_h_path, h_path, "HistoryId"),
]

(
    m1_parquet_result,
    m2_parquet_result,
    m3_parquet_result,
    m4_parquet_result,
    m5_parquet_result,
    m6_parquet_result,
    c_parquet_result,
    d_parquet_result,
    h_parquet_result,
) = [
    validate_content(reference_csv, bronze_path, key_column)
    for reference_csv, bronze_path, key_column in _content_checks
]

print("Content validation results completed")


Content validation results completed


In [0]:
# Phrase that `validate_content` puts in its result only when the distinct
# values of a column are the same in the CSV and the parquet.
_CONTENT_MATCH_MARKER = "match the CSV exactly"


def _find_mismatches(label, csv_path, mx_path, columns):
    """Run `validate_content` on each column and collect the ones that do not match.

    A column counts as a mismatch whenever the validation result is not the
    explicit success message. This covers values missing on either side and
    validations that failed outright (including a `None` result).

    Args:
        label: Short name of the output used in printed messages (e.g. "M1").
        csv_path: Path of the reference CSV.
        mx_path: Path of the bronze output.
        columns: Column names to check one by one.

    Returns:
        list: ``(column, message)`` tuples, one per mismatched column; empty
        when every column matches.
    """
    mismatches = []
    for column in columns:
        result = validate_content(csv_path, mx_path, column)
        if result is None:
            mismatches.append((column, "Content validation failed and returned no result."))
        elif _CONTENT_MATCH_MARKER not in result:
            mismatches.append((column, result))

    if not mismatches:
        print(f"No mismatches found for {label}")
    else:
        print(f"Mismatched Columns for {label}:")
        for column, message in mismatches:
            print(f"\nColumn: {column}\n{message}")

    print("***************************************************")
    return mismatches


# The `*_mismatches` names are read later when the report document is built.
m1_mismatches = _find_mismatches("M1", csv_m1_path, m1_path, expected_m1_columns)
m2_mismatches = _find_mismatches("M2", csv_m2_path, m2_path, expected_m2_columns)
m3_mismatches = _find_mismatches("M3", csv_m3_path, m3_path, expected_m3_columns)
m4_mismatches = _find_mismatches("M4", csv_m4_path, m4_path, expected_m4_columns)
m5_mismatches = _find_mismatches("M5", csv_m5_path, m5_path, expected_m5_columns)
m6_mismatches = _find_mismatches("M6", csv_m6_path, m6_path, expected_m6_columns)
c_mismatches = _find_mismatches("C", csv_c_path, c_path, expected_c_columns)
d_mismatches = _find_mismatches("D", csv_d_path, d_path, expected_d_columns)
h_mismatches = _find_mismatches("H", csv_h_path, h_path, expected_h_columns)

No mismatches found for M1
***************************************************
No mismatches found for M2
***************************************************
No mismatches found for M3
***************************************************
No mismatches found for M4
***************************************************
No mismatches found for M5
***************************************************
No mismatches found for M6
***************************************************
No mismatches found for C
***************************************************
No mismatches found for D
***************************************************
No mismatches found for H
***************************************************


In [0]:
def _add_test_section(document, label, path, schema_passed, schema_diff, content_result, mismatches):
    """Append the results for one bronze output to the report document.

    Args:
        document: The python-docx document being built.
        label: Name of the output as shown in the report (e.g. "M1",
            "'C' AppealCategory").
        path: Path of the bronze output that was tested.
        schema_passed: Result of the schema comparison for this output.
        schema_diff: Description of the schema failure; only written when
            `schema_passed` is false.
        content_result: Summary string returned by the key-level
            `validate_content` call (may be None if that call failed).
        mismatches: ``(column, message)`` tuples from the per-column check.
    """
    document.add_heading(f"{label} transformation test", 2)
    document.add_paragraph(f"Expected columns in {path} tested.")

    if schema_passed:
        document.add_paragraph(f"{label} transformation test passed - all expected columns present in outputs.")
    else:
        document.add_paragraph(f"{label} transformation test failed - some expected columns missing in outputs.")
        document.add_paragraph(schema_diff)

    # A failed validation must be visible in the report, not an empty paragraph.
    document.add_paragraph(content_result or "Content validation did not return a result.")

    if not mismatches:
        document.add_paragraph(
            "No mismatched columns found - all rows in required columns tested in the parquet, "
            "all content in the CSV is found in the parquet."
        )
    else:
        document.add_paragraph(
            "Mismatched columns found - all rows in required columns tested in the parquet, "
            "there is missing data in the parquet file, some data from the CSV has been lost."
        )
        document.add_paragraph("Mismatched columns: " + ", ".join(str(column) for column, _ in mismatches))


document = Document()

# Add title
document.add_heading("Active Appeals Bronze Transformation Tests", 0)

# One section per bronze output. The `*_diff` variable is only read when the
# schema test failed, because it may not exist when the test passed.
_add_test_section(document, "M1", m1_path, m1_test, "" if m1_test else m1_diff, m1_parquet_result, m1_mismatches)
_add_test_section(document, "M2", m2_path, m2_test, "" if m2_test else m2_diff, m2_parquet_result, m2_mismatches)
_add_test_section(document, "M3", m3_path, m3_test, "" if m3_test else m3_diff, m3_parquet_result, m3_mismatches)
_add_test_section(document, "M4", m4_path, m4_test, "" if m4_test else m4_diff, m4_parquet_result, m4_mismatches)
_add_test_section(document, "M5", m5_path, m5_test, "" if m5_test else m5_diff, m5_parquet_result, m5_mismatches)
_add_test_section(document, "M6", m6_path, m6_test, "" if m6_test else m6_diff, m6_parquet_result, m6_mismatches)
_add_test_section(document, "'C' AppealCategory", c_path, c_test, "" if c_test else c_diff, c_parquet_result, c_mismatches)
_add_test_section(document, "'D' DocumentsReceived", d_path, d_test, "" if d_test else d_diff, d_parquet_result, d_mismatches)
_add_test_section(document, "'H' History", h_path, h_test, "" if h_test else h_diff, h_parquet_result, h_mismatches)

# Save document
document.save("appeals_bronze_transformations.docx")