# Access gold layer

In [0]:
#########################
# Setup config
#########################

# Load the environment descriptor. The first row is fetched ONCE and reused:
# every `.first()` call is a separate Spark action against the config file.
config = spark.read.option("multiline", "true").json("dbfs:/configs/config.json")
config_row = config.first()
if config_row is None:
    # `.first()` returns None for an empty DataFrame; fail with a clear message
    # instead of an opaque "NoneType is not subscriptable" error.
    raise ValueError("dbfs:/configs/config.json contains no rows")

env_name = config_row["env"].strip().lower()
lz_key = config_row["lz_key"].strip().lower()

print(f"env_code: {lz_key}")  # This won't be redacted
print(f"env_name: {env_name}")  # This won't be redacted

# Name of the secret scope passed to dbutils.secrets.get below
KeyVault_name = f"ingest{lz_key}-meta002-{env_name}"
print(f"KeyVault_name: {KeyVault_name}")

# Service principal credentials
client_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-ID")
client_secret = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-SECRET")
tenant_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-TENANT-ID")

# Storage account names
curated_storage = f"ingest{lz_key}curated{env_name}"
checkpoint_storage = f"ingest{lz_key}xcutting{env_name}"
raw_storage = f"ingest{lz_key}raw{env_name}"
landing_storage = f"ingest{lz_key}landing{env_name}"
external_storage = f"ingest{lz_key}external{env_name}"

# Apply the same OAuth client-credentials Spark settings to every storage
# account (curated, checkpoint, raw, landing, external), in that order.
oauth_endpoint = f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
for storage_account in (
    curated_storage,
    checkpoint_storage,
    raw_storage,
    landing_storage,
    external_storage,
):
    account_host = f"{storage_account}.dfs.core.windows.net"
    spark.conf.set(f"fs.azure.account.auth.type.{account_host}", "OAuth")
    spark.conf.set(
        f"fs.azure.account.oauth.provider.type.{account_host}",
        "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    )
    spark.conf.set(f"fs.azure.account.oauth2.client.id.{account_host}", client_id)
    spark.conf.set(f"fs.azure.account.oauth2.client.secret.{account_host}", client_secret)
    spark.conf.set(f"fs.azure.account.oauth2.client.endpoint.{account_host}", oauth_endpoint)

AppealState = "paymentPending"

# Setting variables for use in subsequent cells.
# All four paths live in containers of the curated storage account.
curated_host = f"{curated_storage}.dfs.core.windows.net"
bronze_path = f"abfss://bronze@{curated_host}/ARIADM/ACTIVE/CCD/APPEALS/"
silver_path = f"abfss://silver@{curated_host}/ARIADM/ACTIVE/CCD/APPEALS/"
audit_path = f"abfss://silver@{curated_host}/ARIADM/ACTIVE/CCD/APPEALS/AUDIT/{AppealState}"
gold_path = f"abfss://gold@{curated_host}/ARIADM/ACTIVE/CCD/APPEALS/{AppealState}"

# Print all variables
variables = {
    "bronze_path": bronze_path,
    "silver_path": silver_path,
    "audit_path": audit_path,
    "gold_path": gold_path,
    "key_vault": KeyVault_name,
    "AppealState": AppealState,
}

display(variables)

In [0]:
from data_types_truth_files.paymentpending_types import PaymentPendingTypes

In [0]:
# Take the second-to-last entry listed under the gold path and list its JSON folder.
# NOTE(review): the [-2] index presumably skips a trailing entry that is not a run
# folder — confirm against the actual folder layout.
files = dbutils.fs.ls(gold_path)[-2].path
# `json` holds the listing returned by dbutils.fs.ls (note: the name shadows the stdlib module).
json = dbutils.fs.ls(f"{files}/JSON")

In [0]:
from pyspark.sql.functions import input_file_name, regexp_extract, regexp_replace, col

# Paths of every entry in the `json` listing produced by dbutils.fs.ls
path_list = [file_info.path for file_info in json]

# Read all files as multi-line JSON and tag each row with its source file name
raw_df = (
    spark.read
    .option("multiLine", "true")
    .json(path_list)
    .withColumn("raw_filename", input_file_name())
)

# Reference number derived from the file name: capture the text between
# "APPEALS_" and ".json" (e.g. 'EA_00366_2025'), then swap underscores for slashes.
reference_col = regexp_replace(
    regexp_extract(col("raw_filename"), r"APPEALS_(.*)\.json", 1), "_", "/"
)
final_df = raw_df.withColumn("appealReferenceNumber", reference_col).drop("raw_filename")

# Show the reference number first, followed by every column
final_df.select("appealReferenceNumber", "*").display()

In [0]:
# Resolve the JSON folder of the second-to-last entry listed under the gold path.
files = dbutils.fs.ls(gold_path)[-2].path
valid_json_path = f"{files}/JSON"

# The schema comparison itself is invoked in a later cell, after
# compare_truth_to_valid_json has been defined.

In [0]:
def compare_truth_to_valid_json(valid_path, truth_dict):
    """Check the schema Spark infers for JSON data against expected field types.

    The JSON at ``valid_path`` is read with the ``multiLine`` option and each
    top-level field is mapped to its ``simpleString()`` type name. Every
    ``field -> expected type`` pair in ``truth_dict`` is then checked; an
    expected type of ``"str"`` is compared as ``"string"``.

    Args:
        valid_path: Path (or paths) handed to ``spark.read.json``.
        truth_dict: Mapping of field name to expected type name.

    Returns:
        A list of dicts with keys ``"field"``, ``"expected"`` (the value as
        given in ``truth_dict``) and ``"actual"`` (the inferred type name, or
        ``"MISSING"`` when the field is absent). Empty when everything matches.
    """
    inferred = spark.read.option("multiLine", "true").json(valid_path)

    # Top-level field name -> Spark simple type string
    type_by_field = {}
    for column in inferred.schema:
        type_by_field[column.name] = column.dataType.simpleString()

    problems = []
    for name, declared in truth_dict.items():
        # The truth mapping may say "str"; Spark's name for that type is "string"
        if declared == "str":
            wanted = "string"
        else:
            wanted = declared

        if name not in type_by_field:
            observed = "MISSING"
        else:
            observed = type_by_field[name]
            if observed == wanted:
                continue

        problems.append({"field": name, "expected": declared, "actual": observed})

    return problems

In [0]:
# Mismatches between the PaymentPendingTypes truth mapping and the JSON under valid_json_path
results = compare_truth_to_valid_json(
    valid_path=valid_json_path,
    truth_dict=PaymentPendingTypes,
)

In [0]:
import os
import pandas as pd
from datetime import datetime

# Output folder: <current working directory>/results/data type tests
current_dir = os.getcwd()
output_dir = os.path.join(current_dir, "results", "data type tests")
os.makedirs(output_dir, exist_ok=True)

# One file per run, named with the current local timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"schema_audit_results_{timestamp}.csv"
output_path = os.path.join(output_dir, filename)

# An empty result list means nothing mismatched; write a single status row instead
df = pd.DataFrame(results) if results else pd.DataFrame({"Status": ["All Passed"]})

# Persist the outcome as CSV
df.to_csv(output_path, index=False)

print(f"Results saved to: {output_path}")
print("Audit results: mismatches found" if results else "Audit result: All Passed")