# Setup cells

In [0]:
# Install the Azure SDK packages this notebook imports in later cells
# (azure.monitor.query for Log Analytics queries, azure.identity for the
# service-principal credential). Versions are not pinned here.
%pip install azure-monitor-query
%pip install azure-identity
# Restart the Python process after the installs; every cell below must be
# (re)run after this point because interpreter state is reset.
dbutils.library.restartPython()

In [0]:
# Read the environment configuration (a multi-line JSON document on DBFS) and
# use its first row to derive the secret scope for this landing zone.
config_path = "dbfs:/configs/config.json"
config = spark.read.option("multiline", "true").json(config_path)
first_row = config.first()

# Both identifiers are trimmed and lower-cased before being used in names.
env, lz_key = (
    first_row[field].strip().lower()
    for field in ("env", "lz_key")
)
keyvault_name = f"ingest{lz_key}-meta002-{env}"

# Service-principal credentials, fetched from the secret scope in the same
# order as before: client secret, tenant id, client id.
client_secret, tenant_id, client_id = (
    dbutils.secrets.get(scope=keyvault_name, key=secret_key)
    for secret_key in (
        'SERVICE-PRINCIPLE-CLIENT-SECRET',
        'SERVICE-PRINCIPLE-TENANT-ID',
        'SERVICE-PRINCIPLE-CLIENT-ID',
    )
)

In [0]:
from azure.identity import ClientSecretCredential
from azure.monitor.query import LogsQueryClient, LogsQueryStatus
from datetime import datetime, timedelta

# Run parameters shared by the cells below.
#
# `timezone` is imported here because the import line above only brings in
# `datetime` and `timedelta`.
from datetime import timezone

# Value looked for in the logged payload's 'ariaDesiredState' field.
desiredState = "pendingPayment"

# Query window for the Log Analytics calls: the two days up to "now".
# datetime.utcnow() is deprecated (Python 3.12+) and returns a naive value;
# a timezone-aware UTC timestamp is unambiguous when serialised for the API.
end_time = datetime.now(timezone.utc)
start_time = end_time - timedelta(days=2)

# Folder name used when building the audit/gold storage paths.
# NOTE(review): this is spelled differently from desiredState
# ("paymentPending" vs "pendingPayment") - presumably intentional because one
# is a payload value and the other a folder name; confirm.
AppealState = "paymentPending"

In [0]:
from azure.identity import ClientSecretCredential
from azure.monitor.query import LogsQueryClient, LogsQueryStatus
from datetime import datetime, timedelta

# Authenticate as the service principal and open a Log Analytics query client.
credential = ClientSecretCredential(
    tenant_id=tenant_id,
    client_id=client_id,
    client_secret=client_secret
)

client = LogsQueryClient(credential)
workspace_id = "9db45d41-bfe4-49dd-9a3b-5da0ab1a95d0"

# First, let's see which tables have data in the last day.
# The KQL filter (ago(1d)) is narrower than the (start_time, end_time)
# timespan passed to the API, so one day is the effective window.
query = """
search *
| where TimeGenerated > ago(1d)
| summarize count() by $table
| sort by count_ desc
"""

response = client.query_workspace(
    workspace_id,
    query,
    timespan=(start_time, end_time)
)

if response.status == LogsQueryStatus.SUCCESS:
    # Header now matches the window the query actually filters on.
    print("Tables with data in the last day:\n")
    for table in response.tables:
        for row in table.rows:
            print(f"Table: {row[0]}, Count: {row[1]}")
else:
    # Report a non-success status instead of printing nothing, consistent
    # with the other query cells in this notebook.
    print(f"Query failed: {response.status}")

# Gathering data from the logs

In [0]:
from azure.identity import ClientSecretCredential
from azure.monitor.query import LogsQueryClient, LogsQueryStatus
from datetime import datetime, timedelta

# Service-principal credential and query client for the Log Analytics workspace.
credential = ClientSecretCredential(
    tenant_id=tenant_id, client_id=client_id, client_secret=client_secret
)
client = LogsQueryClient(credential)
workspace_id = "9db45d41-bfe4-49dd-9a3b-5da0ab1a95d0"

# Pull up to 100 of the most recent AppTraces rows whose message mentions
# "Validate posting payload".
query = """
AppTraces
| where TimeGenerated > ago(1d)
| where Message contains "Validate posting payload"
| project TimeGenerated, Message
| order by TimeGenerated desc
| take 100
"""

response = client.query_workspace(workspace_id, query, timespan=(start_time, end_time))

if response.status != LogsQueryStatus.SUCCESS:
    print(f"Query failed: {response.status}")
else:
    for table in response.tables:
        print(f"Found {len(table.rows)} matching messages\n")

        # Only the first ten rows are echoed; the count above covers them all.
        preview_rows = table.rows[:10]
        for i, row in enumerate(preview_rows):
            timestamp, message = row[0], row[1]
            print(f"\nMessage {i+1}")
            print(f"Time: {timestamp}")
            print(f"Message: {message}")

In [0]:
from azure.identity import ClientSecretCredential
from azure.monitor.query import LogsQueryClient, LogsQueryStatus
from datetime import datetime, timedelta
import pandas as pd
import re

# Authenticate as the service principal and open a Log Analytics query client.
credential = ClientSecretCredential(
    tenant_id=tenant_id,
    client_id=client_id,
    client_secret=client_secret
)

client = LogsQueryClient(credential)
workspace_id = "9db45d41-bfe4-49dd-9a3b-5da0ab1a95d0"

# Up to 1400 of the most recent "Validate posting payload" trace messages.
query_validate = """
AppTraces
| where TimeGenerated > ago(1d)
| where Message contains "Validate posting payload"
| project TimeGenerated, Message
| order by TimeGenerated desc
| take 1400
"""

response = client.query_workspace(
    workspace_id,
    query_validate,
    timespan=(start_time, end_time)
)

# Start from an empty frame that already carries the two projected columns.
# This keeps df_validate defined when the result has no tables, and lets the
# next cell index df_validate['Message'] without a KeyError on failure.
df_validate = pd.DataFrame(columns=["TimeGenerated", "Message"])

if response.status == LogsQueryStatus.SUCCESS:
    for table in response.tables:
        # Convert to DataFrame (if several tables come back, the last one wins)
        df_validate = pd.DataFrame(
            data=table.rows,
            columns=table.columns
        )
else:
    print(f"Query failed: {response.status}")

# Use the notebook's display() function, as the final cell does for its
# pandas frames, rather than calling .display() on the pandas DataFrame.
display(df_validate)

In [0]:
# Extract the payload from each log message: keep the text after "json = "
# and before "'event_token'". Rows without the "json = " marker end up NaN.
df_validate['payload_validation'] = (
    df_validate['Message']
    .str.split("json = ", n=1).str[1]
    .str.split("'event_token'", n=1).str[0]
)

# Keep only rows whose payload carries the desired ARIA state.
# regex=False: the needle is a literal, so characters in desiredState can
# never be interpreted as regex syntax.
# NOTE(review): the needle ends with a space after the closing quote; if the
# payload is a Python-dict repr, a ',' or '}' would normally follow the value
# and nothing would match - confirm against a real log message.
df_validate = df_validate[
    df_validate['payload_validation'].str.contains(
        f"'ariaDesiredState': '{desiredState}' ", na=False, regex=False
    )
].copy()

# Extract CaseNo (e.g. XX/00000/0000) from the full message text.
df_validate['CaseNo'] = df_validate['Message'].str.extract(r"'appealReferenceNumber':\s*'([A-Z]+/\d+/\d+)'")

# Rename the log timestamp so it is distinguishable downstream.
df_validate = df_validate.rename(columns={'TimeGenerated': 'payload_validation_time'})

# DISTINCT LOGIC: keep the latest record for each CaseNo - sort by time
# descending, then drop duplicates on CaseNo (first occurrence is kept).
df_validate = df_validate.sort_values('payload_validation_time', ascending=False)
df_validate = df_validate.drop_duplicates(subset=['CaseNo'])

# Final selection. This cell overwrites df_validate and drops 'Message', so
# re-running it requires re-running the query cell above first.
df_validate = df_validate[['CaseNo', 'payload_validation', 'payload_validation_time']]

# Use the notebook's display() function, as the final cell does for its
# pandas frames, rather than calling .display() on the pandas DataFrame.
display(df_validate)

# Gathering gold layer data

In [0]:
# Read the environment config once; the original triggered two separate
# .first() actions on the same DataFrame.
config = spark.read.option("multiline", "true").json("dbfs:/configs/config.json")
config_row = config.first()
env_name = config_row["env"].strip().lower()
lz_key = config_row["lz_key"].strip().lower()

print(f"env_code: {lz_key}")  # This won't be redacted
print(f"env_name: {env_name}")  # This won't be redacted

KeyVault_name = f"ingest{lz_key}-meta002-{env_name}"
print(f"KeyVault_name: {KeyVault_name}")

# Service principal credentials
client_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-ID")
client_secret = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-SECRET")
tenant_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-TENANT-ID")

# Storage account names
curated_storage = f"ingest{lz_key}curated{env_name}"
checkpoint_storage = f"ingest{lz_key}xcutting{env_name}"
raw_storage = f"ingest{lz_key}raw{env_name}"
landing_storage = f"ingest{lz_key}landing{env_name}"
external_storage = f"ingest{lz_key}external{env_name}"


def _configure_oauth(storage_account):
    """Set the five OAuth client-credential Spark settings for one storage account.

    Args:
        storage_account: Storage account name (without the
            ``.dfs.core.windows.net`` suffix).

    Uses the notebook-level ``client_id``, ``client_secret`` and ``tenant_id``
    fetched above, and sets the same keys in the same order as the original
    hand-written blocks did.
    """
    host = f"{storage_account}.dfs.core.windows.net"
    spark.conf.set(f"fs.azure.account.auth.type.{host}", "OAuth")
    spark.conf.set(f"fs.azure.account.oauth.provider.type.{host}", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
    spark.conf.set(f"fs.azure.account.oauth2.client.id.{host}", client_id)
    spark.conf.set(f"fs.azure.account.oauth2.client.secret.{host}", client_secret)
    spark.conf.set(f"fs.azure.account.oauth2.client.endpoint.{host}", f"https://login.microsoftonline.com/{tenant_id}/oauth2/token")


# Spark config for every storage account, in the original order:
# curated, checkpoint (xcutting), raw, landing, external.
for storage_account in (
    curated_storage,
    checkpoint_storage,
    raw_storage,
    landing_storage,
    external_storage,
):
    _configure_oauth(storage_account)

# Setting variables for use in subsequent cells.
# AppealState comes from the setup cell near the top of the notebook.
bronze_path = f"abfss://bronze@ingest{lz_key}curated{env_name}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/"
silver_path = f"abfss://silver@ingest{lz_key}curated{env_name}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/"
audit_path = f"abfss://silver@ingest{lz_key}curated{env_name}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/AUDIT/{AppealState}"
gold_path = f"abfss://gold@ingest{lz_key}curated{env_name}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/{AppealState}"

# Show the derived values for a quick visual check
variables = {
    "bronze_path": bronze_path,
    "silver_path": silver_path,
    "audit_path": audit_path,
    "gold_path": gold_path,
    "key_vault": KeyVault_name,
    "AppealState": AppealState
}

display(variables)

In [0]:
# Pick the last entry listed under the gold path and list its JSON subfolder.
# NOTE(review): presumably the last entry is the most recent run folder; this
# relies on the order dbutils.fs.ls returns - confirm.
gold_entries = dbutils.fs.ls(gold_path)
files = gold_entries[-1].path
valid_json = f"{files}/JSON"

# `json` holds the file listing read by the next cell.
json = dbutils.fs.ls(valid_json)

In [0]:
from pyspark.sql.functions import input_file_name, regexp_extract, regexp_replace, col

# Paths of every file listed in the gold JSON folder.
path_list = [entry.path for entry in json]

# Load all JSON documents in one read, tagging each row with its source file.
raw_df = (
    spark.read
    .option("multiLine", "true")
    .json(path_list)
    .withColumn("raw_filename", input_file_name())
)

# Derive the reference number from the file name: capture the text between
# "APPEALS_" and ".json" (e.g. 'EA_00366_2025'), then turn underscores into
# slashes.
reference_token = regexp_extract(col("raw_filename"), r"APPEALS_(.*)\.json", 1)
reference_number = regexp_replace(reference_token, "_", "/")

final_df = (
    raw_df
    .withColumn("appealReferenceNumber", reference_number)
    .drop("raw_filename")
)

# Show the reference number first, followed by every column.
final_df.select("appealReferenceNumber", "*").display()

# Test

In [0]:
import ast

import pandas as pd

# Compare every gold-layer record against the payload logged for the same
# case number, field by field.

# Gold rows indexed by case number.
gold_cases = {row['appealReferenceNumber']: row.asDict() for row in final_df.collect()}
log_pdf = df_validate[['payload_validation']]

mismatches = []
matches = []
skipped_payloads = 0      # payloads that could not be parsed into a dict
compared_cases = set()    # case numbers found in both the logs and gold

# Loop through all the log data and check it against the gold
for _, row in log_pdf.iterrows():
    # Convert the logged string into a dictionary. Only the errors
    # ast.literal_eval is documented to raise are caught, and skipped rows are
    # counted so an unparsable payload cannot silently turn into a "pass".
    try:
        log_data_full = ast.literal_eval(row['payload_validation'])
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        skipped_payloads += 1
        continue

    log_data = log_data_full.get('data', {}) if isinstance(log_data_full, dict) else None
    if not isinstance(log_data, dict):
        skipped_payloads += 1
        continue

    case_no = log_data.get('appealReferenceNumber')

    # Only cases present in gold can be compared
    if case_no not in gold_cases:
        continue

    compared_cases.add(case_no)
    gold_row = gold_cases[case_no]

    # Check every field that exists in gold
    for field, gold_value in gold_row.items():
        log_value = log_data.get(field)
        record = {
            "CaseNo": case_no,
            "Field": field,
            "Gold_Value": gold_value,
            "Log_Value": log_value
        }
        # Compare as strings so numbers/dates with different types still match
        if str(gold_value) != str(log_value):
            mismatches.append(record)
        else:
            matches.append(record)

# Build both result frames up front: the original only created pass_df inside
# the failure branch, so the pass branch raised NameError.
fails_df = pd.DataFrame(mismatches)
pass_df = pd.DataFrame(matches)

if skipped_payloads:
    print(f"Skipped {skipped_payloads} log payload(s) that could not be parsed.")

# Display results
if mismatches:
    print(f"Test failed - found {len(mismatches)} mismatches across the cases sent for validation.")
    display(fails_df)
elif not compared_cases:
    # Nothing was compared, so claiming a pass would be misleading.
    print("Test inconclusive - no log payload matched a case number in the gold layer.")
else:
    print("Test pass - all matching case numbers have identical data.")
    display(pass_df)