In [0]:
# ✅ Utility Notebook: Post-DLT Run Summary Queries

from pyspark.sql import SparkSession

# Get the active Spark session.
# getActiveSession() returns None when no session is active on the current
# thread; fall back to the builder so `spark` is never bound to None (which
# would surface later as an opaque AttributeError on spark.sql).
spark = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate()

# 1️⃣ Total component records per aircraft and component type
# Row counts from the digital twin component view, one row per
# (aircraft_id, component_type) pair, sorted by the same two keys.
_component_summary_sql = """
SELECT 
    aircraft_id, 
    component_type, 
    COUNT(*) AS record_count
FROM arao.aerodemo.digital_twin_component_view
GROUP BY aircraft_id, component_type
ORDER BY aircraft_id, component_type
"""

print("\n🔍 Total component records per aircraft and component type")
component_summary = spark.sql(_component_summary_sql)
component_summary.show(truncate=False)

# 2️⃣ Alert counts per aircraft and health status
# Row counts from the component anomaly alerts table, one row per
# (aircraft_id, health_status) pair, sorted by the same two keys.
_alert_summary_sql = """
SELECT 
    aircraft_id, 
    health_status, 
    COUNT(*) AS alert_count
FROM arao.aerodemo.anomaly_alerts_component
GROUP BY aircraft_id, health_status
ORDER BY aircraft_id, health_status
"""

print("\n🔍 Alert counts per aircraft and health status")
alert_summary = spark.sql(_alert_summary_sql)
alert_summary.show(truncate=False)

# 3️⃣ Health status over time (daily counts)
# Alerts bucketed by the calendar date of alert_timestamp, then by aircraft
# and health status; sorted chronologically.
_health_over_time_sql = """
SELECT 
    TO_DATE(alert_timestamp) AS alert_date,
    aircraft_id,
    health_status,
    COUNT(*) AS count
FROM arao.aerodemo.anomaly_alerts_component
GROUP BY alert_date, aircraft_id, health_status
ORDER BY alert_date, aircraft_id, health_status
"""

print("\n🔍 Health status over time (daily counts)")
health_over_time = spark.sql(_health_over_time_sql)
health_over_time.show(truncate=False)

# Only reached when every statement above in this cell ran without raising.
print("\n✅ All utility notebook summary queries completed successfully.")

In [0]:
%sql
-- Count rows in the digital twin component view whose event_timestamp is NULL,
-- broken down by component type.
SELECT
    component_type,
    COUNT(*) AS null_event_timestamps
FROM
    arao.aerodemo.digital_twin_component_view
WHERE
    event_timestamp IS NULL
GROUP BY
    component_type

In [0]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from inspect import getsource

# getActiveSession() returns None when no session is active on the current
# thread; fall back to the builder so `spark` is never bound to None.
spark = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate()

# ---------- STEP 1: Check raw CSVs ----------
# For each component's raw CSV directory: print the header columns, a
# three-row sample, and up to five distinct event_timestamp values.
print("\n🔍 Checking raw CSV files")

sources = {
    "airframe": "/Volumes/arao/aerodemo/tmp/airframe",
    "landing_gear": "/Volumes/arao/aerodemo/tmp/landing_gear",
    "avionics": "/Volumes/arao/aerodemo/tmp/avionics",
    "cabin": "/Volumes/arao/aerodemo/tmp/cabin",
    "engine": "/Volumes/arao/aerodemo/tmp/engine"
}

for name, path in sources.items():
    print(f"\n--- {name.upper()} ---")
    # Only the header option is set, so the values shown are whatever the
    # CSV reader produces with its remaining defaults.
    raw_df = spark.read.format("csv").option("header", "true").load(path)
    print(f"Columns: {raw_df.columns}")
    raw_df.show(3, truncate=False)

    # A missing event_timestamp column is itself a finding for this check.
    # Report it and move on rather than letting select() raise and abort the
    # remaining sources. The comparison is case-insensitive to mirror Spark's
    # default column-name resolution.
    has_event_timestamp = any(
        column.lower() == "event_timestamp" for column in raw_df.columns
    )
    if not has_event_timestamp:
        print(f"⚠️ No event_timestamp column found in {path}. Skipping distinct-value check.")
        continue

    raw_df.select("event_timestamp").distinct().show(5, truncate=False)

# ---------- STEP 2: Check if .withColumn('event_timestamp') is applied ----------
# Print the source of each DLT twin reader function that is defined in this
# notebook's namespace so its event_timestamp handling can be inspected by eye.
print("\n🔍 Checking DLT twin reader functions for event_timestamp parsing")

dlt_functions = ["twin_airframe", "twin_landing_gear", "twin_avionics", "twin_cabin_pressurization", "twin_engine"]


def show_function_source(func_name, namespace):
    """Print the source code of ``namespace[func_name]``.

    Args:
        func_name: Name to look up in ``namespace``.
        namespace: Mapping of names to objects (e.g. ``globals()``).

    Returns:
        True if the source was printed; False if the name is absent or its
        source could not be retrieved. A message is printed in both failure
        cases; no exception is raised for them.
    """
    if func_name not in namespace:
        print(f"⚠️ Function {func_name} not defined in this notebook context. Skipping.")
        return False

    print(f"\n--- Checking function: {func_name} ---")
    try:
        print(getsource(namespace[func_name]))
    except (OSError, TypeError) as exc:
        # getsource raises TypeError for builtins / non-code objects and
        # OSError when the source text cannot be located; neither should
        # stop the remaining checks.
        print(f"⚠️ Could not retrieve source for {func_name}: {exc}")
        return False
    return True


for func_name in dlt_functions:
    show_function_source(func_name, globals())

# ---------- STEP 3: Check downstream component_health tables ----------
# For each component_health table: show up to five distinct event_timestamp
# values, then report how many rows have a NULL event_timestamp.
print("\n🔍 Checking component_health tables for null event_timestamps")

tables = [
    "component_health_airframe",
    "component_health_landing_gear",
    "component_health_avionics",
    "component_health_cabin_pressurization",
    "component_health_engine"
]

# Column expression shared by the distinct-value preview and the null filter.
event_ts = F.col("event_timestamp")

for table_name in tables:
    print(f"\n--- {table_name} ---")
    df = spark.read.table(f"arao.aerodemo.{table_name}")

    distinct_timestamps = df.select(event_ts).distinct()
    distinct_timestamps.show(5, truncate=False)

    null_rows = df.filter(event_ts.isNull())
    null_count = null_rows.count()
    print(f"❗ Null event_timestamp count: {null_count}")