In [0]:
# ✅ Utility Notebook: Post-DLT Run Summary Queries

from pyspark.sql import SparkSession

# Reuse the Spark session this notebook is already attached to.
spark = SparkSession.getActiveSession()


def run_summary(banner, query):
    """Print ``banner``, run ``query`` via Spark SQL, show the result untruncated.

    Returns the resulting DataFrame so the caller can keep a handle on it.
    """
    print(banner)
    result = spark.sql(query)
    result.show(truncate=False)
    return result


# 1️⃣ Total component records per aircraft and component type
component_summary = run_summary(
    "\n🔍 Total component records per aircraft and component type",
    """
SELECT 
    aircraft_id, 
    component_type, 
    COUNT(*) AS record_count
FROM arao.aerodemo.digital_twin_component_view
GROUP BY aircraft_id, component_type
ORDER BY aircraft_id, component_type
""",
)

# 2️⃣ Alert counts per aircraft and health status
alert_summary = run_summary(
    "\n🔍 Alert counts per aircraft and health status",
    """
SELECT 
    aircraft_id, 
    health_status, 
    COUNT(*) AS alert_count
FROM arao.aerodemo.anomaly_alerts_component
GROUP BY aircraft_id, health_status
ORDER BY aircraft_id, health_status
""",
)

# 3️⃣ Health status over time (daily counts)
# The GROUP BY refers to the alert_date alias defined in the SELECT list.
health_over_time = run_summary(
    "\n🔍 Health status over time (daily counts)",
    """
SELECT 
    TO_DATE(alert_timestamp) AS alert_date,
    aircraft_id,
    health_status,
    COUNT(*) AS count
FROM arao.aerodemo.anomaly_alerts_component
GROUP BY alert_date, aircraft_id, health_status
ORDER BY alert_date, aircraft_id, health_status
""",
)

print("\n✅ All utility notebook summary queries completed successfully.")

In [0]:
%sql
-- Count the rows in the digital twin component view whose event_timestamp is NULL,
-- broken down by component_type.
-- Because the WHERE clause filters before grouping, component types with no NULL
-- timestamps do not appear in the result at all (rather than showing a 0).
SELECT component_type, COUNT(*) AS null_event_timestamps
FROM arao.aerodemo.digital_twin_component_view
WHERE event_timestamp IS NULL
GROUP BY component_type

In [0]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from inspect import getsource

spark = SparkSession.getActiveSession()

# ---------- STEP 1: Check raw CSVs ----------
# For every raw component feed: list its columns, preview a few rows, and sample
# the distinct event_timestamp values so malformed or missing timestamps are visible.
print("\n🔍 Checking raw CSV files")

# Component name -> directory holding that component's raw CSV files.
sources = {
    "airframe": "/Volumes/arao/aerodemo/tmp/airframe",
    "landing_gear": "/Volumes/arao/aerodemo/tmp/landing_gear",
    "avionics": "/Volumes/arao/aerodemo/tmp/avionics",
    "cabin": "/Volumes/arao/aerodemo/tmp/cabin",
    "engine": "/Volumes/arao/aerodemo/tmp/engine"
}

for name, path in sources.items():
    print(f"\n--- {name.upper()} ---")
    # Only the header option is set, so the first line supplies the column names;
    # no schema inference is requested.
    df = spark.read.format("csv").option("header", "true").load(path)
    print(f"Columns: {df.columns}")
    df.show(3, truncate=False)

    # A feed without an event_timestamp column is precisely the kind of problem this
    # check exists to surface: report it and keep going instead of letting the
    # select() raise and abort the remaining sources. The comparison is
    # case-insensitive so headers that differ only in case are still selected.
    has_event_timestamp = any(col.lower() == "event_timestamp" for col in df.columns)
    if has_event_timestamp:
        df.select("event_timestamp").distinct().show(5, truncate=False)
    else:
        print(f"⚠️ Column event_timestamp not found in {name} CSV. Skipping distinct-value check.")

# ---------- STEP 2: Check if .withColumn('event_timestamp') is applied ----------
# Print the source of each DLT twin reader function that happens to be defined in
# this notebook's namespace, so the event_timestamp handling can be read by eye.
print("\n🔍 Checking DLT twin reader functions for event_timestamp parsing")

dlt_functions = ["twin_airframe", "twin_landing_gear", "twin_avionics", "twin_cabin_pressurization", "twin_engine"]


def show_function_source(func_name, namespace):
    """Print the source code of ``namespace[func_name]``.

    Args:
        func_name: Name to look up in ``namespace``.
        namespace: Mapping of names to objects (e.g. ``globals()``).

    Returns:
        True if the source was printed; False if the name is missing or its
        source could not be retrieved. A warning is printed in both failure cases.
    """
    try:
        func = namespace[func_name]
    except KeyError:
        print(f"⚠️ Function {func_name} not defined in this notebook context. Skipping.")
        return False

    print(f"\n--- Checking function: {func_name} ---")
    try:
        print(getsource(func))
    except (OSError, TypeError) as exc:
        # getsource raises TypeError for objects it cannot inspect and OSError when
        # the source text is unavailable; neither should abort the remaining checks.
        print(f"⚠️ Could not retrieve source for {func_name}: {exc}")
        return False
    return True


for func_name in dlt_functions:
    show_function_source(func_name, globals())

# ---------- STEP 3: Check downstream component_health tables ----------
# For each component_health table: sample the distinct event_timestamp values
# and count how many rows have a null event_timestamp.
print("\n🔍 Checking component_health tables for null event_timestamps")

tables = [
    "component_health_airframe",
    "component_health_landing_gear",
    "component_health_avionics",
    "component_health_cabin_pressurization",
    "component_health_engine"
]

for table in tables:
    print(f"\n--- {table} ---")
    df = spark.read.table(f"arao.aerodemo.{table}")

    # Peek at up to five distinct timestamp values.
    distinct_timestamps = df.select("event_timestamp").distinct()
    distinct_timestamps.show(5, truncate=False)

    # Count the rows where the timestamp is missing.
    rows_missing_timestamp = df.where(F.col("event_timestamp").isNull())
    null_count = rows_missing_timestamp.count()
    print(f"❗ Null event_timestamp count: {null_count}")

### 🛠 DLT Pipeline Update Polling Script

This cell uses the Databricks REST API to:

✅ Retrieve the status of a specific DLT pipeline update  
✅ Print details like:
- Update ID
- State (e.g., RUNNING, COMPLETED, FAILED)
- Cause
- Cluster ID
- Start / end timestamps
- Full JSON response

✅ **Polling:** Re-check every `poll_interval_sec` seconds until the update reaches a terminal state (COMPLETED, FAILED, or CANCELED)

---

#### 📋 Required Setup:
- Set `DATABRICKS_INSTANCE` to your workspace URL
- Set `TOKEN` to a Databricks PAT (Personal Access Token)  
- Set `DLT_PIPELINE_ID` and `UPDATE_ID` to the pipeline + update you want to track

---

#### 💡 Notes:
- You can adjust the `poll_interval_sec` to control how often it checks the status.
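- The cell keeps polling until a terminal state is reached or a request fails. To check the status only once, interrupt the cell after the first status block is printed.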

In [0]:
import requests
import json
import time
from datetime import datetime

# ---------- CONFIG ----------
DATABRICKS_INSTANCE = "https://e2-demo-field-eng.cloud.databricks.com"
# Placeholder only: do not commit a real token in this cell.
# NOTE(review): prefer loading the token from a secret scope or environment variable.
TOKEN = "YOUR_PERSONAL_ACCESS_TOKEN"
DLT_PIPELINE_ID = "a2ccd850-4b28-4f30-9a53-0fd5f5499713"
UPDATE_ID = "2f648154-c61b-4cce-b4da-18115dca1064"
poll_interval_sec = 10  # seconds between checks
request_timeout_sec = 30  # max seconds to wait on a single HTTP request
# ----------------------------

# States after which the update will not change any more, so polling stops.
TERMINAL_STATES = ("COMPLETED", "FAILED", "CANCELED")

headers = {
    "Authorization": f"Bearer {TOKEN}",
    "Content-Type": "application/json"
}

get_update_url = f"{DATABRICKS_INSTANCE}/api/2.0/pipelines/{DLT_PIPELINE_ID}/updates/{UPDATE_ID}"

# Poll the update endpoint, printing a status block each round, until the update
# reaches a terminal state or a request fails.
while True:
    try:
        # Without a timeout a stalled connection would block this cell forever.
        response = requests.get(get_update_url, headers=headers, timeout=request_timeout_sec)
    except requests.exceptions.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a non-200 reply.
        print(f"❌ Failed to fetch DLT update: {exc}")
        break

    if response.status_code != 200:
        print(f"❌ Failed to fetch DLT update: {response.text}")
        break

    result = response.json()
    # Fall back to an empty dict if the "update" key is missing or null.
    update = result.get("update") or {}
    update_id = update.get("update_id")
    state = update.get("state")
    cause = update.get("cause")
    created_at = update.get('creation_time')
    # creation_time is divided by 1000 (treated as epoch milliseconds) and rendered in local time.
    readable_time = datetime.fromtimestamp(created_at / 1000).strftime('%Y-%m-%d %H:%M:%S') if created_at else "N/A"

    print(f"✅ Update ID: {update_id}")
    print(f" - State: {state}")
    print(f" - Cause: {cause}")
    print(f" - Created at: {readable_time}")
    print(f" - Cluster ID: {update.get('cluster_id')}")
    print(f" - Start time: {update.get('start_time')}")
    print(f" - End time: {update.get('end_time')}")
    print(f" - Full JSON details:\n{json.dumps(update, indent=2)}")

    if state in TERMINAL_STATES:
        print(f"⚠️ Final state reached: {state}")
        break

    time.sleep(poll_interval_sec)

### 🔍 Feature Store Table Check Cell

This Python **cell** uses the Databricks Feature Store client to:

✅ **List and check the registered feature tables**  
✅ **Print their primary keys** (as recorded in the Feature Store metadata)  
✅ **Fetch and display the full Spark schema** (directly from the underlying Delta table)

---

### 💡 Why Do We Run This Cell?

- **Validate primary key setup**: Ensures the table has primary keys correctly registered for Feature Store usage.  
- **Confirm schema consistency**: Helps cross-check the schema between what was written in Delta and what is expected in downstream ML workflows.  
- **Detect registration or schema issues early**: Quickly flags any misalignments between Feature Store registration and actual table data.

---

### 🔗 What Tables Are Checked?

The cell loops through these feature tables:
- `arao.aerodemo.component_features_engine_table`
- `arao.aerodemo.component_features_landing_gear_table`
- `arao.aerodemo.component_features_airframe_table`
- `arao.aerodemo.component_features_avionics_table`
- `arao.aerodemo.component_features_cabin_pressurization_table`
- `arao.aerodemo.sensor_features_table`

These represent the **final, production-ready feature tables** after  
the DLT materialization + Delta rewrite + Feature Store registration pipeline.

---

✅ **Tip:**  
This check cell is useful to run after major pipeline updates or before handing off tables to ML engineering teams to ensure all metadata is solid and ready!

In [0]:
from databricks.feature_store import FeatureStoreClient

# Client used to look up Feature Store metadata for each table.
fs = FeatureStoreClient()

# Fully qualified names of the feature tables to inspect.
feature_tables = [
    "arao.aerodemo.component_features_engine_table",
    "arao.aerodemo.component_features_landing_gear_table",
    "arao.aerodemo.component_features_airframe_table",
    "arao.aerodemo.component_features_avionics_table",
    "arao.aerodemo.component_features_cabin_pressurization_table",
    "arao.aerodemo.sensor_features_table"
]


def describe_feature_table(table):
    """Print the registered primary keys and the Spark schema of ``table``.

    Errors from the Feature Store lookup or from Spark are not handled here;
    they propagate to the caller.
    """
    feature_table = fs.get_table(table)
    print(f"\n✅ Table: {table}")
    print("Primary Keys:", feature_table.primary_keys)

    # The schema is read from the table through Spark, not from Feature Store metadata.
    spark_schema = spark.table(table).schema
    print("Schema:")
    for field in spark_schema.fields:
        print(f"  - {field.name}: {field.dataType.simpleString()}")


# Report on every table; a failure on one table is printed and the loop moves on.
for table in feature_tables:
    try:
        describe_feature_table(table)
    except Exception as e:
        print(f"\n❌ Failed to retrieve {table}: {e}")