In [0]:
import os

# Show what currently sits in the raw volume folder.
volume_path = "/Volumes/arao/aerodemo/tmp/raw"
raw_listing = dbutils.fs.ls(volume_path)
display(raw_listing)

In [0]:
# Recursively delete the schema and checkpoints sub-folders of the raw volume.
for state_dir in ("schema", "checkpoints"):
    dbutils.fs.rm(f"/Volumes/arao/aerodemo/tmp/raw/{state_dir}", recurse=True)

In [0]:
import shutil
import os

# Volume folders that get wiped
raw_path = "/Volumes/arao/aerodemo/tmp/raw"
maint_path = "/Volumes/arao/aerodemo/tmp/maintenance"

# Recursive delete of each folder itself, so everything underneath goes too
# (second positional argument True = recurse).
for folder in (raw_path, maint_path):
    target = f"{folder}/"
    dbutils.fs.rm(target, True)

In [0]:
raw_path = "/Volumes/arao/aerodemo/tmp/raw"

# Only CSV files are candidates for "latest": the folder can also contain
# sub-folders (other cells delete raw/schema and raw/checkpoints), and sorting
# every entry by modification time could pick one of those instead of a file.
csv_files = [f for f in dbutils.fs.ls(raw_path) if f.name.endswith(".csv")]
latest_file = sorted(csv_files, key=lambda f: f.modificationTime, reverse=True)

# Guard against an empty folder instead of failing with IndexError on [0].
if latest_file:
    display(spark.read.format("csv").option("header", True).load(latest_file[0].path))
else:
    print(f"No CSV files found in {raw_path}")

In [0]:
raw_path = "/Volumes/arao/aerodemo/tmp/raw"
files = dbutils.fs.ls(raw_path)

# Print the full path of every entry in the raw folder, one per line.
for entry in files:
    print(entry.path)

In [0]:
from pyspark.sql.functions import col
# Explicit imports instead of `import *` so it is clear where each type name
# comes from and nothing else is pulled into the notebook namespace.
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

# Expected layout of a raw sensor CSV; every field is declared nullable.
sensor_schema = StructType([
    StructField("timestamp", TimestampType(), True),
    StructField("aircraft_id", StringType(), True),
    StructField("model", StringType(), True),
    StructField("engine_temp", DoubleType(), True),
    StructField("fuel_efficiency", DoubleType(), True),
    StructField("vibration", DoubleType(), True),
    StructField("altitude", DoubleType(), True),
    StructField("airspeed", DoubleType(), True),
    StructField("anomaly_score", DoubleType(), True),
    StructField("oil_pressure", DoubleType(), True),
    StructField("engine_rpm", IntegerType(), True),
    StructField("battery_voltage", DoubleType(), True)
])

# Read one specific CSV file with the explicit schema (no inference).
df = spark.read \
    .format("csv") \
    .option("header", "true") \
    .schema(sensor_schema) \
    .load("dbfs:/Volumes/arao/aerodemo/tmp/raw/raw_sensor_data_20250516_220905.csv")

df.display()

In [0]:
# Recursively delete the schema and checkpoints folders under both the raw
# and the maintenance volume folders.
for dataset in ("raw", "maintenance"):
    for state_dir in ("schema", "checkpoints"):
        dbutils.fs.rm(f"dbfs:/Volumes/arao/aerodemo/tmp/{dataset}/{state_dir}", True)

In [0]:
# List the raw volume folder; as the cell's last expression the result is rendered.
dbutils.fs.ls("dbfs:/Volumes/arao/aerodemo/tmp/raw/")

In [0]:
# Destructive: recursively deletes the whole raw folder, data files included.
dbutils.fs.rm("dbfs:/Volumes/arao/aerodemo/tmp/raw", recurse=True)

In [0]:
# Print the column names and types of the sensor_features table.
spark.read.table("arao.aerodemo.sensor_features").printSchema()

In [0]:
# Drop the table if present, then write the current `df` back under the same
# name as a Delta table.
features_table = "arao.aerodemo.sensor_features_table"
spark.sql(f"DROP TABLE IF EXISTS {features_table}")

delta_writer = df.write.format("delta").mode("overwrite")
delta_writer.saveAsTable(features_table)

In [0]:
%sql
-- Show columns plus extended table metadata for sensor_features.
DESCRIBE TABLE EXTENDED arao.aerodemo.sensor_features;

In [0]:
from datetime import datetime

from pyspark.sql.functions import col, current_timestamp, lit

# Source rows: the current contents of the predictions table.
predictions = spark.read.table("arao.aerodemo.anomaly_predictions")

# One batch identifier (wall-clock time when this cell runs) shared by all rows.
timestamp_str = datetime.now().strftime("%Y%m%d_%H%M%S")

# Nothing is written here; this only builds the DataFrame.
# prediction_date is re-exposed as a string column named "timestamp".
renamed = (
    predictions
    .withColumn("timestamp", col("prediction_date").cast("string"))
    .drop("prediction_date")
)
stamped = renamed.withColumn("alert_generated_at", current_timestamp())
predictions_with_meta = stamped.withColumn("batch_id", lit(timestamp_str))

In [0]:
# Drop the old table (⚠️ this removes existing alerts)
spark.sql("DROP TABLE IF EXISTS arao.aerodemo.anomaly_alerts")

# Recreate it empty (limit(0) keeps the schema only) with the full alert schema.
# The schema has to come from predictions_with_meta, not predictions: only the
# former carries timestamp, alert_generated_at and batch_id, which the SQL cell
# that rebuilds anomaly_alerts selects from this table.
(
    predictions_with_meta.limit(0)
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("arao.aerodemo.anomaly_alerts")
)

In [0]:
# List every table/view registered in the arao.aerodemo schema, untruncated.
spark.sql("SHOW TABLES IN arao.aerodemo").show(truncate=False)

In [0]:
%sql
-- Step 1: Create a temp view
-- Adds new_alert_date = today minus a random whole number of days; rand() is
-- in [0, 1), so CAST(rand() * 5 AS INT) yields 0..4.
CREATE OR REPLACE TEMP VIEW updated_alerts AS
SELECT *,
       date_sub(current_date(), CAST(rand() * 5 AS INT)) AS new_alert_date
FROM arao.aerodemo.anomaly_alerts;

-- Step 2: Overwrite the table (careful!)
-- The table is replaced by a query that reads it through the view above.
-- alert_generated_at is replaced by the randomised new_alert_date (a DATE);
-- all other listed columns are copied through unchanged.
CREATE OR REPLACE TABLE arao.aerodemo.anomaly_alerts AS
SELECT
  aircraft_id,
  timestamp,
  model,
  engine_temp,
  fuel_efficiency,
  vibration,
  altitude,
  airspeed,
  oil_pressure,
  engine_rpm,
  battery_voltage,
  event_type,
  avg_engine_temp_7d,
  avg_vibration_7d,
  avg_rpm_7d,
  prev_anomaly,
  days_since_maint,
  manufacturer,
  engine_type,
  capacity,
  range_km,
  predicted_anomaly,
  new_alert_date AS alert_generated_at,
  anomaly_score,
  batch_id
FROM updated_alerts;

In [0]:
# (aircraft_id, timestamp) pairs that occur more than once in sensor_features.
duplicate_keys = (
    spark.read.table("arao.aerodemo.sensor_features")
    .groupBy("aircraft_id", "timestamp")
    .count()
    .filter("count > 1")
    .orderBy("aircraft_id", "timestamp")
)
duplicate_keys.show()

In [0]:
# Print the row count of each pipeline table, one "name: count" line per table.
for table_name in (
    "raw_sensor_data",
    "cleaned_sensor_data",
    "maintenance_events",
    "enriched_sensor_data",
):
    print(f"{table_name}:", spark.table(f"arao.aerodemo.{table_name}").count())

In [0]:
dlt_df = spark.table("arao.aerodemo.enriched_sensor_data")

# Keys that appear more than once, most-duplicated first.
key_counts = dlt_df.groupBy("aircraft_id", "timestamp").count()
repeated_keys = key_counts.filter("count > 1").orderBy("count", ascending=False)
repeated_keys.show(truncate=False)

In [0]:
from pyspark.sql.functions import col

df = spark.table("arao.aerodemo.enriched_sensor_data")

# Number of rows where either part of the (aircraft_id, timestamp) key is NULL.
missing_key = col("aircraft_id").isNull() | col("timestamp").isNull()
df.filter(missing_key).count()

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Keep one row per (aircraft_id, timestamp) key of whatever `df` currently is.
# The window orders by "timestamp", which is also a partition column, so every
# row in a partition ties and the surviving row is effectively arbitrary.
pk_window = Window.partitionBy("aircraft_id", "timestamp").orderBy("timestamp")
deduped_df = df.withColumn("row_num", row_number().over(pk_window)) \
               .filter("row_num = 1") \
               .drop("row_num")



In [0]:
from pyspark.sql.functions import count

df = spark.read.table("arao.aerodemo.enriched_sensor_data")

# Keys occurring more than once, ordered by how often they repeat.
dup_keys = (
    df.groupBy("aircraft_id", "timestamp")
    .agg(count("*").alias("cnt"))
    .filter("cnt > 1")
    .orderBy("cnt", ascending=False)
)
dup_keys.show(truncate=False)

In [0]:
# Show five distinct timestamp values from enriched_sensor_data to eyeball their format.
df = spark.read.table("arao.aerodemo.enriched_sensor_data")
df.select("timestamp").distinct().show(5, truncate=False)

In [0]:
# Remove the raw_sensor_data table from the catalog.
spark.sql("DROP TABLE IF EXISTS arao.aerodemo.raw_sensor_data")

# Remove its stored schema folder so the previously inferred schema is gone.
raw_schema_dir = "dbfs:/Volumes/arao/aerodemo/tmp/raw/schema/raw_sensor_data"
dbutils.fs.rm(raw_schema_dir, recurse=True)

In [0]:
# Recursively delete the checkpoint folder kept for cleaned_sensor_data under the pipeline metadata path.
dbutils.fs.rm("dbfs:/pipelines/arao/aerodemo/_dlt_metadata/checkpoints/cleaned_sensor_data", recurse=True)

In [0]:
# Drop cleaned_sensor_data; a no-op when the table does not exist.
spark.sql("DROP TABLE IF EXISTS arao.aerodemo.cleaned_sensor_data")

In [0]:
# Show the first five timestamp values of raw_sensor_data, untruncated.
spark.read.table("arao.aerodemo.raw_sensor_data").select("timestamp").show(5, truncate=False)

In [0]:
from pyspark.sql import functions as F

df = spark.read.table("arao.aerodemo.enriched_sensor_data")

# Keys that occur more than once in the enriched table.
per_key = df.groupBy("aircraft_id", "timestamp").count()
per_key.filter(F.col("count") > 1).show()

In [0]:
# Row count per pipeline table, printed as "name: count".
pipeline_tables = [
    "raw_sensor_data",
    "cleaned_sensor_data",
    "maintenance_events",
    "enriched_sensor_data",
]
for table_name in pipeline_tables:
    row_count = spark.table(f"arao.aerodemo.{table_name}").count()
    print(f"{table_name}:", row_count)

In [0]:
# Read every CSV under the raw folder (header row, no explicit schema) and
# show the first ten timestamp values as they appear in the files.
csv_reader = spark.read.format("csv").option("header", "true")
df = csv_reader.load("/Volumes/arao/aerodemo/tmp/raw")

df.select("timestamp").show(10, truncate=False)

In [0]:
%sql
-- Reset of the arao.aerodemo schema: drops the tables, materialized views and
-- views listed below. Every statement uses IF EXISTS, so the cell is re-runnable.
USE CATALOG arao;
USE SCHEMA aerodemo;

-- Tables
DROP TABLE IF EXISTS raw_sensor_data;
DROP TABLE IF EXISTS cleaned_sensor_data;
DROP TABLE IF EXISTS maintenance_events;
DROP TABLE IF EXISTS sensor_features;
DROP TABLE IF EXISTS sensor_features_table;
DROP TABLE IF EXISTS prediction_results;
DROP TABLE IF EXISTS anomaly_predictions;
DROP TABLE IF EXISTS anomaly_alerts;
DROP TABLE IF EXISTS anomaly_alerts_sim;

-- Materialized views
DROP MATERIALIZED VIEW IF EXISTS enriched_sensor_data;
DROP MATERIALIZED VIEW IF EXISTS aircraft_model_reference_dlt;
DROP MATERIALIZED VIEW IF EXISTS aircraft_location_enriched;

-- Views
DROP VIEW IF EXISTS airport_location_reference;
DROP VIEW IF EXISTS aircraft_location_reference;
DROP VIEW IF EXISTS digital_twin_aircraft_view;
DROP VIEW IF EXISTS digital_twin_engine_view;

-- Tables with a generated-looking suffix; NOTE(review): origin not visible in this notebook.
DROP TABLE IF EXISTS labeled_test_examples_d431e3a9;
DROP TABLE IF EXISTS unlabeled_examples_d431e3a9;

In [0]:
%sql
-- Drop all relevant tables/views
-- Each object is dropped with the statement matching its kind
-- (TABLE / MATERIALIZED VIEW / VIEW); IF EXISTS makes the cell re-runnable.
USE CATALOG arao;
USE SCHEMA aerodemo;

DROP TABLE IF EXISTS raw_sensor_data;
DROP TABLE IF EXISTS cleaned_sensor_data;
DROP TABLE IF EXISTS maintenance_events;
DROP MATERIALIZED VIEW IF EXISTS aircraft_model_reference_dlt;
DROP MATERIALIZED VIEW IF EXISTS enriched_sensor_data;
DROP TABLE IF EXISTS sensor_features;
DROP TABLE IF EXISTS sensor_features_table;
DROP TABLE IF EXISTS prediction_results;
DROP TABLE IF EXISTS anomaly_predictions;
DROP TABLE IF EXISTS anomaly_alerts;
DROP TABLE IF EXISTS anomaly_alerts_sim;
DROP VIEW IF EXISTS digital_twin_engine_view;
DROP VIEW IF EXISTS digital_twin_aircraft_view;
DROP VIEW IF EXISTS airport_location_reference;
DROP VIEW IF EXISTS aircraft_location_reference;
DROP MATERIALIZED VIEW IF EXISTS aircraft_location_enriched;

In [0]:
# Recursively delete the stored schema folders for the raw sensor table and
# for the maintenance data.
for schema_dir in (
    "dbfs:/Volumes/arao/aerodemo/tmp/raw/schema/raw_sensor_data",
    "dbfs:/Volumes/arao/aerodemo/tmp/maintenance/schema",
):
    dbutils.fs.rm(schema_dir, True)

In [0]:
# Load every CSV in the raw and maintenance folders (header row, string columns).
raw_df = spark.read.format("csv").option("header", True).load("/Volumes/arao/aerodemo/tmp/raw/")
maint_df = spark.read.format("csv").option("header", True).load("/Volumes/arao/aerodemo/tmp/maintenance/")

In [0]:
# Row counts of the two CSV-backed DataFrames; each count() triggers a full read of the files.
print("✅ Sensor rows:", raw_df.count())
print("✅ Maintenance events:", maint_df.count())

In [0]:
# Show the first five raw timestamp strings, untruncated, to check their format.
raw_df.select("timestamp").show(5, truncate=False)

In [0]:
raw_dir = "/Volumes/arao/aerodemo/tmp/raw/"

# Resolve the newest sensor file instead of using a literal
# "<latest_timestamp>" placeholder, which can never match a real file.
# File names carry a YYYYMMDD_HHMMSS suffix (e.g.
# raw_sensor_data_20250521_171256.csv), so the lexicographically greatest
# name is also the most recent one.
sensor_files = sorted(
    f.path
    for f in dbutils.fs.ls(raw_dir)
    if f.name.startswith("raw_sensor_data_") and f.name.endswith(".csv")
)
if not sensor_files:
    raise FileNotFoundError(f"No raw_sensor_data_*.csv files found in {raw_dir}")

raw_df = spark.read.option("header", True).csv(sensor_files[-1])
raw_df.select("timestamp").show(5, truncate=False)

In [0]:
# Render the raw folder listing as a table.
display(dbutils.fs.ls("/Volumes/arao/aerodemo/tmp/raw/"))

In [0]:
 latest_file = "/Volumes/arao/aerodemo/tmp/raw/raw_sensor_data_20250521_171256.csv"

raw_df = spark.read.option("header", True).csv(latest_file)
raw_df.select("timestamp").show(5, truncate=False)

In [0]:
# Folders to prune
raw_path = "/Volumes/arao/aerodemo/tmp/raw"
maint_path = "/Volumes/arao/aerodemo/tmp/maintenance"

# The single file to keep in each folder (update if needed)
latest_raw = "raw_sensor_data_20250521_171256.csv"
latest_maint = "maintenance_events_20250521_171256.csv"

# (folder, file-name prefix to match, file to keep). Entries whose name does
# not start with the prefix (e.g. sub-folders) are left alone.
cleanup_plan = [
    (raw_path, "raw_sensor_data_", latest_raw),
    (maint_path, "maintenance_events_", latest_maint),
]

for folder, prefix, keep_name in cleanup_plan:
    for file in dbutils.fs.ls(folder):
        if file.name == keep_name or not file.name.startswith(prefix):
            continue
        print(f"Deleting: {file.path}")
        dbutils.fs.rm(file.path)

In [0]:
# Reload both CSV folders, then report their sizes and a timestamp sample.
raw_df = spark.read.format("csv").option("header", True).load("/Volumes/arao/aerodemo/tmp/raw/")
maint_df = spark.read.format("csv").option("header", True).load("/Volumes/arao/aerodemo/tmp/maintenance/")

sensor_record_count = raw_df.count()
print("✅ Sensor records:", sensor_record_count)
maintenance_event_count = maint_df.count()
print("✅ Maintenance events:", maintenance_event_count)
raw_df.select("timestamp").show(5, truncate=False)

In [0]:
%sql
-- Empty the four pipeline tables while keeping their definitions.
-- NOTE(review): other cells drop enriched_sensor_data as a MATERIALIZED VIEW;
-- confirm TRUNCATE TABLE is valid for it in its current form.
TRUNCATE TABLE arao.aerodemo.cleaned_sensor_data;
TRUNCATE TABLE arao.aerodemo.raw_sensor_data;
TRUNCATE TABLE arao.aerodemo.enriched_sensor_data;
TRUNCATE TABLE arao.aerodemo.maintenance_events;

In [0]:
%sql
-- Drop the four pipeline tables; IF EXISTS makes each statement a no-op when absent.
-- NOTE(review): other cells drop enriched_sensor_data with DROP MATERIALIZED VIEW;
-- confirm which object kind it currently is.
DROP TABLE IF EXISTS arao.aerodemo.cleaned_sensor_data;
DROP TABLE IF EXISTS arao.aerodemo.raw_sensor_data;
DROP TABLE IF EXISTS arao.aerodemo.enriched_sensor_data;
DROP TABLE IF EXISTS arao.aerodemo.maintenance_events;

In [0]:
# Replace with your actual path if different
# Template only: this is a plain string (not an f-string), so "{your_pipeline_id}"
# is passed literally. Substitute the real pipeline id before running.
dbutils.fs.rm("dbfs:/pipelines/{your_pipeline_id}/_dlt_metadata", recurse=True)

In [0]:
# Destructive: recursively delete the raw and maintenance folders, data included.
for dataset in ("raw", "maintenance"):
    dbutils.fs.rm(f"dbfs:/Volumes/arao/aerodemo/tmp/{dataset}/", recurse=True)

In [0]:
# Print "<name> <row count>" for each listed table/view; the first name that
# cannot be read raises and stops the loop.
objects_to_count = (
    "raw_sensor_data",
    "cleaned_sensor_data",
    "maintenance_events",
    "aircraft_model_reference",
    "enriched_sensor_data",
    "sensor_features",
    "prediction_results",
    "digital_twin_engine_view",
    "digital_twin_aircraft_view",
    "post_dlt_sanity_check",
)
for table in objects_to_count:
    row_count = spark.table(f"arao.aerodemo.{table}").count()
    print(table, row_count)

In [0]:
# Render the listings of the raw and maintenance folders.
for dataset in ("raw", "maintenance"):
    display(dbutils.fs.ls(f"dbfs:/Volumes/arao/aerodemo/tmp/{dataset}"))

In [0]:
# Load the raw CSV folder, then show its schema, a five-row sample and the row count.
raw_df = spark.read.format("csv").option("header", True).load("/Volumes/arao/aerodemo/tmp/raw")
raw_df.printSchema()
raw_df.show(5, truncate=False)
sensor_row_count = raw_df.count()
print("✅ Sensor rows:", sensor_row_count)

In [0]:
# Load the maintenance CSV folder, then show its schema, a sample and the row count.
maint_df = spark.read.format("csv").option("header", True).load("/Volumes/arao/aerodemo/tmp/maintenance")
maint_df.printSchema()
maint_df.show(5, truncate=False)
maintenance_event_count = maint_df.count()
print("✅ Maintenance events:", maintenance_event_count)

In [0]:
from pyspark.sql.functions import col, to_timestamp

# Show the raw timestamp string next to its parse with the
# "yyyy-MM-dd HH:mm:ss" pattern to verify the format matches.
parsed_ts = to_timestamp("timestamp", "yyyy-MM-dd HH:mm:ss").alias("parsed_ts")
timestamp_check = raw_df.select(col("timestamp"), parsed_ts)
timestamp_check.show(5, truncate=False)

In [0]:
# Count NULLs in the key columns. The isNull() flags are cast to int because
# a no-argument sum() aggregates numeric columns only; boolean flags would be
# dropped from the result and no counts would be shown.
null_flags = [
    col(c).isNull().cast("int").alias(f"{c}_is_null")
    for c in ["timestamp", "aircraft_id", "engine_temp"]
]
raw_df.select(null_flags).groupBy().sum().show()

In [0]:
# Drop cleaned_sensor_data; a no-op when the table does not exist.
spark.sql("DROP TABLE IF EXISTS arao.aerodemo.cleaned_sensor_data")

In [0]:
# Print "<name> <row count>" for the core pipeline tables.
core_tables = (
    "raw_sensor_data",
    "cleaned_sensor_data",
    "maintenance_events",
    "aircraft_model_reference",
    "enriched_sensor_data",
    "sensor_features",
)
for table in core_tables:
    row_count = spark.table(f"arao.aerodemo.{table}").count()
    print(table, row_count)

In [0]:
df = spark.read.table("arao.aerodemo.raw_sensor_data")

# Five sample rows that satisfy engine_temp < 1000 and vibration >= 0.
key_columns = df.select("timestamp", "aircraft_id", "engine_temp", "vibration")
in_range_rows = key_columns.filter("engine_temp < 1000 AND vibration >= 0")
in_range_rows.show(5)

In [0]:
from pyspark.sql.functions import count

# Up to five (aircraft_id, timestamp) keys that occur more than once.
enriched = spark.read.table("arao.aerodemo.enriched_sensor_data")
dup_counts = (
    enriched
    .groupBy("aircraft_id", "timestamp")
    .agg(count("*").alias("dup_count"))
    .filter("dup_count > 1")
)
dup_counts.show(5, truncate=False)

In [0]:
%sql
-- Despite its name, "primaryKey" is a CHECK constraint: it enforces that both
-- key columns are non-NULL, and says nothing about uniqueness.
ALTER TABLE arao.aerodemo.sensor_features
ADD CONSTRAINT primaryKey CHECK (aircraft_id IS NOT NULL AND timestamp IS NOT NULL);

In [0]:
# NOTE(review): `df` is whatever an earlier cell last bound to that name (other
# cells assign it both Spark and pandas frames) — confirm it holds engine data.
# The target "engines" is unqualified, unlike the arao.aerodemo.engines writes
# elsewhere in this notebook, so it lands in the current catalog/schema.
spark.createDataFrame(df).write.mode("overwrite").saveAsTable("engines")

In [0]:
def _save_component(component_pdf, target_table):
    """Convert a generated frame to Spark and overwrite `target_table` with it."""
    spark.createDataFrame(component_pdf).write.mode("overwrite").saveAsTable(target_table)


# One generated dataset per component, each overwriting its own table.
_save_component(generate_engines_data(), "arao.aerodemo.engines")
_save_component(generate_landing_gear_data(), "arao.aerodemo.landing_gear")
_save_component(generate_avionics_data(), "arao.aerodemo.avionics_systems")
_save_component(generate_cabin_pressurization_data(), "arao.aerodemo.cabin_pressurization")
_save_component(generate_airframe_data(), "arao.aerodemo.airframe")

In [0]:
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

# --- Synthetic Data Generators for Components ---

def generate_engines_data(num_records=100, seed=None):
    """Build a synthetic engine table for the demo aircraft.

    Args:
        num_records: Number of rows (engines) to generate.
        seed: Optional integer seed. When given, values come from a private
            ``RandomState`` so the output is reproducible; when ``None`` the
            global NumPy random state is used, exactly as before.

    Returns:
        pandas.DataFrame with ``engine_id`` (``engine_0``, ``engine_1``, ...),
        a constant ``aircraft_id`` of ``A320_101`` and five float columns
        sampled uniformly from fixed ranges.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    return pd.DataFrame({
        'engine_id': [f'engine_{i}' for i in range(num_records)],
        'aircraft_id': ['A320_101'] * num_records,
        'thrust_level': rng.uniform(50000, 120000, num_records),
        'fuel_consumption_rate': rng.uniform(2.0, 5.0, num_records),
        'temperature_reading': rng.uniform(300, 800, num_records),
        'vibration_level': rng.uniform(0.1, 2.0, num_records),
        'oil_pressure': rng.uniform(30, 80, num_records)
    })

def generate_landing_gear_data(num_records=100, seed=None):
    """Build a synthetic landing-gear table for the demo aircraft.

    Args:
        num_records: Number of rows to generate.
        seed: Optional integer seed. When given, values come from a private
            ``RandomState`` so the output is reproducible; when ``None`` the
            global NumPy random state is used, exactly as before.

    Returns:
        pandas.DataFrame with ``gear_id`` (``gear_0``, ...), a constant
        ``aircraft_id`` of ``A320_101`` and five float columns sampled
        uniformly from fixed ranges.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    return pd.DataFrame({
        'gear_id': [f'gear_{i}' for i in range(num_records)],
        'aircraft_id': ['A320_101'] * num_records,
        'hydraulic_pressure': rng.uniform(2000, 4000, num_records),
        'strut_compression': rng.uniform(5, 15, num_records),
        'brake_wear': rng.uniform(0, 100, num_records),
        'brake_temperature': rng.uniform(100, 500, num_records),
        'shock_absorber_status': rng.uniform(50, 100, num_records)
    })

def generate_avionics_data(num_records=100, seed=None):
    """Build a synthetic avionics table for the demo aircraft.

    Args:
        num_records: Number of rows to generate.
        seed: Optional integer seed. When given, values come from a private
            ``RandomState`` so the output is reproducible; when ``None`` the
            global NumPy random state is used, exactly as before.

    Returns:
        pandas.DataFrame with ``avionics_id`` (``avionics_0``, ...), a constant
        ``aircraft_id`` of ``A320_101``, four uniformly sampled float columns
        and an integer ``error_logs`` column drawn from 0..9.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    return pd.DataFrame({
        'avionics_id': [f'avionics_{i}' for i in range(num_records)],
        'aircraft_id': ['A320_101'] * num_records,
        'power_status': rng.uniform(110, 130, num_records),
        'signal_integrity': rng.uniform(20, 40, num_records),
        'data_transmission_rate': rng.uniform(50, 100, num_records),
        'system_temperature': rng.uniform(20, 50, num_records),
        # randint's upper bound is exclusive
        'error_logs': rng.randint(0, 10, num_records)
    })

def generate_cabin_pressurization_data(num_records=100, seed=None):
    """Build a synthetic cabin-pressurization table for the demo aircraft.

    Args:
        num_records: Number of rows to generate.
        seed: Optional integer seed. When given, values come from a private
            ``RandomState`` so the output is reproducible; when ``None`` the
            global NumPy random state is used, exactly as before.

    Returns:
        pandas.DataFrame with ``cabin_id`` (``cabin_0``, ...), a constant
        ``aircraft_id`` of ``A320_101`` and five float columns sampled
        uniformly from fixed ranges.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    return pd.DataFrame({
        'cabin_id': [f'cabin_{i}' for i in range(num_records)],
        'aircraft_id': ['A320_101'] * num_records,
        'cabin_pressure': rng.uniform(10, 15, num_records),
        'seal_integrity': rng.uniform(90, 100, num_records),
        'airflow_rate': rng.uniform(300, 500, num_records),
        'temperature_control': rng.uniform(18, 25, num_records),
        'humidity_level': rng.uniform(20, 60, num_records)
    })

def generate_airframe_data(num_records=100, seed=None):
    """Build a synthetic airframe table for the demo aircraft.

    Args:
        num_records: Number of rows to generate.
        seed: Optional integer seed. When given, values come from a private
            ``RandomState`` so the output is reproducible; when ``None`` the
            global NumPy random state is used, exactly as before.

    Returns:
        pandas.DataFrame with ``airframe_id`` (``airframe_0``, ...), a constant
        ``aircraft_id`` of ``A320_101`` and four float columns sampled
        uniformly from fixed ranges.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    return pd.DataFrame({
        'airframe_id': [f'airframe_{i}' for i in range(num_records)],
        'aircraft_id': ['A320_101'] * num_records,
        'stress_points': rng.uniform(100, 300, num_records),
        'fatigue_crack_growth': rng.uniform(0, 10, num_records),
        'temperature_fluctuations': rng.uniform(-30, 50, num_records),
        'structural_integrity': rng.uniform(50, 100, num_records)
    })

In [0]:
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise create one; rebinds the name `spark`.
spark = SparkSession.builder.getOrCreate()

def generate_engines_data(num_records=100, seed=None):
    """Build a synthetic engine table including manufacturer and model.

    Args:
        num_records: Number of rows (engines) to generate.
        seed: Optional integer seed. When given, values come from a private
            ``RandomState`` so the output is reproducible; when ``None`` the
            global NumPy random state is used, exactly as before.

    Returns:
        pandas.DataFrame with ``engine_id`` (``engine_0``, ...), a constant
        ``aircraft_id`` of ``A320_101``, independently sampled ``manufacturer``
        and ``model`` labels, and five float columns sampled uniformly from
        fixed ranges.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    data = {
        'engine_id': [f'engine_{i}' for i in range(num_records)],
        'aircraft_id': ['A320_101'] * num_records,  # consistent for demo
        # manufacturer and model are drawn independently of each other
        'manufacturer': rng.choice(['GE', 'Rolls-Royce', 'Pratt & Whitney'], num_records),
        'model': rng.choice(['CFM56', 'Trent XWB', 'PW1000G'], num_records),
        'thrust_level': rng.uniform(50000, 120000, num_records),
        'fuel_consumption_rate': rng.uniform(2.0, 5.0, num_records),
        'temperature_reading': rng.uniform(300, 800, num_records),
        'vibration_level': rng.uniform(0.1, 2.0, num_records),
        'oil_pressure': rng.uniform(30, 80, num_records)
    }
    return pd.DataFrame(data)

# Generate a fresh engine dataset and overwrite the engines table with it,
# with the mergeSchema option enabled on the write.
engines_df = generate_engines_data()
engines_sdf = spark.createDataFrame(engines_df)
engines_writer = engines_sdf.write.mode("overwrite").option("mergeSchema", "true")
engines_writer.saveAsTable("arao.aerodemo.engines")

In [0]:
# Tables to drop.
tables = [
    "sensor_features",
    "prediction_results",
    "component_twins_master",
    "twin_engine",
    "twin_landing_gear",
    "twin_airframe",
    "twin_avionics",
    "twin_cabin_pressurization"
]

# The digital_twin_*_view objects are dropped with DROP VIEW in this notebook's
# SQL reset cells, so they are handled as views here as well rather than being
# passed to DROP TABLE.
views = [
    "digital_twin_engine_view",
    "digital_twin_aircraft_view"
]

for table in tables:
    spark.sql(f"DROP TABLE IF EXISTS arao.aerodemo.{table}")

for view in views:
    spark.sql(f"DROP VIEW IF EXISTS arao.aerodemo.{view}")

In [0]:
# Sanity-check results, newest check_time first.
sanity_checks = spark.read.table("arao.aerodemo.post_dlt_sanity_check")
df = sanity_checks.orderBy("check_time", ascending=False)
df.display()  # or df.show(1) for latest result

In [0]:
from pyspark.sql.functions import col
import os

# Paths and tables
engine_path = "dbfs:/Volumes/arao/aerodemo/tmp/engine"
twin_engine_table = "arao.aerodemo.twin_engine"
health_engine_table = "arao.aerodemo.component_health_engine"

print("✅ Step 1: Checking file presence in raw engine folder...")
# Listing a missing folder raises; guard it like steps 2 and 3 so one failed
# check does not abort the remaining ones.
try:
    engine_files = dbutils.fs.ls(engine_path)
except Exception as e:
    engine_files = []
    print("❌ Error listing engine data path:", str(e))
else:
    if len(engine_files) == 0:
        print("❌ No files found in engine data path:", engine_path)
    else:
        print(f"✅ Found {len(engine_files)} files in engine data path.")

print("\n✅ Step 2: Checking record count in twin_engine...")
try:
    twin_count = spark.read.table(twin_engine_table).count()
    print(f"✔️ twin_engine row count: {twin_count}")
    if twin_count == 0:
        print("⚠️ twin_engine table is empty. Check synthetic data generation or file format issues.")
except Exception as e:
    print("❌ Error reading twin_engine table:", str(e))

print("\n✅ Step 3: Checking record count in component_health_engine...")
try:
    health_count = spark.read.table(health_engine_table).count()
    print(f"✔️ component_health_engine row count: {health_count}")
    if health_count == 0:
        print("⚠️ No records in component_health_engine. Possible upstream issue.")
except Exception as e:
    print("❌ Error reading component_health_engine table:", str(e))

In [0]:
import pandas as pd

# Read the engine sample CSV with pandas and print how many rows each aircraft_id has.
df = pd.read_csv("/Volumes/arao/aerodemo/tmp/engine/engines_sample.csv")
print(df['aircraft_id'].value_counts())

In [0]:
%sql
-- Row count per aircraft in twin_engine.
SELECT aircraft_id, COUNT(*) 
FROM arao.aerodemo.twin_engine 
GROUP BY aircraft_id

In [0]:
%sql
-- Row count per aircraft in twin_engine (same query as the previous cell).
SELECT aircraft_id, COUNT(*)
FROM arao.aerodemo.twin_engine
GROUP BY aircraft_id

In [0]:
# Recursively delete the landing folder of each component, in this order:
# engine, landing gear, avionics, cabin, airframe.
component_dirs = ("engine", "landing_gear", "avionics", "cabin", "airframe")
for component_dir in component_dirs:
    dbutils.fs.rm(f"/Volumes/arao/aerodemo/tmp/{component_dir}", recurse=True)

In [0]:
# Inspect the airframe CSV folder: column names plus a few sample rows.
csv_reader = spark.read.format("csv").option("header", "true")
airframe_df = csv_reader.load("/Volumes/arao/aerodemo/tmp/airframe")

# Column names
print("🔍 Columns in airframe CSV:")
airframe_columns = airframe_df.columns
print(airframe_columns)

# First five rows, untruncated
print("\n🔍 Sample data:")
airframe_df.show(5, truncate=False)

In [0]:
# Inspect component_health_airframe: all column names, then a sample of the
# key, event_timestamp and health_status columns.
health_airframe_df = spark.read.table("arao.aerodemo.component_health_airframe")

print("🔍 Columns in component_health_airframe:")
print(health_airframe_df.columns)

print("\n🔍 Sample data:")
sample_columns = ["aircraft_id", "component_id", "event_timestamp", "health_status"]
health_airframe_df.select(*sample_columns).show(5, truncate=False)

In [0]:
%sql
-- Drop the per-component health tables.
DROP TABLE IF EXISTS arao.aerodemo.component_health_airframe;
DROP TABLE IF EXISTS arao.aerodemo.component_health_landing_gear;
DROP TABLE IF EXISTS arao.aerodemo.component_health_avionics;
DROP TABLE IF EXISTS arao.aerodemo.component_health_cabin_pressurization;
DROP TABLE IF EXISTS arao.aerodemo.component_health_engine;

-- Drop the two objects derived from them.
-- NOTE(review): digital_twin_component_view is dropped as a TABLE here despite its name; confirm its kind.
DROP TABLE IF EXISTS arao.aerodemo.digital_twin_component_view;
DROP TABLE IF EXISTS arao.aerodemo.anomaly_alerts_component;

In [0]:
%sql
-- Distinct aircraft / base airport / coordinate combinations in the enriched location data.
SELECT DISTINCT aircraft_id, base_airport_code, latitude, longitude
FROM arao.aerodemo.aircraft_location_enriched

In [0]:
# Update A320 series
# No WHERE clause: the statement targets every row, and the ELSE branch keeps
# the existing airport_code for aircraft not listed in the CASE.
spark.sql("""
    UPDATE arao.aerodemo.aircraft_airport_map
    SET airport_code = CASE 
        WHEN aircraft_id IN ('A320_101', 'A320_102') THEN 'SFO'
        WHEN aircraft_id = 'A320_103' THEN 'LAX'
        WHEN aircraft_id = 'A320_104' THEN 'LAX'
        ELSE airport_code
    END
""")

# Update A330 series
# Here the WHERE clause limits the update to the three listed aircraft.
spark.sql("""
    UPDATE arao.aerodemo.aircraft_airport_map
    SET airport_code = 'JFK'
    WHERE aircraft_id IN ('A330_301', 'A330_302', 'A330_303')
""")

# Update B737 series
# Same shape as the A320 update: all rows targeted, unmatched rows keep their value.
spark.sql("""
    UPDATE arao.aerodemo.aircraft_airport_map
    SET airport_code = CASE
        WHEN aircraft_id = 'B737_201' THEN 'ORD'
        WHEN aircraft_id = 'B737_202' THEN 'ORD'
        WHEN aircraft_id = 'B737_203' THEN 'JFK'
        ELSE airport_code
    END
""")

print("✅ Aircraft-to-airport mappings updated!")

In [0]:
from databricks.sdk import WorkspaceClient

from databricks.sdk.service.workspace import ObjectType

w = WorkspaceClient()

folder_path = "/Workspace/Repos/anand.rao@databricks.com/databricks-aerodemo"

notebooks_to_check = [
    # "01_Table_Creation",  # Optional, add when ready
    "02_01_Synthetic_Data_Generation_v2",
    "02_02_Engine_Data_Generation",
    "02_03_CabinPressurization_Data_Generation",
    "02_04_Airframe_Synthetic_Data_Generation",
    "02_05_LandingGear_Data_Generation",
    "02_06_Avionics_Data_Generation",
    "02_07_ElectricalSystems_Data_Generation",
    "02_08_FuelSystems_Data_Generation",
    "02_09_HydraulicSystems_Data_Generation",
    "02_10_EnvironmentalSystems_Data_Generation"
]

# Collect the names (last path segment) of all notebooks in the folder.
# object_type is an SDK enum, not a string: comparing it with "NOTEBOOK" is
# always False and would report every notebook as missing, so compare against
# ObjectType.NOTEBOOK.
available_notebooks = [
    obj.path.split("/")[-1]
    for obj in w.workspace.list(path=folder_path)
    if obj.object_type == ObjectType.NOTEBOOK
]

# Debug: print all found notebooks
print("📋 Notebooks found in folder:")
for nb in available_notebooks:
    print(f" - {nb}")

# Report each expected notebook as found or missing.
for notebook in notebooks_to_check:
    if notebook in available_notebooks:
        print(f"✅ Found: {notebook}")
    else:
        print(f"❌ MISSING: {notebook}")

In [0]:
from databricks.sdk import WorkspaceClient

w = WorkspaceClient()

# Print "<object type>: <path>" for every entry directly under /Workspace/Repos.
repos_root = "/Workspace/Repos"
for entry in w.workspace.list(path=repos_root):
    print(f"{entry.object_type}: {entry.path}")

In [0]:
# Ask the notebook context for the workspace path of the notebook being run.
notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()
print(f"📍 Current notebook path: {notebook_path}")

In [0]:
import os

# Workspace path of this notebook, taken from the notebook context.
notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
notebook_path = notebook_context.notebookPath().get()

# Its parent folder is everything before the final path component.
folder_path = os.path.split(notebook_path)[0]

print(f"✅ Detected notebook folder: {folder_path}")

In [0]:
from databricks.sdk import WorkspaceClient

w = WorkspaceClient()

# Look the job up by id and print each task with its notebook path, or "N/A"
# for tasks that are not notebook tasks.
job_id = "173822373344591"
job = w.jobs.get(job_id)

print(f"✅ Job name: {job.settings.name}")
print("✅ Registered tasks:")

for task in job.settings.tasks:
    task_name = task.task_key
    if task.notebook_task:
        notebook_path = task.notebook_task.notebook_path
    else:
        notebook_path = "N/A"
    print(f" - {task_name}: {notebook_path}")

In [0]:
job_id = "173822373344591"
# Fetch the job definition using the existing workspace client `w`.
retrieved_job = w.jobs.get(job_id)

print(f"✅ Job '{retrieved_job.settings.name}' has the following tasks:")
for task in retrieved_job.settings.tasks:
    notebook_task = task.notebook_task
    pipeline_task = task.pipeline_task
    # A line is printed for each kind that is set on the task.
    if notebook_task:
        print(f" - Notebook task: {task.task_key} → {notebook_task.notebook_path}")
    if pipeline_task:
        print(f" - DLT pipeline task: {task.task_key} → Pipeline ID: {pipeline_task.pipeline_id}")

In [0]:
# ✅ Check 1: Unique airport locations
# Counts distinct (latitude, longitude) pairs in the enriched location table.
unique_airports = spark.sql("""
    SELECT DISTINCT latitude, longitude 
    FROM arao.aerodemo.aircraft_location_enriched_v2
""").count()
print(f"✅ Unique airport locations: {unique_airports}")

# ✅ Check 2: Unique aircraft IDs with CRITICAL or WARNING alerts
unique_aircraft_alerts = spark.sql("""
    SELECT DISTINCT aircraft_id 
    FROM arao.aerodemo.anomaly_alerts_component
    WHERE health_status IN ('CRITICAL', 'WARNING')
""").count()
print(f"✅ Unique aircraft with CRITICAL/WARNING alerts: {unique_aircraft_alerts}")

# ✅ Check 3: Aircraft with most CRITICAL or WARNING alerts
# The whole grouped result is collected to pandas; only the first ten rows are printed.
aircraft_alerts = spark.sql("""
    SELECT aircraft_id, COUNT(*) AS alert_count
    FROM arao.aerodemo.anomaly_alerts_component
    WHERE health_status IN ('CRITICAL', 'WARNING')
    GROUP BY aircraft_id
    ORDER BY alert_count DESC
""").toPandas()

if not aircraft_alerts.empty:
    print("✅ Top aircraft by alert count:")
    print(aircraft_alerts.head(10))
else:
    print("⚠️ No CRITICAL or WARNING alerts found.")

In [0]:
import pandas as pd

# 1️⃣ Check distinct aircraft IDs per model
df_aircraft_count = spark.sql("""
    SELECT model, COUNT(DISTINCT aircraft_id) AS num_aircraft
    FROM arao.aerodemo.sensor_raw_v2
    GROUP BY model
    ORDER BY model
""").toPandas()
print("✅ Aircraft count per model:")
print(df_aircraft_count)

# 2️⃣ Check date range in the raw data
# MIN/MAX are taken over the timestamp column as stored.
df_date_range = spark.sql("""
    SELECT MIN(timestamp) AS start_date, MAX(timestamp) AS end_date
    FROM arao.aerodemo.sensor_raw_v2
""").toPandas()
print("\n✅ Date range in raw data:")
print(df_date_range)

# 3️⃣ Check the distribution of anomaly scores
# One output row per distinct anomaly_score value, highest score first.
df_anomaly_dist = spark.sql("""
    SELECT anomaly_score, COUNT(*) AS count
    FROM arao.aerodemo.sensor_raw_v2
    GROUP BY anomaly_score
    ORDER BY anomaly_score DESC
""").toPandas()
print("\n✅ Anomaly score distribution:")
print(df_anomaly_dist)

# 4️⃣ Check if post-repair data resets metrics
# "After repair" is approximated by an exact match on anomaly_score = 0.0.
df_post_repair = spark.sql("""
    SELECT aircraft_id, AVG(engine_temp) AS avg_temp_after_repair
    FROM arao.aerodemo.sensor_raw_v2
    WHERE anomaly_score = 0.0  -- After repair
    GROUP BY aircraft_id
    ORDER BY avg_temp_after_repair DESC
    LIMIT 10
""").toPandas()
print("\n✅ Top 10 aircraft by average engine temp (after repair):")
print(df_post_repair)

# 5️⃣ Check maintenance events coverage
df_maintenance = spark.sql("""
    SELECT event_type, COUNT(*) AS num_events
    FROM arao.aerodemo.maintenance_events_v2
    GROUP BY event_type
""").toPandas()
print("\n✅ Maintenance events breakdown:")
print(df_maintenance)

In [0]:
# Collect the SHOW TABLES result for the schema into pandas and print the
# table name and temporary flag of each entry.
show_tables_sql = """
    SHOW TABLES IN arao.aerodemo
"""
df_tables = spark.sql(show_tables_sql).toPandas()

print("✅ Available tables in arao.aerodemo schema:")
name_and_temp_flag = df_tables[['tableName', 'isTemporary']]
print(name_and_temp_flag)

In [0]:
# Folder to inspect (adjust to the actual raw sensor input folder).
input_path = "dbfs:/mnt/aerodemo/raw_sensor_data/"  # ← replace with your actual raw folder

files = dbutils.fs.ls(input_path)
file_count = len(files)

# Summary line, then name and size of every entry.
print(f"✅ Found {file_count} files in {input_path}")
for entry in files:
    print(entry.name, entry.size)

In [0]:
# Wrap both listings in display(): a bare expression is only rendered when it
# is the last one in the cell, so the raw listing was previously discarded.
display(dbutils.fs.ls("/Volumes/arao/aerodemo/tmp/raw/"))
display(dbutils.fs.ls("/Volumes/arao/aerodemo/tmp/maintenance/"))

In [0]:
# Use Spark to read from DBFS (avoids local mount issues)
# NOTE(review): `latest_file_path` is not assigned in any cell visible in this
# notebook; it must already exist in the session or this raises NameError.
spark_df = spark.read.option("header", True).csv(latest_file_path)

# Convert to Pandas
# toPandas() collects the whole file to the driver.
latest_df = spark_df.toPandas()

print("✅ Loaded DataFrame:")
print(latest_df.head())

# The file is expected to have aircraft_id and event_type columns.
print(f"✅ Total rows: {len(latest_df)}")
print(f"✅ Unique aircraft IDs: {latest_df['aircraft_id'].nunique()}")
print(f"✅ Unique event types: {latest_df['event_type'].unique()}")

In [0]:
import matplotlib.pyplot as plt

# Print both summaries explicitly: as bare expressions in the middle of a cell
# their results were computed and then thrown away.
print(latest_df['event_type'].value_counts())
print(latest_df.groupby('aircraft_id').size().sort_values(ascending=False))

# Parse event_date in place (the column is overwritten), then plot how the
# events are distributed over time.
latest_df['event_date'] = pd.to_datetime(latest_df['event_date'])
latest_df['event_date'].hist(bins=20)
plt.title("Event Dates Distribution")
plt.xlabel("Date")
plt.ylabel("Count")
plt.show()

In [0]:
# Check 1: Airport reference
# NOTE(review): `df_airport` is not created in this cell; it comes from the
# cell that loads arao.aerodemo.airport_reference, which must be run first.
print("✅ Airport reference table sample:")
print(df_airport[['airport_code', 'airport_name', 'latitude', 'longitude']].head())

# Check 2: Aircraft-airport map
df_aircraft_airport = spark.sql("SELECT * FROM arao.aerodemo.aircraft_airport_map").toPandas()
print(f"\n✅ Aircraft-airport map: {len(df_aircraft_airport)} records")
print(f"Available columns: {df_aircraft_airport.columns.tolist()}")
print(df_aircraft_airport.head())

# Check 3: Enriched location table
df_location = spark.sql("""
    SELECT DISTINCT aircraft_id, airport_code, latitude, longitude
    FROM arao.aerodemo.aircraft_location_enriched_v2
""").toPandas()
print(f"\n✅ Enriched aircraft locations: {len(df_location)} distinct aircraft-airport pairs")
print(df_location.head())

# Check 4: Alerting aircraft
df_alerts = spark.sql("""
    SELECT DISTINCT aircraft_id
    FROM arao.aerodemo.anomaly_alerts_component
    WHERE health_status IN ('CRITICAL', 'WARNING')
""").toPandas()
print(f"\n✅ Aircraft triggering CRITICAL/WARNING alerts: {len(df_alerts)} aircraft")
print(df_alerts.head())

# Check 5: Join alerts with location
# Left join keeps every alerting aircraft; those without a location row get a
# NaN airport_code and are reported as unmapped.
df_alerts_locations = df_alerts.merge(df_location, on='aircraft_id', how='left')
missing_airports = df_alerts_locations[df_alerts_locations['airport_code'].isna()]
if not missing_airports.empty:
    print("\n⚠️ ALERT: Some alerting aircraft have no airport mapping!")
    print(missing_airports)
else:
    print("\n✅ All alerting aircraft have airport mappings.")

# value_counts() skips NaN, so unmapped aircraft are not part of this summary.
print("\n✅ Summary of alerting aircraft by airport:")
print(df_alerts_locations['airport_code'].value_counts())

In [0]:
import pandas as pd

# Check 1: Airport reference — first, show columns
# Loads the whole reference table into pandas as `df_airport`.
df_airport = spark.sql("SELECT * FROM arao.aerodemo.airport_reference").toPandas()
print(f"✅ Airport reference table: {len(df_airport)} records")
print(f"Available columns: {df_airport.columns.tolist()}")

In [0]:
# Check if these aircraft exist in synthetic data
# Prints the aircraft_airport_map rows whose aircraft_id also appears in df_alerts;
# both frames must already exist from earlier cells.
print("✅ Checking if alerting aircraft exist in synthetic aircraft list...")
print(df_aircraft_airport[df_aircraft_airport['aircraft_id'].isin(df_alerts['aircraft_id'])])

In [0]:

# Reload all alerting aircraft (any health status) and the aircraft-to-airport map.
df_alerts = spark.sql("SELECT DISTINCT aircraft_id FROM arao.aerodemo.anomaly_alerts_component").toPandas()
df_aircraft_airport = spark.sql("SELECT * FROM arao.aerodemo.aircraft_airport_map").toPandas()

# Left join keeps every alerting aircraft; unmapped ones end up with a null airport_code.
merged = df_alerts.merge(df_aircraft_airport, on="aircraft_id", how="left")
unmapped_mask = merged['airport_code'].isna()
missing_airports = merged[unmapped_mask]

if not missing_airports.empty:
    print("⚠ Still missing airport mappings for:")
    print(missing_airports)
else:
    print("✅ All alerting aircraft now have airport mappings!")