# Notebook 3: Autoloader-Based Load
Use Databricks Auto Loader to ingest CSV files from the volume into Delta tables.

 📘 When to Run `03_Autoloader_Load.ipynb` vs. `03B_Autoloader_Maintenance_Events.ipynb`

🔁 `03_Autoloader_Load.ipynb`
Use this notebook to ingest **raw sensor data** into the `raw_sensor_data` Delta table.

- 📁 Watches files in: `/Volumes/arao/aerodemo/tmp/raw/`
- 📄 Looks for: any `*.csv` file in that folder (e.g. `raw_sensor_data_*.csv`)
- 🗂 Schema: timestamp, aircraft_id, model, engine_temp, fuel_efficiency, vibration, altitude, pressure, speed
- ✅ Run this when:
  - You've generated new synthetic raw sensor files
  - You want to simulate ingestion of new telemetry data
  - You're testing DLT pipelines that depend on `raw_sensor_data`

In [0]:

from pyspark.sql.functions import *
from pyspark.sql.types import *

# NOTE(review): this value is not read by any cell visible in this notebook;
# each later cell reassigns `volume_path` to its own folder before using it.
volume_path = "/Volumes/arao/aerodemo/tmp/1"


This setup ensures:

- ✅ Only files whose names match `*.csv` are picked up  
- ✅ Schema is controlled and predictable  
- ✅ Auto Loader correctly tracks ingestion state  
- ✅ Safe for batch-mode re-runs: the write uses a run-once trigger, so each run processes the pending files and then stops


In [0]:
from pyspark.sql.types import *

# Explicit schema for the raw sensor CSV files: a timestamp, two string
# identifiers, then six double-valued measurements. Every column is nullable.
sensor_schema = StructType(
    [
        StructField("timestamp", TimestampType(), True),
        StructField("aircraft_id", StringType(), True),
        StructField("model", StringType(), True),
    ]
    + [
        StructField(measurement, DoubleType(), True)
        for measurement in (
            "engine_temp",
            "fuel_efficiency",
            "vibration",
            "altitude",
            "pressure",
            "speed",
        )
    ]
)

# Dedicated landing folder for raw sensor data files. `volume_path` is also
# read by the write step that follows to build its checkpoint location.
volume_path = "/Volumes/arao/aerodemo/tmp/raw"

# Auto Loader stream over the landing folder.
#
# The schema is supplied explicitly via `.schema(...)`, so the inference-only
# options (`cloudFiles.inferColumnTypes`, reader-side `mergeSchema`) are
# deliberately not set: they only take effect when Auto Loader infers the
# schema, and setting them here contradicted the intent of enforcing one.
#
# NOTE(review): the schema location (and the checkpoint used by the write
# step) live under the same folder that is being loaded; presumably only the
# `*.csv` glob keeps their files out of the stream. Consider moving them
# outside the landing folder — TODO confirm before changing, since relocating
# a checkpoint resets ingestion state.
raw_df = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("header", "true")
    .option("pathGlobFilter", "*.csv")  # Only pick up files named *.csv
    .option("cloudFiles.schemaLocation", f"{volume_path}/schema/raw_sensor_data")
    .schema(sensor_schema)  # Enforce the schema instead of inferring it
    .load(volume_path)
)

# Optional: print the stream's schema as a quick sanity check.
raw_df.printSchema()

# Append the sensor stream to the `arao.aerodemo.raw_sensor_data` Delta table.
#
# `availableNow=True` is the documented replacement for the deprecated
# `once=True` trigger: the query still stops by itself after consuming the
# files that were pending when it started, but it may split the backlog into
# several micro-batches instead of forcing everything into a single one.
# The checkpoint location is derived from `volume_path` set by the read step.
(raw_df.writeStream
    .format("delta")
    .option("checkpointLocation", f"{volume_path}/checkpoints/raw_sensor_data")
    .option("mergeSchema", "true")
    .outputMode("append")
    .trigger(availableNow=True)
    .table("arao.aerodemo.raw_sensor_data"))

In [0]:
from pyspark.sql.types import *

# Explicit schema for the maintenance-event CSV files; every column is nullable.
maintenance_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("aircraft_id", StringType()),
        ("event_date", DateType()),
        ("event_type", StringType()),
    )
])

# Landing folder reserved for maintenance-event files. The write step that
# follows also reads `volume_path` to build its checkpoint location.
volume_path = "/Volumes/arao/aerodemo/tmp/maintenance"

# Auto Loader stream over that folder, with all reader options passed in one
# call and the schema supplied explicitly.
events_df = (
    spark.readStream
    .format("cloudFiles")
    .options(**{
        "cloudFiles.format": "csv",
        "header": "true",
        "pathGlobFilter": "*.csv",  # only load CSVs
        "cloudFiles.schemaLocation": f"{volume_path}/schema/maintenance_events",  # track schema
    })
    .schema(maintenance_schema)
    .load(volume_path)
)

# Append the maintenance-event stream to the `arao.aerodemo.maintenance_events`
# Delta table.
#
# `availableNow=True` is the documented replacement for the deprecated
# `once=True` trigger: the query still stops by itself after consuming the
# files that were pending when it started, but it may split the backlog into
# several micro-batches instead of forcing everything into a single one.
# The checkpoint location is derived from `volume_path` set by the read step.
(events_df.writeStream
    .format("delta")
    .option("checkpointLocation", f"{volume_path}/checkpoints/maintenance_events")
    .option("mergeSchema", "true")
    .outputMode("append")
    .trigger(availableNow=True)
    .table("arao.aerodemo.maintenance_events"))