In [0]:
# Databricks Notebook: 03_Gold_Layer_Aggregations_EV
# Purpose: Create business-ready analytics tables from EV Silver layer

# ------------------------------------------------------------------------------
# CELL 1: Setup
# ------------------------------------------------------------------------------
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Delta locations used by this notebook: the Silver input table and the
# root folder under which every Gold table is written.
silver_path = "/mnt/cti/silver/ev_data"
gold_base = "/mnt/cti/gold/ev_data"

# Emit the three-line setup banner in a single call (one line per item).
print(
    "✅ Setup complete",
    f"📂 Silver: {silver_path}",
    f"📂 Gold base: {gold_base}",
    sep="\n",
)

# Helper: quick column-exists check
def has_col(df, col_name: str) -> bool:
    """Report whether ``col_name`` appears in ``df.columns``.

    The check is an exact membership test against the entries of
    ``df.columns``; no case folding or trimming is applied.
    """
    available = df.columns
    return col_name in available

# ------------------------------------------------------------------------------
# CELL 2: Read Silver layer
# ------------------------------------------------------------------------------
print("\n📖 Reading Silver layer...")

# Load the Silver Delta table from silver_path and count its rows once so the
# total can be reported below.
_delta_reader = spark.read.format("delta")
df_silver = _delta_reader.load(silver_path)
silver_count = df_silver.count()
print(f"✅ Loaded {silver_count} EV records from Silver")

# Show a ten-row preview of the loaded data.
print("\n📋 Sample Silver data:")
_silver_preview = df_silver.limit(10)
display(_silver_preview)

# Dump the column names and types for reference.
print("\n📜 Silver schema:")
df_silver.printSchema()

# ------------------------------------------------------------------------------
# CELL 3: Daily Ingestion Summary
# ------------------------------------------------------------------------------
print("\n📊 Creating Daily Ingestion Summary...")

# Date-based stats are derived from ingestion_timestamp; without that column
# the summary cannot be built, so the cell reports the skip instead of
# silently doing nothing.
if has_col(df_silver, "ingestion_timestamp"):
    # One row per ingestion date with record counts and range statistics.
    # No orderBy here: the result is written partitioned by ingestion_date,
    # so a global sort before the write only adds a shuffle.
    # NOTE(review): the aliases say "km", but in the sample rows
    # range_for_analysis mirrors Electric_Range — confirm the unit upstream.
    df_daily_agg = (
        df_silver
        .withColumn("ingestion_date", F.to_date("ingestion_timestamp"))
        .groupBy("ingestion_date")
        .agg(
            F.count("*").alias("total_vehicles"),
            F.countDistinct("record_id").alias("unique_records"),
            F.avg("range_for_analysis").alias("avg_range_km"),
            F.min("range_for_analysis").alias("min_range_km"),
            F.max("range_for_analysis").alias("max_range_km"),
        )
    )

    daily_path = f"{gold_base}/daily_ingestion_summary"

    # Full overwrite of the Gold table, one partition per ingestion date.
    (
        df_daily_agg
        .write
        .format("delta")
        .mode("overwrite")
        .partitionBy("ingestion_date")
        .save(daily_path)
    )

    # Read the (small) Gold table back for reporting. Calling count() and
    # display() on the lazy aggregation would re-scan Silver and recompute
    # the groupBy once per action; reading the written table avoids that.
    df_daily = (
        spark.read.format("delta").load(daily_path)
        .orderBy(F.col("ingestion_date").desc())
    )

    print(f"✅ Created Daily Ingestion Summary: {df_daily.count()} records")
    display(df_daily)
else:
    print(
        "⚠️ Skipping Daily Ingestion Summary: "
        "'ingestion_timestamp' column not found in Silver"
    )

✅ Setup complete
📂 Silver: /mnt/cti/silver/ev_data
📂 Gold base: /mnt/cti/gold/ev_data

📖 Reading Silver layer...
✅ Loaded 73545 EV records from Silver

📋 Sample Silver data:


Model_Year,Make,Model,ev_group,Electric_Range,range_for_analysis,range_final,range_was_imputed,range_is_zero,latitude,longitude,ingestion_timestamp,source,file_name,year,month,day,record_id,Make_Clean,Model_Clean,Electric_Range_Int,range_bucket_km,latitude_valid,longitude_valid
2019,HYUNDAI,SONATA,Other_EV,28,28,28,False,False,47.75448,-122.15545,2025-11-26T13:23:54.593841Z,EV_Range_Data,/Volumes/dbw_cti_processing/default/raw_data/ev_range_analysis_subset.csv,2025,11,26,001922510feb59b427dcdf6a7fb12217f6f9954c85ea71e4653c064b1ec8e0b1,HYUNDAI,SONATA,28,1–100 km,True,True
2018,MINI,COUNTRYMAN,Other_EV,12,12,12,False,False,45.63248,-122.67156,2025-11-26T13:23:54.593841Z,EV_Range_Data,/Volumes/dbw_cti_processing/default/raw_data/ev_range_analysis_subset.csv,2025,11,26,06e16d875f8d38779c8a64c3941381ba43686b42cb8b4b1d000866d17c028379,MINI,COUNTRYMAN,12,1–100 km,True,True
2017,MERCEDES-BENZ,GLE-CLASS,Other_EV,10,10,10,False,False,47.41185,-122.17743,2025-11-26T13:23:54.593841Z,EV_Range_Data,/Volumes/dbw_cti_processing/default/raw_data/ev_range_analysis_subset.csv,2025,11,26,07671080564b004f7b0043b65cd271bbdad98ca5131b5c99d71d04fd0d9e0184,MERCEDES-BENZ,GLE-CLASS,10,1–100 km,True,True
2023,HYUNDAI,SANTA FE,Other_EV,30,30,30,False,False,47.75781,-122.3175,2025-11-26T13:23:54.593841Z,EV_Range_Data,/Volumes/dbw_cti_processing/default/raw_data/ev_range_analysis_subset.csv,2025,11,26,096d54e0649fb3bc20e585f396e6e7ca0aaa27797d78dc71bfcc00c63d1c9fc1,HYUNDAI,SANTA FE,30,1–100 km,True,True
2024,TOYOTA,PRIUS PRIME (PHEV),Other_EV,39,39,39,False,False,47.88936,-122.27557,2025-11-26T13:23:54.593841Z,EV_Range_Data,/Volumes/dbw_cti_processing/default/raw_data/ev_range_analysis_subset.csv,2025,11,26,0bac5b643d5dc38fa75784f1cb72e826a964aabdfb3fec29475b0ffb828fd450,TOYOTA,PRIUS PRIME (PHEV),39,1–100 km,True,True
2019,CHEVROLET,BOLT EV,Other_EV,238,238,238,False,False,45.63201,-122.62934,2025-11-26T13:23:54.593841Z,EV_Range_Data,/Volumes/dbw_cti_processing/default/raw_data/ev_range_analysis_subset.csv,2025,11,26,0bf9df0df1d9ec1180423f50f71f4dd4ac3a9c2a253ce6939937a7065e8deb96,CHEVROLET,BOLT EV,238,201–300 km,True,True
2015,FORD,FUSION,Other_EV,19,19,19,False,False,47.13959,-122.27575,2025-11-26T13:23:54.593841Z,EV_Range_Data,/Volumes/dbw_cti_processing/default/raw_data/ev_range_analysis_subset.csv,2025,11,26,0c69a0d53cf5f1008aae5bac76fdf5babc3329f42759c46f8b8ec71f7bc3b7c2,FORD,FUSION,19,1–100 km,True,True
2017,KIA,SOUL,Other_EV,93,93,93,False,False,47.55584,-122.12096,2025-11-26T13:23:54.593841Z,EV_Range_Data,/Volumes/dbw_cti_processing/default/raw_data/ev_range_analysis_subset.csv,2025,11,26,0dad70285323d77016b76d75cd3666392f143da9fe3c9561627c2613b8e24228,KIA,SOUL,93,1–100 km,True,True
2020,TESLA,MODEL Y,Other_EV,291,291,291,False,False,45.72902,-121.52357,2025-11-26T13:23:54.593841Z,EV_Range_Data,/Volumes/dbw_cti_processing/default/raw_data/ev_range_analysis_subset.csv,2025,11,26,0e3f1fca9c5ea2200943df0129219ea0dc8159034e6e9c2cbcfd785495de4fb5,TESLA,MODEL Y,291,201–300 km,True,True
2023,AUDI,Q5 E,Other_EV,23,23,23,False,False,47.65699,-117.1748,2025-11-26T13:23:54.593841Z,EV_Range_Data,/Volumes/dbw_cti_processing/default/raw_data/ev_range_analysis_subset.csv,2025,11,26,11318c6b383cd0764791dd7354dd12d513fe376e824770206f6987c5a9766971,AUDI,Q5 E,23,1–100 km,True,True



📜 Silver schema:
root
 |-- Model_Year: integer (nullable = true)
 |-- Make: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- ev_group: string (nullable = true)
 |-- Electric_Range: integer (nullable = true)
 |-- range_for_analysis: integer (nullable = true)
 |-- range_final: integer (nullable = true)
 |-- range_was_imputed: boolean (nullable = true)
 |-- range_is_zero: boolean (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)
 |-- source: string (nullable = true)
 |-- file_name: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- record_id: string (nullable = true)
 |-- Make_Clean: string (nullable = true)
 |-- Model_Clean: string (nullable = true)
 |-- Electric_Range_Int: integer (nullable = true)
 |-- range_bucket_km: string (nullable = true)
 |-- latitude_valid: boolean (nullable = true)

ingestion_date,total_vehicles,unique_records,avg_range_km,min_range_km,max_range_km
2025-11-26,73545,73545,83.23166768645048,1,337
