In [0]:
from pyspark.sql.functions import col, count, when

# --- 1. Configuration ---
CATALOG_NAME = "keshcatalog"
GOLD_SCHEMA_NAME = "gold"

# The denormalized silver-layer table that the gold aggregations are built from.
BASE_TABLE = f"{CATALOG_NAME}.silver.race_results_analysis"

# --- 2. Read the Base Silver Table ---
print(f"--- Reading race_results_analysis table for aggregation: {BASE_TABLE} ---")
try:
    analysis_df = spark.read.table(BASE_TABLE)
    print("✅ race_results_analysis table read successfully.")
except Exception as e:
    print(f"❌ ERROR reading table: {e}")
    # Stop the run here. Swallowing the error would leave `analysis_df`
    # undefined, and the next cell would fail with an unrelated NameError
    # that hides the real cause.
    raise

# Aggregation 1: Dominant Drivers
We rank drivers by their total number of race wins (finishing position = 1); the most "dominant" drivers are those with the most wins.



In [0]:
print("\n--- Aggregating data to find dominant drivers ---")

# Keep only the race-winning rows (position == 1), count them per driver,
# and rank drivers from most to fewest wins.
winning_rows_by_driver = analysis_df.where(col("position") == 1).groupBy("driver_name")
dominant_drivers_df = (
    winning_rows_by_driver
    .agg(count("*").alias("total_wins"))
    .orderBy(col("total_wins").desc())
)

# Persist the ranking as a Delta table in the gold schema, replacing any previous version.
dominant_drivers_table = f"{CATALOG_NAME}.{GOLD_SCHEMA_NAME}.dominant_drivers"
print(f"Writing aggregated table: {dominant_drivers_table}")
(
    dominant_drivers_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(dominant_drivers_table)
)
print(f"✅ Success! Created dominant drivers table.")

In [0]:
# Show the per-driver win counts (driver_name, total_wins) in the notebook output.
display(dominant_drivers_df)

driver_name,total_wins
Lewis Hamilton,95
Michael Schumacher,91
Sebastian Vettel,53
Alain Prost,51
Ayrton Senna,41
Fernando Alonso,32
Nigel Mansell,31
Jackie Stewart,27
Niki Lauda,25
Jim Clark,25


# Aggregation 2: Dominant Teams (Constructors)
We rank teams (constructors) by their total number of race wins (finishing position = 1); the most "dominant" teams are those with the most wins.



In [0]:
print("\n--- Aggregating data to find dominant teams ---")

# Keep only the race-winning rows (position == 1), count them per constructor,
# and rank constructors from most to fewest wins.
winning_rows_by_constructor = analysis_df.where(col("position") == 1).groupBy("constructor_name")
dominant_constructors_df = (
    winning_rows_by_constructor
    .agg(count("*").alias("total_wins"))
    .orderBy(col("total_wins").desc())
)

# Persist the ranking as a Delta table in the gold schema, replacing any previous version.
dominant_constructors_table = f"{CATALOG_NAME}.{GOLD_SCHEMA_NAME}.dominant_constructors"
print(f"Writing aggregated table: {dominant_constructors_table}")
(
    dominant_constructors_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(dominant_constructors_table)
)
print(f"✅ Success! Created dominant teams table.")

In [0]:
# Show the per-constructor win counts (constructor_name, total_wins) in the notebook output.
display(dominant_constructors_df)

constructor_name,total_wins
Ferrari,239
McLaren,178
Mercedes,115
Williams,114
Red Bull,64
Team Lotus,45
Renault,35
Benetton,27
Tyrrell,23
Brabham,23


In [0]:
from pyspark.sql import functions as F

# Per-driver, per-race lap statistics aggregated from the silver lap-level table.
driver_perf_gold = (
    spark.table("keshcatalog.silver.driver_lap_performance")
    .groupBy("year", "race_id", "race_name", "driver_id", "driver_name")
    .agg(
        F.avg("lap_time_ms").alias("avg_lap_time_ms"),
        F.min("lap_time_ms").alias("fastest_lap_time_ms"),
        F.max("lap_time_ms").alias("slowest_lap_time_ms"),
        F.stddev("lap_time_ms").alias("lap_time_stddev_ms"),
        F.countDistinct("lap").alias("laps_completed"),
        # Number of lap rows on which the driver was recorded in position 1.
        F.sum(F.when(F.col("position") == 1, 1).otherwise(0)).alias("laps_led"),
        # F.first / F.last return whichever row happens to arrive first/last after
        # the groupBy shuffle, so their result is not deterministic. Select the
        # position on the lowest / highest `lap` value instead.
        # NOTE(review): assumes `lap` is the lap number within the race, so
        # "start_position" is the position on the first recorded lap — confirm.
        F.min_by("position", "lap").alias("start_position"),
        F.max_by("position", "lap").alias("end_position"),
    )
    # Positive value = places gained between the first and last recorded lap.
    .withColumn("position_change", F.col("start_position") - F.col("end_position"))
)

driver_perf_gold.write.mode("overwrite").saveAsTable("keshcatalog.gold.driver_performance")

In [0]:
# Summarise pit-stop activity for each driver in each race.
pit_stops_by_driver_race = (
    spark.table("keshcatalog.silver.pit_stop_analysis")
    .groupBy("year", "race_id", "race_name", "driver_id", "driver_name")
)

# One output column per metric; the list order is the column order in the result.
pit_stop_metrics = [
    F.count("stop").alias("num_pit_stops"),
    F.avg("pit_stop_duration_seconds").alias("avg_pit_stop_duration_s"),
    F.min("pit_stop_duration_seconds").alias("min_pit_stop_duration_s"),
    F.max("pit_stop_duration_seconds").alias("max_pit_stop_duration_s"),
    F.min("lap").alias("first_pit_lap"),
    F.max("lap").alias("last_pit_lap"),
    F.sum("pit_stop_duration_seconds").alias("total_pit_time_s"),
]

pitstop_perf_gold = pit_stops_by_driver_race.agg(*pit_stop_metrics)

# Preview the result, then overwrite the gold table with it.
display(pitstop_perf_gold)
pitstop_perf_gold.write.mode("overwrite").saveAsTable("keshcatalog.gold.pitstop_performance")


In [0]:
# Race-level gold table: one row per result from the silver analysis table,
# enriched with podium/win flags and the number of places gained.
race_results_gold = (
    spark.table("keshcatalog.silver.race_results_analysis")
    # 1 when the finishing position is 3 or better, otherwise 0 (including null positions).
    .withColumn("podium_flag", F.when(F.col("position") <= 3, 1).otherwise(0))
    # 1 when the finishing position is exactly 1, otherwise 0.
    .withColumn("win_flag", F.when(F.col("position") == 1, 1).otherwise(0))
    # Places gained from grid to finish: positive = moved forward. Computed as
    # grid - position so the sign matches the start - end convention used for
    # gold.driver_performance and the "avg_position_gain" metric derived from
    # this column in the season-level driver aggregation.
    # NOTE(review): if `grid` uses a sentinel such as 0 for pit-lane starts,
    # this value will be misleading for those rows — confirm against the data.
    .withColumn("position_change", F.col("grid") - F.col("position"))
    .select(
        "race_year",
        "race_id",
        "race_name",
        "circuit_name",
        "circuit_country",
        "driver_id",
        "driver_name",
        "driver_nationality",
        "constructor_name",
        "constructor_nationality",
        F.col("grid").alias("grid_position"),
        F.col("position").alias("finish_position"),
        "position_change",
        "points",
        "laps",
        "race_time",
        "fastest_lap_time",
        "fastest_lap_speed",
        "podium_flag",
        "win_flag",
    )
)

# Preview the result, then overwrite the gold table with it.
display(race_results_gold)
race_results_gold.write.mode("overwrite").saveAsTable("keshcatalog.gold.race_results")


In [0]:
# Season-level driver summary: one row per (season, driver), built from gold.race_results.
driver_season_keys = ["race_year", "driver_id", "driver_name", "driver_nationality"]

# One output column per metric; the list order is the column order in the result.
driver_season_metrics = [
    F.sum("points").alias("total_points"),
    F.sum("win_flag").alias("races_won"),
    F.sum("podium_flag").alias("podiums"),
    F.avg("finish_position").alias("avg_finish_position"),
    F.avg("grid_position").alias("avg_grid_position"),
    # Mean of position_change; its sign follows whatever convention
    # gold.race_results uses for that column.
    F.avg("position_change").alias("avg_position_gain"),
    F.countDistinct("race_id").alias("races_entered"),
]

season_driver_perf_gold = (
    spark.table("keshcatalog.gold.race_results")
    .groupBy(*driver_season_keys)
    .agg(*driver_season_metrics)
)

season_driver_perf_gold.write.mode("overwrite").saveAsTable("keshcatalog.gold.driver_season_performance")


In [0]:
# Season-level constructor summary: one row per (season, constructor),
# built from gold.race_results.
constructor_season_keys = ["race_year", "constructor_name", "constructor_nationality"]

# One output column per metric; the list order is the column order in the result.
constructor_season_metrics = [
    F.sum("points").alias("total_points"),
    F.sum("win_flag").alias("wins"),
    F.sum("podium_flag").alias("podiums"),
    F.avg("finish_position").alias("avg_finish_position"),
    F.countDistinct("race_id").alias("races_entered"),
]

season_constructor_perf_gold = (
    spark.table("keshcatalog.gold.race_results")
    .groupBy(*constructor_season_keys)
    .agg(*constructor_season_metrics)
)

season_constructor_perf_gold.write.mode("overwrite").saveAsTable("keshcatalog.gold.constructor_season_performance")
