## Race Results

In [0]:
from pyspark.sql.functions import current_timestamp

In [0]:
# Notebook parameter: the file date of the load to process.
# "2021-03-21" is only the default shown in the widget; the value actually
# used is whatever the widget currently holds (or what the caller passed in).
dbutils.widgets.text("p_file_date", "2021-03-21")
file_date = dbutils.widgets.get("p_file_date")

# Echo the effective value so each run records which file date it processed.
file_date

Out[57]: '2021-03-28'

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# drivers_df = spark.read.parquet(f"{processed_folder_path}/drivers") \
#     .withColumnRenamed('number', 'driver_number') \
#     .withColumnRenamed('name', 'driver_name') \
#     .withColumnRenamed('nationality', 'driver_nationality')

In [0]:
# Processed drivers (Delta). The generic column names are prefixed with
# "driver_" so they stay unambiguous once joined with results/constructors,
# which carry their own `number` / `name` columns.
drivers_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/drivers")
    .withColumnRenamed("number", "driver_number")
    .withColumnRenamed("name", "driver_name")
    .withColumnRenamed("nationality", "driver_nationality")
)

In [0]:
# Processed constructors (Delta); the constructor's `name` is exposed as `team`.
constructors_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/constructors")
    .withColumnRenamed("name", "team")
)

processed


In [0]:
# Processed circuits (Delta); `location` becomes `circuit_location` so its
# origin is obvious in the joined output.
circuits_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/circuits")
    .withColumnRenamed("location", "circuit_location")
)

In [0]:
# Processed races (Delta). `name` -> `race_name` avoids a clash with other
# `name` columns; `race_timestamp` is published as `race_date`.
races_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/races")
    .withColumnRenamed("name", "race_name")
    .withColumnRenamed("race_timestamp", "race_date")
)

In [0]:
# Processed results (Delta), restricted to the rows belonging to the file date
# selected by the `p_file_date` widget.
#
# The predicate is a Column comparison instead of an f-string SQL snippet:
# the widget value is passed as a literal, so a value containing quotes (or
# any other SQL syntax) can neither break nor alter the filter.
results_all_df = spark.read.format("delta").load(f"{processed_folder_path}/results")

results_df = (
    results_all_df
    .filter(results_all_df["file_date"] == file_date)
    .withColumnRenamed("time", "race_time")
    # Renamed so the join key and the file date are clearly the results-side
    # columns once the race data (which has its own `race_id`) is joined in.
    .withColumnRenamed("race_id", "result_race_id")
    .withColumnRenamed("file_date", "result_file_date")
)

In [0]:
# Preview the results rows selected for the current file date (inspection only;
# nothing downstream depends on this cell).
display(results_df)

result_id,driver_id,constructor_id,number,grid,position,position_text,position_order,points,laps,race_time,milliseconds,fastest_lap,rank,fastest_lap_time,fastest_lap_speed,data_source,result_file_date,ingestion_timestamp,result_race_id
24966,1,131,44,2,1.0,1,1,25.0,56,1:32:03.897,5523897.0,44.0,4,1:34.015,207.235,Ergast API,2021-03-28,2025-02-27T10:00:07.967+0000,1052
24984,4,214,14,9,,R,19,0.0,32,\N,,31.0,17,1:36.063,202.816,Ergast API,2021-03-28,2025-02-27T10:00:07.967+0000,1052
24976,8,51,7,14,11.0,11,11,0.0,56,+88.864,5612761.0,45.0,14,1:35.192,204.672,Ergast API,2021-03-28,2025-02-27T10:00:07.967+0000,1052
24980,20,117,5,20,15.0,15,15,0.0,55,\N,,26.0,16,1:35.566,203.871,Ergast API,2021-03-28,2025-02-27T10:00:07.967+0000,1052
24970,815,9,11,0,5.0,5,5,10.0,56,+52.047,5575944.0,44.0,3,1:33.970,207.334,Ergast API,2021-03-28,2025-02-27T10:00:07.967+0000,1052
24972,817,1,3,6,7.0,7,7,6.0,56,+66.004,5589901.0,36.0,10,1:34.932,205.233,Ergast API,2021-03-28,2025-02-27T10:00:07.967+0000,1052
24968,822,131,77,3,3.0,3,3,16.0,56,+37.383,5561280.0,56.0,1,1:32.090,211.566,Ergast API,2021-03-28,2025-02-27T10:00:07.967+0000,1052
24967,830,9,33,1,2.0,2,2,18.0,56,+0.745,5524642.0,41.0,2,1:33.228,208.984,Ergast API,2021-03-28,2025-02-27T10:00:07.967+0000,1052
24973,832,6,55,8,8.0,8,8,4.0,56,+67.100,5590997.0,48.0,7,1:34.509,206.151,Ergast API,2021-03-28,2025-02-27T10:00:07.967+0000,1052
24978,839,214,31,16,13.0,13,13,0.0,55,\N,,33.0,15,1:35.250,204.548,Ergast API,2021-03-28,2025-02-27T10:00:07.967+0000,1052


In [0]:
# Step 1: enrich each race with the location of its circuit, keeping only the
# race-level columns required in the presentation table.
race_circuits_df = (
    races_df
    .join(circuits_df, races_df.circuit_id == circuits_df.circuit_id, "inner")
    .select(
        races_df.race_id,
        races_df.race_year,
        races_df.race_name,
        races_df.race_date,
        circuits_df.circuit_location,
    )
)

# Step 2: attach race, driver and constructor details to every result row.
# All joins are inner joins, so a result without a matching race, driver or
# constructor is dropped.
final_df = (
    results_df
    .join(race_circuits_df, results_df.result_race_id == race_circuits_df.race_id, "inner")
    .join(drivers_df, results_df.driver_id == drivers_df.driver_id, "inner")
    .join(constructors_df, results_df.constructor_id == constructors_df.constructor_id, "inner")
    .select(
        "race_id",
        "race_year",
        "race_name",
        "race_date",
        "circuit_location",
        "driver_name",
        "driver_number",
        "driver_nationality",
        "team",
        "grid",
        "fastest_lap",
        "race_time",
        "points",
        "position",
        "result_file_date",
    )
    # Audit column: when this presentation row was produced.
    .withColumn("created_date", current_timestamp())
    # Publish the results-side file date under its plain name again.
    .withColumnRenamed("result_file_date", "file_date")
)

In [0]:
#display(final_df.filter("race_year == 2020 and race_name == 'Abu Dhabi Grand Prix'").orderBy(final_df.points.desc()))
#display(final_df)

In [0]:
#final_df.write.mode("append").parquet(f"{presentation_folder_path}/race_results")
#database
#final_df.write.mode("overwrite").format('parquet').saveAsTable("f1_presentation.race_results")

#save_table(final_df, "f1_presentation", "race_results", "race_id")

# Target (tgt) and incoming (src) rows are matched on race + driver name.
# TODO: match on a driver id instead of the name; `final_df` does not carry
# one today, so the name is used as the key.
merge_condition = " AND ".join(
    (
        "tgt.race_id = src.race_id",
        "tgt.driver_name = src.driver_name",
    )
)

# `merge_table` comes from ../includes/common_functions; the positional
# arguments are presumably (dataframe, database, table, partition column,
# merge condition) -- confirm against its definition.
merge_table(final_df, "f1_presentation", "race_results", "race_id", merge_condition)

In [0]:
%sql
--DROP TABLE f1_presentation.race_results