
## Read all the required data

In [0]:
%run "../includes/configuration"

In [0]:
# Load the processed drivers parquet data and give the generic columns
# (number, name, nationality) driver-specific names.
drivers_df = (
    spark.read
    .format("parquet")
    .load(f"{processed_folder_path}/drivers")
    .withColumnRenamed("number", "driver_number")
    .withColumnRenamed("name", "driver_name")
    .withColumnRenamed("nationality", "driver_nationality")
)

In [0]:
# Load the processed constructors parquet data; the constructor's "name"
# column is exposed as "team".
constructors_df = (
    spark.read
    .format("parquet")
    .load(f"{processed_folder_path}/constructors")
    .withColumnRenamed("name", "team")
)

In [0]:
# Load the processed circuits parquet data; "location" becomes
# "circuit_location".
circuits_df = (
    spark.read
    .format("parquet")
    .load(f"{processed_folder_path}/circuits")
    .withColumnRenamed("location", "circuit_location")
)

In [0]:
# Load the processed races parquet data, renaming "name" to "race_name"
# and "race_timestamp" to "race_date".
races_df = (
    spark.read
    .format("parquet")
    .load(f"{processed_folder_path}/races")
    .withColumnRenamed("name", "race_name")
    .withColumnRenamed("race_timestamp", "race_date")
)

In [0]:
# Load the processed results parquet data; "time" becomes "race_time".
results_df = (
    spark.read
    .format("parquet")
    .load(f"{processed_folder_path}/results")
    .withColumnRenamed("time", "race_time")
)

## Join circuits to races

In [0]:
# Inner-join races to circuits on circuit_id and keep only the race
# identifiers/attributes plus the circuit location.
race_circuits_df = (
    races_df
    .join(
        circuits_df,
        races_df["circuit_id"] == circuits_df["circuit_id"],
        "inner",
    )
    .select(
        races_df["race_id"],
        races_df["race_year"],
        races_df["race_name"],
        races_df["race_date"],
        circuits_df["circuit_location"],
    )
)


## Join results to all other dataframes

In [0]:
# Attach race/circuit, driver and constructor attributes to every result row.
# The join type is spelled out here; "inner" is also what Spark uses when no
# join type is given.
race_results_df = (
    results_df
    .join(
        race_circuits_df,
        results_df["race_id"] == race_circuits_df["race_id"],
        "inner",
    )
    .join(
        drivers_df,
        results_df["driver_id"] == drivers_df["driver_id"],
        "inner",
    )
    .join(
        constructors_df,
        results_df["constructor_id"] == constructors_df["constructor_id"],
        "inner",
    )
)

In [0]:
from pyspark.sql.functions import current_timestamp

In [0]:
# Project the columns written to the presentation table and add a
# "created_date" column populated from current_timestamp().
final_df = (
    race_results_df
    .select(
        # race / circuit attributes
        "race_year",
        "race_name",
        "race_date",
        "circuit_location",
        # driver and team attributes
        "driver_name",
        "driver_number",
        "driver_nationality",
        "team",
        # result attributes
        "grid",
        "fastest_lap",
        "race_time",
        "points",
        "position",
    )
    .withColumn("created_date", current_timestamp())
)

In [0]:
# Spot-check: show the 2020 Abu Dhabi Grand Prix rows, highest points first.
display(
    final_df
    .filter("race_year == 2020 and race_name == 'Abu Dhabi Grand Prix'")
    .orderBy(final_df["points"].desc())
)

race_year,race_name,race_date,circuit_location,driver_name,driver_number,driver_nationality,team,grid,fastest_lap,race_time,points,position,created_date
2020,Abu Dhabi Grand Prix,2020-12-13T13:10:00Z,Abu Dhabi,Max Verstappen,33,Dutch,Red Bull,1,14,1:36:28.645,25.0,1.0,2024-08-14T02:57:32.275Z
2020,Abu Dhabi Grand Prix,2020-12-13T13:10:00Z,Abu Dhabi,Valtteri Bottas,77,Finnish,Mercedes,2,40,+15.976,18.0,2.0,2024-08-14T02:57:32.275Z
2020,Abu Dhabi Grand Prix,2020-12-13T13:10:00Z,Abu Dhabi,Lewis Hamilton,44,British,Mercedes,3,37,+18.415,15.0,3.0,2024-08-14T02:57:32.275Z
2020,Abu Dhabi Grand Prix,2020-12-13T13:10:00Z,Abu Dhabi,Alexander Albon,23,Thai,Red Bull,5,42,+19.987,12.0,4.0,2024-08-14T02:57:32.275Z
2020,Abu Dhabi Grand Prix,2020-12-13T13:10:00Z,Abu Dhabi,Lando Norris,4,British,McLaren,4,53,+1:00.729,10.0,5.0,2024-08-14T02:57:32.275Z
2020,Abu Dhabi Grand Prix,2020-12-13T13:10:00Z,Abu Dhabi,Carlos Sainz,55,Spanish,McLaren,6,48,+1:05.662,8.0,6.0,2024-08-14T02:57:32.275Z
2020,Abu Dhabi Grand Prix,2020-12-13T13:10:00Z,Abu Dhabi,Daniel Ricciardo,3,Australian,Renault,11,55,+1:13.748,7.0,7.0,2024-08-14T02:57:32.275Z
2020,Abu Dhabi Grand Prix,2020-12-13T13:10:00Z,Abu Dhabi,Pierre Gasly,10,French,AlphaTauri,9,53,+1:29.718,4.0,8.0,2024-08-14T02:57:32.275Z
2020,Abu Dhabi Grand Prix,2020-12-13T13:10:00Z,Abu Dhabi,Esteban Ocon,31,French,Renault,10,47,+1:41.069,2.0,9.0,2024-08-14T02:57:32.275Z
2020,Abu Dhabi Grand Prix,2020-12-13T13:10:00Z,Abu Dhabi,Lance Stroll,18,Canadian,Racing Point,8,41,+1:42.738,1.0,10.0,2024-08-14T02:57:32.275Z


In [0]:
# Persist the result as a parquet-backed table, replacing any existing
# contents of f1_presentation.race_results.
(
    final_df.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("f1_presentation.race_results")
)