In [0]:
# Register the "param_data_source" text widget (empty default) and read its
# current value. NOTE(review): var_data_source is not referenced by any other
# cell visible in this notebook.
data_source_widget = "param_data_source"
dbutils.widgets.text(data_source_widget, "")
var_data_source = dbutils.widgets.get(data_source_widget)

In [0]:
# Register the "param_file_date" text widget and read its current value.
# The value is based on the name of the subfolder in blob storage and is used
# further down to pick the matching rows of the results data.
file_date_widget = "param_file_date"
dbutils.widgets.text(file_date_widget, "2021-03-21")
var_file_date = dbutils.widgets.get(file_date_widget)

# Include the shared configuration and utility notebooks

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/utils"

In [0]:
# Processed circuits data (Delta format). "location" is renamed to
# "circuit_location", the name the join and select cells below refer to.
# PROCESSED_FOLDER_PATH is not defined in this notebook — presumably it comes
# from the included configuration notebook.
circuits_path = f"{PROCESSED_FOLDER_PATH}/circuits"
circuits_df = (
    spark.read.format("delta")
    .load(circuits_path)
    .withColumnRenamed("location", "circuit_location")
)

In [0]:
# Processed constructors data (Delta format). The generic "name" column is
# renamed to "constructor_name", which is the name selected into final_df.
constructors_path = f"{PROCESSED_FOLDER_PATH}/constructors"
constructors_df = (
    spark.read.format("delta")
    .load(constructors_path)
    .withColumnRenamed("name", "constructor_name")
)

In [0]:
# Processed drivers data (Delta format). The generic column names are given a
# "driver_" prefix; these prefixed names are the ones selected into final_df.
drivers_path = f"{PROCESSED_FOLDER_PATH}/drivers"
drivers_df = (
    spark.read.format("delta")
    .load(drivers_path)
    .withColumnRenamed("number", "driver_number")
    .withColumnRenamed("name", "driver_name")
    .withColumnRenamed("nationality", "driver_nationality")
)


In [0]:
# Processed races data (Delta format). "name" becomes "race_name" and
# "race_timestamp" becomes "race_date" for use in the presentation output.
races_path = f"{PROCESSED_FOLDER_PATH}/races"
races_df = (
    spark.read.format("delta")
    .load(races_path)
    .withColumnRenamed("name", "race_name")
    .withColumnRenamed("race_timestamp", "race_date")
)

In [0]:
# Load the processed results data (Delta format) and keep only the rows whose
# file_date equals the notebook's file-date parameter.
#
# The parameter is compared through a Column expression instead of being
# spliced into a SQL string: a quote character in the widget value can then
# neither break the predicate nor change its meaning.
results_all_df = spark.read.format("delta").load(f"{PROCESSED_FOLDER_PATH}/results")

results_df = (
    results_all_df
    .filter(results_all_df["file_date"] == var_file_date)
    # Give the result-specific columns distinct names: the join cell below
    # matches result_race_id against race_id, and result_file_date is renamed
    # back to file_date when final_df is built.
    .withColumnRenamed("time", "race_time")
    .withColumnRenamed("race_id", "result_race_id")
    .withColumnRenamed("file_date", "result_file_date")
)

# Join the dataframes

In [0]:
# Pair every race with its circuit (inner join on circuit_id) and keep only
# the race identity/date columns plus the circuit location.
circuit_match = races_df.circuit_id == circuits_df.circuit_id
races_circuits_df = (
    races_df.join(circuits_df, circuit_match, "inner")
    .select(
        races_df.race_id,
        races_df.race_year,
        races_df.race_name,
        races_df.race_date,
        circuits_df.circuit_location,
    )
)

In [0]:
# Attach race/circuit, constructor and driver attributes to each result row.
# No join type is passed, so every join uses the DataFrame.join default.
race_match = results_df.result_race_id == races_circuits_df.race_id
constructor_match = results_df.constructor_id == constructors_df.constructor_id
driver_match = results_df.driver_id == drivers_df.driver_id

race_results_df = (
    results_df
    .join(races_circuits_df, race_match)
    .join(constructors_df, constructor_match)
    .join(drivers_df, driver_match)
)

In [0]:
from pyspark.sql.functions import date_trunc, from_utc_timestamp, current_timestamp

In [0]:
# Columns written to the presentation table. race_id and driver_name are kept
# because the merge condition further down matches rows on exactly those two.
presentation_columns = [
    "race_id",
    "race_year",
    "race_name",
    "race_date",
    "circuit_location",
    "driver_name",
    "driver_number",
    "driver_nationality",
    "constructor_name",
    "grid",
    "fastest_lap",
    "race_time",
    "points",
    "position",
    "result_file_date",
]

# created_date: current timestamp converted from UTC to Europe/Amsterdam and
# truncated to whole seconds.
created_at = date_trunc(
    "second",
    from_utc_timestamp(current_timestamp(), "Europe/Amsterdam"),
)

final_df = (
    race_results_df.select(*presentation_columns)
    .withColumn("created_date", created_at)
    .withColumnRenamed("result_file_date", "file_date")
)

In [0]:
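# Disabled earlier write path, kept for reference; the merge_delta_data call in
# the next cell writes f1_presentation.race_results instead.
# overwrite_partition(final_df, "f1_presentation", "race_results", "race_id")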

In [0]:
# Write final_df to f1_presentation.race_results. Target and source rows are
# matched on driver_name plus race_id.
# NOTE(review): merge_delta_data is not defined in this notebook — presumably
# it comes from the included utils notebook; the last argument looks like the
# partition column. Confirm against its definition.
merge_condition = "tgt.driver_name = src.driver_name AND tgt.race_id = src.race_id"
merge_delta_data(
    final_df,
    "f1_presentation",
    "race_results",
    PRESENTATION_FOLDER_PATH,
    merge_condition,
    "race_id",
)

In [0]:
%sql
-- Spot check: list the rows of the presentation table with race_year = 2021.
SELECT * FROM f1_presentation.race_results WHERE race_year = 2021