In [0]:
# Register the "param_data_source" text widget (empty default) and read its current value.
_data_source_widget = "param_data_source"
dbutils.widgets.text(_data_source_widget, "")
var_data_source = dbutils.widgets.get(_data_source_widget)

In [0]:
# Register the "param_file_date" text widget and read its current value.
# The default is based on the name of the subfolder in blob storage.
_file_date_widget = "param_file_date"
dbutils.widgets.text(_file_date_widget, "2021-03-21")
var_file_date = dbutils.widgets.get(_file_date_widget)

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/utils"

# Find race years for which the data is to be reprocessed

In [0]:
from pyspark.sql.functions import col

# Load the race_results Delta table and keep only the rows whose file_date matches the
# file date supplied through the notebook widget.
# The comparison is built as a Column expression instead of interpolating the widget value
# into a SQL string, so the value is always treated as a literal: a stray quote in the
# parameter can no longer break or alter the predicate.
race_results_df = (
    spark.read.format("delta")
    .load(f"{PRESENTATION_FOLDER_PATH}/race_results")
    .filter(col("file_date") == var_file_date)
)

In [0]:
# Derive the race_year values from the rows loaded for this file date; the result is later
# passed to Column.isin() to select the seasons whose standings are recomputed.
# NOTE(review): df_column_to_list is not defined in this notebook (presumably it comes from
# ../includes/utils and returns the distinct values of the named column as a Python list)
# — confirm against the helper.
race_year_list = df_column_to_list(race_results_df, "race_year")

# DRIVER STANDINGS

In [0]:
from pyspark.sql.functions import col

# Re-read race_results, this time keeping every row that belongs to one of the seasons in
# race_year_list, so the standings are recomputed over complete seasons rather than over
# the rows of a single file date.
race_results_df = (
    spark.read
    .format("delta")
    .load(f"{PRESENTATION_FOLDER_PATH}/race_results")
    .where(col("race_year").isin(race_year_list))
)

In [0]:
# Spark's sum is imported under an alias so that it does not shadow Python's built-in
# sum() for the rest of the notebook namespace.
from pyspark.sql.functions import col, count, when
from pyspark.sql.functions import sum as spark_sum

# One row per (race_year, driver_name, driver_nationality) with the summed points and the
# number of rows where position == 1.
# when() without otherwise() yields NULL for non-matching rows and count() skips NULLs,
# so total_wins counts only the rows with position 1.
driver_standings_df = (
    race_results_df
    .groupBy("race_year", "driver_name", "driver_nationality")
    .agg(
        spark_sum("points").alias("total_points"),
        count(when(col("position") == 1, True)).alias("total_wins"),
    )
)

In [0]:
from pyspark.sql import Window
from pyspark.sql.functions import asc, desc, rank

# Rank drivers inside each season: highest total_points first, total_wins as tie-breaker.
driver_rank_spec = (
    Window
    .partitionBy("race_year")
    .orderBy(desc("total_points"), desc("total_wins"))
)

# Attach the rank as a new "rank" column on the aggregated driver standings.
final_df = driver_standings_df.withColumn("rank", rank().over(driver_rank_spec))

In [0]:
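# Disabled: the next cell writes driver_standings with merge_delta_data instead.
# NOTE(review): this call passes race_results_df rather than the ranked final_df — correct that before re-enabling.
# overwrite_partition(race_results_df, "f1_presentation", "driver_standings", "race_year")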

In [0]:
# Match key for the merge: a target row and a source row are the same record when both
# driver_name and race_year agree.
merge_condition = "tgt.driver_name = src.driver_name AND tgt.race_year = src.race_year"

# Write the ranked driver standings into f1_presentation.driver_standings.
# NOTE(review): merge_delta_data is not defined in this notebook (presumably from
# ../includes/utils); the trailing "race_year" looks like the partition column — confirm.
merge_delta_data(
    final_df,
    "f1_presentation",
    "driver_standings",
    PRESENTATION_FOLDER_PATH,
    merge_condition,
    "race_year",
)

In [0]:
%sql
-- Spot-check: show the driver standings rows stored for the 2021 season.
SELECT *
  FROM f1_presentation.driver_standings
 WHERE race_year = 2021

# CONSTRUCTOR STANDINGS

In [0]:
# Spark's sum is imported under an alias so that it does not shadow Python's built-in
# sum() for the rest of the notebook namespace.
from pyspark.sql.functions import col, count, when
from pyspark.sql.functions import sum as spark_sum

# One row per (race_year, constructor_name) with the summed points and the number of rows
# where position == 1.
# when() without otherwise() yields NULL for non-matching rows and count() skips NULLs,
# so total_wins counts only the rows with position 1.
constructor_standings_df = (
    race_results_df
    .groupBy("race_year", "constructor_name")
    .agg(
        spark_sum("points").alias("total_points"),
        count(when(col("position") == 1, True)).alias("total_wins"),
    )
)

In [0]:
from pyspark.sql import Window
from pyspark.sql.functions import asc, desc, rank

# Rank constructors inside each season: highest total_points first, total_wins as tie-breaker.
constructor_rank_spec = (
    Window
    .partitionBy("race_year")
    .orderBy(desc("total_points"), desc("total_wins"))
)

# Attach the rank as a new "rank" column on the aggregated constructor standings.
final_df = constructor_standings_df.withColumn("rank", rank().over(constructor_rank_spec))

In [0]:
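# Disabled: the next cell writes constructor_standings with merge_delta_data instead.
# NOTE(review): this call passes race_results_df rather than the ranked final_df — correct that before re-enabling.
# overwrite_partition(race_results_df, "f1_presentation", "constructor_standings", "race_year")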

In [0]:
# Match key for the merge: a target row and a source row are the same record when both
# constructor_name and race_year agree.
merge_condition = "tgt.constructor_name = src.constructor_name AND tgt.race_year = src.race_year"

# Write the ranked constructor standings into f1_presentation.constructor_standings.
# NOTE(review): merge_delta_data is not defined in this notebook (presumably from
# ../includes/utils); the trailing "race_year" looks like the partition column — confirm.
merge_delta_data(
    final_df,
    "f1_presentation",
    "constructor_standings",
    PRESENTATION_FOLDER_PATH,
    merge_condition,
    "race_year",
)

In [0]:
%sql
-- Spot-check: show the constructor standings rows stored for the 2021 season.
SELECT *
  FROM f1_presentation.constructor_standings
 WHERE race_year = 2021