### Read the pit stops JSON file

In [0]:
# Expose the data-source name as a notebook text widget (empty by default)
# and capture whatever value the widget currently holds.
data_source_widget = "param_data_source"
dbutils.widgets.text(data_source_widget, "")
var_data_source = dbutils.widgets.get(data_source_widget)

In [0]:
# The file date selects which dated subfolder of the raw storage location is
# read below; it is exposed as a text widget with a default of 2021-03-28.
file_date_widget = "param_file_date"
dbutils.widgets.text(file_date_widget, "2021-03-28")
var_file_date = dbutils.widgets.get(file_date_widget)

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/utils"

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [0]:
# Explicit schema for pit_stops.json, declared as (field name, type, nullable)
# so the column list can be read at a glance.
pit_stop_fields = [
    ("raceId", IntegerType(), False),
    ("driverId", IntegerType(), True),
    ("stop", StringType(), True),
    ("lap", IntegerType(), True),
    ("time", StringType(), True),
    ("duration", StringType(), True),
    ("milliseconds", IntegerType(), True),
]

pit_stop_schema = StructType(
    [StructField(name, data_type, nullable) for name, data_type, nullable in pit_stop_fields]
)


# Location of the pit stops file inside the dated subfolder of the raw layer.
pit_stops_path = f"{RAW_FOLDER_PATH}/{var_file_date}/pit_stops.json"

# Read with the explicit schema; the "multiline" option is enabled because the
# reader is being asked to accept JSON records that span several lines.
pit_stops_df = (
    spark.read
    .schema(pit_stop_schema)
    .option("multiline", True)
    .json(pit_stops_path)
)


### Rename columns and add a new column

In [0]:
from pyspark.sql.functions import col, concat, current_timestamp, date_trunc, from_utc_timestamp

In [0]:
# Switch the camelCase source column names to snake_case.
renamed_pit_stops_df = (
    pit_stops_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
)

# add_ingestion_date is not defined in this notebook.
# NOTE(review): presumably it comes from ../includes/utils and appends an
# ingestion timestamp column -- confirm against that notebook.
final_pit_stops_df = add_ingestion_date(renamed_pit_stops_df)



### De-duplicate the dataframe

In [0]:
# A driver can pit several times in one race, so a pit stop is identified by
# (race_id, driver_id, stop) -- the same key the merge condition matches on.
# De-duplicating on (race_id, driver_id) alone would keep one arbitrary stop
# per driver per race and silently discard the others.
pit_stops_deduped_df = final_pit_stops_df.dropDuplicates(["race_id", "driver_id", "stop"])

### Write to the Delta table

In [0]:
# The data is merged into the target table rather than written with the older
# overwrite_partition(...) approach. Target rows (tgt) are matched to incoming
# rows (src) on driver, race and stop.
merge_condition = (
    "tgt.driver_id = src.driver_id"
    " AND tgt.race_id = src.race_id"
    " AND tgt.stop = src.stop"
)

merge_delta_data(
    pit_stops_deduped_df,
    "f1_processed",
    "pit_stops",
    PROCESSED_FOLDER_PATH,
    merge_condition,
    "race_id",
)

In [0]:
%sql 
-- Sanity check: number of rows per race in f1_processed.pit_stops,
-- listed with the highest race_id first.
SELECT race_id, COUNT(1)
FROM f1_processed.pit_stops
GROUP BY race_id
ORDER BY race_id DESC


In [0]:
# End the notebook run with the exit value "Success".
# NOTE(review): presumably a parent notebook or job reads this value -- confirm.
dbutils.notebook.exit("Success")

In [0]:
# Ad-hoc inspection of the processed pit stops, read back in Delta format.
processed_pit_stops_path = f"{PROCESSED_FOLDER_PATH}/pit_stops"
display(spark.read.format("delta").load(processed_pit_stops_path))