### Read a set of CSV files

In [0]:
# Notebook parameter: name of the data source, empty string by default.
# NOTE(review): var_data_source is not referenced again in the visible part of
# this notebook — confirm whether it should be stamped onto the output.
dbutils.widgets.text("param_data_source", "")
var_data_source = dbutils.widgets.get("param_data_source")

In [0]:
# Notebook parameter: date of the file drop to ingest. The value is used below
# as the dated subfolder name under RAW_FOLDER_PATH when reading lap_times.
dbutils.widgets.text("param_file_date", "2021-03-28") # based on the name of the subfolder in blob storage
var_file_date = dbutils.widgets.get("param_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/utils"

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType 

In [0]:
# Explicit schema for the raw lap_times CSV files, so column types are fixed
# up front instead of being inferred. Field names follow the raw camelCase
# columns; they are renamed to snake_case later in the notebook.
# NOTE(review): no `header` option is set on the reader, so the files are
# presumably header-less and matched to this schema by position — confirm.
lap_time_schema = StructType(
    [
        StructField("raceId", IntegerType(), nullable=False),
        StructField("driverId", IntegerType(), nullable=True),
        StructField("lap", IntegerType(), nullable=True),
        StructField("position", IntegerType(), nullable=True),
        StructField("time", StringType(), nullable=True),
        StructField("milliseconds", IntegerType(), nullable=True),
    ]
)

# Point the reader at the dated folder (not a single file) so every CSV part
# file inside it is loaded into one DataFrame.
lap_times_df = (
    spark.read
    .schema(lap_time_schema)
    .csv(f"{RAW_FOLDER_PATH}/{var_file_date}/lap_times")
)


In [0]:
# Sanity check: number of rows read from the lap_times folder.
# count() is a Spark action, so this cell actually scans the input files.
lap_times_df.count()

### rename columns and add new column

In [0]:
from pyspark.sql.functions import col, concat, current_timestamp, date_trunc, from_utc_timestamp

In [0]:
# Rename the raw camelCase key columns to snake_case, then hand the result to
# add_ingestion_date (a helper pulled in by the %run includes above;
# presumably it appends an ingestion timestamp column — confirm).
final_lap_times_df = add_ingestion_date(
    lap_times_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("raceId", "race_id")
)



### Write to the Delta table (merge into f1_processed.lap_times)

In [0]:
# Earlier approach, kept for reference:
#   overwrite_partition(final_lap_times_df, "f1_processed", "lap_times", "race_id")

# A lap time row is identified by (driver_id, race_id, lap); the condition
# matches incoming rows (src) against existing rows (tgt) on that key.
merge_condition = (
    "tgt.driver_id = src.driver_id "
    "AND tgt.race_id = src.race_id "
    "AND tgt.lap = src.lap"
)

# merge_delta_data is a helper pulled in by the %run includes above.
# NOTE(review): the trailing "race_id" argument is presumably the partition
# column of f1_processed.lap_times — confirm against the helper's signature.
merge_delta_data(
    final_lap_times_df,
    "f1_processed",
    "lap_times",
    PROCESSED_FOLDER_PATH,
    merge_condition,
    "race_id",
)

In [0]:
%sql 
-- Verification query: row count per race in the processed table, highest race_id first.
SELECT race_id, COUNT(1)
FROM f1_processed.lap_times
GROUP BY race_id
ORDER BY race_id DESC


In [0]:
# Return "Success" as this notebook's exit value (e.g. to a calling notebook or job).
# NOTE(review): exit() stops the run here, so any cell placed after this one
# only executes when it is run manually.
dbutils.notebook.exit("Success")

In [0]:
# Ad-hoc inspection: load the lap_times Delta data straight from its storage path and render it.
display(spark.read.format("delta").load(f"{PROCESSED_FOLDER_PATH}/lap_times"))