### Read multiple multi-line JSON files from the qualifying folder

In [0]:
# Declare the "param_data_source" text widget (empty default) and read its
# current value so a caller can pass the data source name as a parameter.
# NOTE(review): var_data_source is not referenced in the cells visible in this
# notebook — confirm whether it should be recorded on the output data.
dbutils.widgets.text("param_data_source", "")
var_data_source = dbutils.widgets.get("param_data_source")

In [0]:
# Declare the "param_file_date" text widget (default "2021-03-28") and read its
# current value. The value is based on the name of the dated subfolder in blob
# storage and is interpolated into the raw input path when the JSON is read.
dbutils.widgets.text("param_file_date", "2021-03-28")
var_file_date = dbutils.widgets.get("param_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/utils"

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [0]:
# Explicit schema for the qualifying JSON records, so Spark does not have to
# infer column types. Only qualifyId is declared non-nullable; the q1/q2/q3
# fields are read as plain strings.
qualifying_schema = (
    StructType()
    .add("qualifyId", IntegerType(), nullable=False)
    .add("raceId", IntegerType(), nullable=True)
    .add("driverId", IntegerType(), nullable=True)
    .add("constructorId", IntegerType(), nullable=True)
    .add("number", IntegerType(), nullable=True)
    .add("position", IntegerType(), nullable=True)
    .add("q1", StringType(), nullable=True)
    .add("q2", StringType(), nullable=True)
    .add("q3", StringType(), nullable=True)
)


# Read everything under the dated "qualifying" folder as multi-line JSON using
# the explicit schema above.
# RAW_FOLDER_PATH is presumably provided by the ../includes/configuration
# notebook — confirm.
qualifying_df = spark.read.json(
    f"{RAW_FOLDER_PATH}/{var_file_date}/qualifying",
    schema=qualifying_schema,
    multiLine=True,
)


### rename columns and add new column

In [0]:
from pyspark.sql.functions import col, concat, current_timestamp, date_trunc, from_utc_timestamp

In [0]:
# Rename the camelCase id columns to snake_case, then hand the result to
# add_ingestion_date (presumably defined in ../includes/utils and expected to
# append an ingestion timestamp column — confirm).
final_qualifying_df = add_ingestion_date(
    qualifying_df
    .withColumnRenamed("qualifyId", "qualify_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("constructorId", "constructor_id")
)


### write to Delta table (merge)

In [0]:
# Earlier approach, kept for reference only:
# overwrite_partition(final_qualifying_df, "f1_processed", "qualifying", "race_id")

# Match incoming rows (src) to existing rows (tgt) on qualify_id and race_id,
# then pass the data, database/table names, target folder, condition and the
# "race_id" column to merge_delta_data (helper defined outside this notebook;
# presumably performs a Delta upsert partitioned by race_id — confirm).
merge_condition = "tgt.qualify_id = src.qualify_id AND tgt.race_id = src.race_id"
merge_delta_data(
    final_qualifying_df,
    "f1_processed",
    "qualifying",
    PROCESSED_FOLDER_PATH,
    merge_condition,
    "race_id",
)

In [0]:
%sql 
-- Sanity check: number of rows per race_id in f1_processed.qualifying,
-- listed with the highest race_id first.
SELECT race_id, COUNT(1)
FROM f1_processed.qualifying
GROUP BY race_id
ORDER BY race_id DESC


In [0]:
# End the notebook run and hand the string "Success" back to whatever invoked
# this notebook.
# NOTE(review): cells placed after this one are presumably skipped in a full
# run — confirm that is intended for the display cell that follows.
dbutils.notebook.exit("Success")

In [0]:
# Load the Delta data stored under the processed "qualifying" folder and render
# it in the notebook for manual inspection.
display(spark.read.format("delta").load(f"{PROCESSED_FOLDER_PATH}/qualifying"))