#### Step 1 - Ingest the lap times data from multiple CSV files

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Import the Spark SQL types and functions used in this notebook
from pyspark.sql.types import *
from pyspark.sql.functions import *

laptimes_schema = StructType(fields = [
    StructField("raceId", IntegerType(), False),
    StructField("driverId", IntegerType(), True),
    StructField("lap", IntegerType(), True),
    StructField("position", IntegerType(), True),
    StructField("time", StringType(), True),
    StructField("milliseconds", IntegerType(), True)
])

# We use the wildcard * inplace of all the numbered files
laptimes_df = spark.read \
.schema(laptimes_schema) \
.csv(f"{raw_folder_path}/lap_times/lap_times_split*.csv")

In [0]:
# Preview 10 rows to check that the loaded values line up with the declared schema
display(laptimes_df.limit(10))

raceId,driverId,lap,position,time,milliseconds
841,20,1,1,1:38.109,98109
841,20,2,1,1:33.006,93006
841,20,3,1,1:32.713,92713
841,20,4,1,1:32.803,92803
841,20,5,1,1:32.342,92342
841,20,6,1,1:32.605,92605
841,20,7,1,1:32.502,92502
841,20,8,1,1:32.537,92537
841,20,9,1,1:33.240,93240
841,20,10,1,1:32.572,92572


#### Step 2 - Rename columns and add ingestion_date

In [0]:
final_df = add_ingestion_date(laptimes_df).withColumnRenamed("driverId", "driver_id") \
.withColumnRenamed("raceId", "race_id") 

#### Step 3 - Write data to the data lake

In [0]:
final_df.write.mode("overwrite").parquet(f"{processed_folder_path}/lap_times")

In [0]:
final_df.write.mode("overwrite").format("parquet").saveAsTable("f1_processed.lap_timesSQL")

#### Step 4 - Return an exit status to any caller that runs this notebook via dbutils.notebook.run

In [0]:
# End the notebook and hand the string "Success" back as its exit value, so a
# parent notebook that started this one with dbutils.notebook.run can check it.
dbutils.notebook.exit("Success")

Success