
## Ingest lap_times folder


#### Step 1 - Read the CSV files using the Spark DataFrame reader API

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [0]:
# Explicit schema for the lap_times CSV data, built field by field.
# Each .add() call supplies (column name, Spark type, nullable flag);
# only raceId is declared non-nullable.
lap_times_schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("driverId", IntegerType(), True)
    .add("lap", IntegerType(), True)
    .add("position", IntegerType(), True)
    .add("time", StringType(), True)
    .add("milliseconds", IntegerType(), True)
)

In [0]:
# Load the raw lap_times location as CSV using the explicit schema above
# (no schema inference and no extra reader options are set).
# NOTE(review): the path looks like a folder of CSV files — confirm all files share this layout.
lap_times_df = spark.read.csv(
    "/mnt/tideformula1dl/raw/lap_times",
    schema=lap_times_schema,
)


#### Step 2 - Rename columns and add new columns
1. Rename driverId and raceId
2. Add ingestion_date with current timestamp

In [0]:
from pyspark.sql.functions import current_timestamp

In [0]:
# Switch the two id columns to snake_case and stamp every row with the
# timestamp of this ingestion run.
final_df = (
    lap_times_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("ingestion_date", current_timestamp())
)


#### Step 3 - Write the output to the processed container in Parquet format

In [0]:
# Persist the result as the table f1_processed.lap_times in Parquet format;
# "overwrite" mode replaces whatever the table held before.
(
    final_df.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("f1_processed.lap_times")
)

In [0]:
# End the notebook run and hand back the string "Success" as its exit value.
# NOTE(review): presumably read by a parent notebook that runs this one — confirm against the caller.
dbutils.notebook.exit("Success")