## Ingest races.csv file

1. Read the CSV file using the Spark DataFrame reader

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, TimestampNTZType
from pyspark.sql.functions import col, lit, concat, to_timestamp

In [0]:
# Declare the "p_data_source" text widget (empty default) and read its current
# value; the value is attached to every row later as the "data_source" column.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
#dbutils.fs.ls('/mnt/formula1jf/raw')

In [0]:
# Explicit schema for races.csv, declared as (column name, Spark type, nullable)
# triples in the same order as the columns are listed for the file.
races_schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in [
        ("raceId", IntegerType(), False),
        ("year", IntegerType(), True),
        ("round", IntegerType(), True),
        ("circuitId", IntegerType(), True),
        ("name", StringType(), True),
        ("date", DateType(), True),
        # "time" stays a string here; it is combined with "date" later on.
        ("time", StringType(), True),
        ("url", StringType(), True),
    ]
])

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Load races.csv from the raw folder, treating the first line as a header and
# applying the explicit schema instead of inferring column types.
races_df = (
    spark.read
    .schema(races_schema)
    .option("header", True)
    .csv(f'{raw_folder_path}/races.csv')
)
# Uncomment to inspect the loaded rows: display(races_df)

2. Rename the columns and add the data source column

In [0]:
# Switch the camelCase source columns to snake_case names and tag every row
# with the data source supplied through the notebook widget.
races_renamed_df = (
    races_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("year", "race_year")
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumn("data_source", lit(v_data_source))
)

3. Add race timestamp and drop unnecessary columns

In [0]:
# Combine "date" and "time" into a single "<date> <time>" string, parse it as
# the race timestamp, then drop the columns that are no longer needed.
# NOTE(review): values of "time" that do not match HH:mm:ss will not parse;
# confirm how the source file encodes missing race times.
races_updated_df = (
    races_renamed_df
    .withColumn(
        "race_timestamp",
        to_timestamp(
            concat(col("date"), lit(" "), col("time")),
            "yyyy-MM-dd HH:mm:ss",
        ),
    )
    .drop("date", "time", "url")
)
# Uncomment to inspect the result:
#   display(races_updated_df)
#   races_updated_df.printSchema()

4. Add ingestion date to the dataframe

In [0]:
# Pass the dataframe through add_ingestion_date; the helper is not defined in
# this notebook (presumably it comes from ../includes/common_functions — confirm).
races_final_df = add_ingestion_date(races_updated_df)

5. Write data to the data lake as a Parquet table partitioned by race year

In [0]:
#races_final_df.write.mode("overwrite").partitionBy('race_year').parquet(f"{processed_folder_path}/races")

In [0]:
# Persist the final dataframe as the Parquet-backed table f1_processed.races,
# partitioned by race_year; "overwrite" replaces any existing table contents.
(
    races_final_df.write
    .mode("overwrite")
    .partitionBy('race_year')
    .format("parquet")
    .saveAsTable("f1_processed.races")
)

In [0]:
# %fs
# ls mnt/formula1jf/processed/races

In [0]:
# End the notebook run, handing back "Success" as its exit value
# (presumably read by an orchestrating notebook — confirm against the caller).
dbutils.notebook.exit("Success")