### Ingest races.csv file

##### Step 1 - Read the CSV file using the spark dataframe reader API

In [1]:
# Notebook parameter: this value is written to every row as the `data_source`
# column further down. The empty string is presumably just a placeholder
# default that the orchestrating pipeline overrides - confirm with the pipeline definition.
v_data_source = "" #Provided by Data Factory

StatementMeta(, bfd7ec23-be32-4323-9b2a-1b1816226322, 3, Finished, Available)

In [2]:
%run configuration

StatementMeta(, , -1, Finished, Available)

In [3]:
%run common_functions

StatementMeta(, , -1, Finished, Available)

In [4]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

StatementMeta(, bfd7ec23-be32-4323-9b2a-1b1816226322, 10, Finished, Available)

In [5]:
# Explicit schema for races.csv, so column types are declared up front rather than inferred.
# Only raceId is declared non-nullable; every other column is allowed to be null.
# `time` is kept as a string here; it is combined with `date` into a timestamp later on.
races_schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("year", IntegerType(), True)
    .add("round", IntegerType(), True)
    .add("circuitId", IntegerType(), True)
    .add("name", StringType(), True)
    .add("date", DateType(), True)
    .add("time", StringType(), True)
    .add("url", StringType(), True)
)

StatementMeta(, bfd7ec23-be32-4323-9b2a-1b1816226322, 11, Finished, Available)

In [6]:
# Read the raw races file from the bronze folder. The first line of the file is
# treated as a header row, and the explicit schema above supplies the column types.
races_df = (
    spark.read
    .option("header", True)
    .schema(races_schema)
    .csv(f"{bronze_folder_path}/races.csv")
)
display(races_df)

StatementMeta(, bfd7ec23-be32-4323-9b2a-1b1816226322, 12, Finished, Available)

SynapseWidget(Synapse.DataFrame, 78bc608e-be5c-4f48-97f8-fad64b18d6ed)

##### Step 2 - Select only the required columns

In [7]:
# Project the raw frame down to every schema column except `url`.
races_selected_df = races_df.select(
    "raceId",
    "year",
    "round",
    "circuitId",
    "name",
    "date",
    "time",
)
display(races_selected_df)

StatementMeta(, bfd7ec23-be32-4323-9b2a-1b1816226322, 13, Finished, Available)

SynapseWidget(Synapse.DataFrame, 43db7501-136a-4e3f-a70e-ab8182ec2718)

##### Step 4 - Add ingestion date, race_timestamp and data_source to the dataframe

In [8]:
from pyspark.sql.functions import current_timestamp, to_timestamp, concat, col, lit

StatementMeta(, bfd7ec23-be32-4323-9b2a-1b1816226322, 14, Finished, Available)

In [9]:
# Build on the column-pruned frame from Step 2 instead of the raw read, so the
# selection step is actually used and the unneeded `url` column is not carried
# through the remaining transformations.
# add_ingestion_date is not defined in this notebook (presumably loaded by
# `%run common_functions`); it is expected to return the frame with an
# `ingestion_date` column, which the final select relies on - confirm against its definition.
races_with_timestamp_df = add_ingestion_date(races_selected_df)

StatementMeta(, bfd7ec23-be32-4323-9b2a-1b1816226322, 15, Finished, Available)

In [10]:
# Derive two extra columns:
#   race_timestamp - `date` and `time` joined with a single space and parsed
#                    with the pattern yyyy-MM-dd HH:mm:ss
#   data_source    - the notebook parameter, repeated as a literal on every row
races_with_timestamp_df = (
    races_with_timestamp_df
    .withColumn(
        "race_timestamp",
        to_timestamp(
            concat(col("date"), lit(" "), col("time")),
            "yyyy-MM-dd HH:mm:ss",
        ),
    )
    .withColumn("data_source", lit(v_data_source))
)
display(races_with_timestamp_df)

StatementMeta(, bfd7ec23-be32-4323-9b2a-1b1816226322, 16, Finished, Available)

SynapseWidget(Synapse.DataFrame, 95aea825-2c64-4df8-83cf-7f1298810e92)

##### Step 3 - Rename the columns as required

In [11]:
# Switch the camelCase source column names to snake_case; `year` becomes `race_year`.
races_renamed_df = (
    races_with_timestamp_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("year", "race_year")
    .withColumnRenamed("circuitId", "circuit_id")
)
display(races_renamed_df)

StatementMeta(, bfd7ec23-be32-4323-9b2a-1b1816226322, 17, Finished, Available)

SynapseWidget(Synapse.DataFrame, 860128fd-1d43-4f5e-bff4-7612e0103ee5)

In [15]:
# Final column set and order for the output. The separate `date` and `time`
# columns are left out because `race_timestamp` now carries that information.
races_final_df = races_renamed_df.select(
    "race_id",
    "race_year",
    "round",
    "circuit_id",
    "name",
    "ingestion_date",
    "race_timestamp",
    "data_source",
)
display(races_final_df)

StatementMeta(, bfd7ec23-be32-4323-9b2a-1b1816226322, 21, Finished, Available)

SynapseWidget(Synapse.DataFrame, 3b1a0ad7-6dcb-4bc2-a990-9835a8a0bf2f)

##### Step 5 - Write data to datalake as parquet

In [13]:
# Write the result to the silver folder as parquet, one partition directory per
# race_year. 'overwrite' mode replaces whatever already exists at the target path.
(
    races_final_df.write
    .mode("overwrite")
    .partitionBy("race_year")
    .parquet(f"{silver_folder_path}/races")
)

StatementMeta(, bfd7ec23-be32-4323-9b2a-1b1816226322, 19, Finished, Available)

In [17]:
# Read the silver races dataset back from storage and display it, as a visual
# check on the write performed in the previous cell.
df_parquet = spark.read.parquet(
    f"{silver_folder_path}/races"
)
display(df_parquet)

StatementMeta(, bfd7ec23-be32-4323-9b2a-1b1816226322, 23, Finished, Available)

SynapseWidget(Synapse.DataFrame, cfb9a84d-e514-45a6-8c81-298212a97b09)