
## Ingest circuits.csv file

In [0]:
%run "../includes/configuration"


#### Step 1 - Read the CSV file using the Spark DataFrame reader

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [0]:
# Explicit schema for circuits.csv, declared up front instead of being inferred.
# Each entry is (column name, Spark data type, nullable).
circuit_field_specs = [
    ("circuitId", IntegerType(), False),
    ("circuitRef", StringType(), True),
    ("name", StringType(), True),
    ("location", StringType(), True),
    ("country", StringType(), True),
    ("lat", DoubleType(), True),
    ("lng", DoubleType(), True),
    ("alt", IntegerType(), True),
    ("url", StringType(), True),
]

circuits_schema = StructType(
    [StructField(field_name, field_type, is_nullable)
     for field_name, field_type, is_nullable in circuit_field_specs]
)

In [0]:
# `raw_folder_path` is not defined in this notebook; presumably it is provided by
# the `%run "../includes/configuration"` cell above -- confirm.
circuits_df_path = f"{raw_folder_path}/circuits.csv"

# Read the CSV with its first line treated as the header row, applying the
# explicit schema rather than letting Spark infer column types.
circuits_df = spark.read.csv(circuits_df_path, header=True, schema=circuits_schema)


#### Step 2 - Select only the required columns

In [0]:
# Alternative (not used): select the columns by passing their names as strings.
# circuits_selected_df = circuits_df.select("circuitId", "circuitRef", "name", "location", "country", "lat", "lng", "alt")

In [0]:
# Alternative (not used): select the columns via attribute access on the dataframe.
# circuits_selected_df = circuits_df.select(circuits_df.circuitId,
#                                           circuits_df.circuitRef,
#                                           circuits_df.name,
#                                           circuits_df.location,
#                                           circuits_df.country,
#                                           circuits_df.lat,
#                                           circuits_df.lng,
#                                           circuits_df.alt
#                                           )

In [0]:
# Alternative (not used): select the columns via bracket indexing on the dataframe.
# circuits_selected_df = circuits_df.select(circuits_df["circuitId"],
#                                           circuits_df["circuitRef"],
#                                           circuits_df["name"],
#                                           circuits_df["location"],
#                                           circuits_df["country"],
#                                           circuits_df["lat"],
#                                           circuits_df["lng"],
#                                           circuits_df["alt"]
#                                           )

In [0]:
from pyspark.sql.functions import col

# Columns carried forward from the raw dataframe. `url` is intentionally left
# out, and `country` is exposed under the name `race_country`.
selected_columns = [
    col("circuitId"),
    col("circuitRef"),
    col("name"),
    col("location"),
    col("country").alias("race_country"),
    col("lat"),
    col("lng"),
    col("alt"),
]

circuits_selected_df = circuits_df.select(*selected_columns)


#### Step 3 - Rename the columns as required

In [0]:
# Source column name -> target snake_case / spelled-out name.
column_renames = {
    "circuitId": "circuit_id",
    "circuitRef": "circuit_ref",
    "lat": "latitude",
    "lng": "longitude",
    "alt": "altitude",
}

# Start from the selected dataframe each time so re-running the cell is safe,
# then apply the renames one by one in the order listed above.
circuits_renamed_df = circuits_selected_df
for old_name, new_name in column_renames.items():
    circuits_renamed_df = circuits_renamed_df.withColumnRenamed(old_name, new_name)


#### Step 4 - Add ingestion date to the dataframe

In [0]:
from pyspark.sql.functions import current_timestamp, lit

# First add an `ingestion_date` column holding the current timestamp ...
circuits_with_ingestion_df = circuits_renamed_df.withColumn("ingestion_date", current_timestamp())

# ... then tag every row with the constant environment label "Production".
circuits_final_df = circuits_with_ingestion_df.withColumn("env", lit("Production"))


#### Step 5 - Write data to the data lake as a Parquet table

In [0]:
# Save the final dataframe as the table `f1_processed.circuits` in parquet
# format, replacing any existing contents (overwrite mode).
circuits_final_df.write.saveAsTable("f1_processed.circuits", format="parquet", mode="overwrite")

In [0]:
# End the notebook here, passing "Success" as its exit value
# (presumably read by a calling notebook or job -- confirm against the caller).
dbutils.notebook.exit("Success")