In [0]:
%run "../../config/config"

In [0]:
%sql
-- Make this catalog.schema the session default; the unqualified table names
-- used later in this notebook (saveAsTable / DESCRIBE) rely on it.
USE dev_bronze_f1_catalog.raw_formula1_schema

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

# Explicit schema for the circuits CSV, built one column at a time.
# Only circuitId is declared non-nullable; every other column may be null.
circuit_schema = (
    StructType()
    .add("circuitId", IntegerType(), nullable=False)
    .add("circuitRef", StringType(), nullable=True)
    .add("name", StringType(), nullable=True)
    .add("location", StringType(), nullable=True)
    .add("country", StringType(), nullable=True)
    .add("lat", DoubleType(), nullable=True)
    .add("lng", DoubleType(), nullable=True)
    .add("alt", IntegerType(), nullable=True)
    .add("url", StringType(), nullable=True)
)


In [0]:
# Notebook parameters. Both widgets default to an empty string, so a caller
# is expected to supply the file date and the data source name.
dbutils.widgets.text("p_file_date", "")
dbutils.widgets.text("p_data_source", "")
v_file_date = dbutils.widgets.get("p_file_date")
v_data_source = dbutils.widgets.get("p_data_source")

# Read circuits.csv from the folder for the requested file date, treating the
# first line as a header and applying the explicit schema (no inference).
# NOTE(review): raw_folder_path is presumably defined by the config notebook
# run at the top of this notebook -- confirm.
circuits_csv_path = f"{raw_folder_path}/{v_file_date}/circuits.csv"
csv_reader = spark.read.option("header", True).schema(circuit_schema)
df_data = csv_reader.csv(circuits_csv_path)

In [0]:
# Show how many rows were loaded from the CSV.
circuit_row_count = df_data.count()
display(circuit_row_count)

In [0]:
from pyspark.sql.functions import lit

# Stamp every row with the run parameters as constant columns.
data_source_col = lit(v_data_source)
file_date_col = lit(v_file_date)
df_data = (
    df_data
    .withColumn("data_source", data_source_col)
    .withColumn("file_date", file_date_col)
)

In [0]:
# Output folder for this file date's circuits data.
processed_path = f"{processed_folder_path}/{v_file_date}/circuits"

# Write df_data as Parquet, replacing anything already at that path.
parquet_writer = df_data.write.mode("overwrite")
parquet_writer.parquet(processed_path)

# Read the Parquet output back and display the first 5 records.
df_parquet = spark.read.parquet(processed_path)
parquet_preview = df_parquet.limit(5)
display(parquet_preview)

In [0]:
# Look up the session's current schema, then list the tables registered in it.
current_schema_row = spark.sql("SELECT current_database()").first()
current_schema = current_schema_row[0]
tables_in_schema = spark.sql(f"SHOW TABLES IN {current_schema}")
display(tables_in_schema)

In [0]:
# Save df_data as an external Delta table (data stored at an explicit path).
#
# The Delta table gets its own folder rather than reusing processed_path.
# processed_path is written and read as plain Parquet earlier in this
# notebook; once a _delta_log directory exists in a folder, Databricks rejects
# Parquet-format reads/writes against it ("Incompatible format detected"), so
# sharing the folder makes the notebook fail on its second run.
#
# NOTE(review): if f1_processed_circuits was already created at processed_path
# by an earlier run, it probably has to be dropped once before this cell will
# succeed, because saveAsTable does not move an existing table -- confirm.
delta_table_path = f"{processed_path}_delta"
(
    df_data.write
    .mode("overwrite")
    .format("delta")
    .option("path", delta_table_path)
    .saveAsTable("f1_processed_circuits")
)

In [0]:
%sql DESCRIBE EXTENDED f1_processed_circuits
-- Shows the columns and extended metadata of the table saved with an explicit path.

In [0]:
# Save df_data as a Delta table without an explicit path (managed table),
# replacing the table contents if it already exists.
managed_table_writer = df_data.write.format("delta").mode("overwrite")
managed_table_writer.saveAsTable("f1_processed_circuits_m")

In [0]:
%sql DESCRIBE EXTENDED f1_processed_circuits_m
-- Shows the columns and extended metadata of the table saved without an explicit path.