In [0]:
%run "../includes/configuration"

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

In [0]:
# Schema of the nested "name" object: two nullable string fields,
# "forename" followed by "surname".
name_schema = StructType(
    fields=[
        StructField(name_part, StringType(), True)
        for name_part in ("forename", "surname")
    ]
)

In [0]:
# Top-level schema for a driver record. Each (column, type) pair below
# becomes a StructField declared with nullable=False; "name" reuses the nested
# name_schema struct.
# NOTE(review): every column is declared non-nullable -- confirm the source
# data never omits "number"/"code" and that the reader honours this flag.
driver_schema = StructType(
    [
        StructField(field_name, field_type, False)
        for field_name, field_type in (
            ("driverId", IntegerType()),
            ("driverRef", StringType()),
            ("number", IntegerType()),
            ("code", StringType()),
            ("name", name_schema),
            ("dob", DateType()),
            ("nationality", StringType()),
            ("url", StringType()),
        )
    ]
)

In [0]:
# Load drivers.json from the raw folder using the explicit driver_schema
# instead of letting Spark infer column types.
# raw_folder_path is not defined in this notebook -- presumably it comes from
# the "../includes/configuration" include; verify.
drivers_df = (
    spark.read
    .schema(driver_schema)
    .json(f"{raw_folder_path}/drivers.json")
)

In [0]:
# Inspection step: print the schema of the freshly loaded DataFrame.
drivers_df.printSchema()

In [0]:
# Remove the "url" column; the result is kept under a new name so the loaded
# frame (drivers_df) stays untouched.
drivers_dropped_df = drivers_df.drop("url")

In [0]:
from pyspark.sql.functions import col, current_timestamp, lit, concat

In [0]:
# Declare the "p_source_file" text widget (empty default) before reading it.
# Without the declaration, widgets.get() fails when the notebook is run on its
# own (e.g. Restart & Run All) because no caller has supplied the parameter.
# A value passed in by a calling notebook or job still overrides the default.
dbutils.widgets.text("p_source_file", "")
v_source_file = dbutils.widgets.get("p_source_file")

In [0]:
# Shape the driver data for the processed layer:
#   1. rename the camelCase id/ref columns to snake_case,
#   2. replace the nested "name" struct with "<forename> <surname>",
#   3. add a "data_source" column holding the p_source_file widget value.
drivers_final_df = (
    drivers_dropped_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("driverRef", "driver_ref")
    .withColumn(
        "name",
        concat(col("name.forename"), lit(" "), col("name.surname")),
    )
    .withColumn("data_source", lit(v_source_file))
)

In [0]:
%run "../functions/common_functions"

In [0]:
# Pass the frame through add_ingest_date and rebind the result to the same name.
# add_ingest_date is not defined in this notebook -- presumably it is provided
# by the "../functions/common_functions" include and adds an ingestion
# timestamp column; verify against that notebook.
drivers_final_df = add_ingest_date(drivers_final_df)

In [0]:
# Persist the final frame as Parquet under <processed_folder_path>/drivers/.
# "overwrite" mode replaces whatever a previous run wrote to that location.
(
    drivers_final_df.write
    .mode("overwrite")
    .parquet(f"{processed_folder_path}/drivers/")
)

In [0]:
# Verification step: read back the Parquet output that was just written and
# render it in the notebook.
(
    spark.read
    .parquet(f"{processed_folder_path}/drivers/")
    .display()
)

In [0]:
# End the notebook run with the exit value "success".
dbutils.notebook.exit("success")