In [1]:
import findspark
# Must be called before pyspark is imported (see the note on the import below).
# NOTE(review): presumably this makes the local Spark installation importable — confirm against the findspark docs.
findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
# Reuse the already-active SparkSession if there is one; otherwise build a new one.
spark = SparkSession.builder.getOrCreate()

In [16]:
from pyspark.sql.types import StructType, StructField,IntegerType, StringType, DateType

In [17]:
# Schema of the nested "name" object: two nullable string fields.
name_schema = StructType(
    fields=[
        StructField("forename", StringType(), True),
        StructField("surname", StringType(), True),
    ]
)

In [18]:
# Explicit schema for the drivers JSON file. The "name" field is a nested
# struct described by name_schema; driverId is the only field declared
# non-nullable.
drivers_schema = StructType(
    fields=[
        StructField("driverId", IntegerType(), False),
        StructField("driverRef", StringType(), True),
        StructField("number", IntegerType(), True),
        StructField("code", StringType(), True),
        StructField("name", name_schema, True),  # nullable=True is the default, spelled out for consistency
        StructField("dob", DateType(), True),
        StructField("nationality", StringType(), True),
        StructField("url", StringType(), True),
    ]
)

In [20]:
# Read the drivers JSON file using the explicit drivers_schema rather than
# schema inference.
# NOTE(review): the path is an absolute location on one machine; consider a
# configurable base directory.
drivers_df = (
    spark.read
    .schema(drivers_schema)
    .json('C:/Users/ACER/Documents/Burgo Juan/Udemi - Azure Databricks/drivers.json')
)

In [21]:
# Print a preview of the parsed rows; long values are truncated in the printed table.
drivers_df.show()

+--------+----------+------+----+--------------------+----------+-----------+--------------------+
|driverId| driverRef|number|code|                name|       dob|nationality|                 url|
+--------+----------+------+----+--------------------+----------+-----------+--------------------+
|       1|  hamilton|    44| HAM|   {Lewis, Hamilton}|1985-01-07|    British|http://en.wikiped...|
|       2|  heidfeld|  null| HEI|    {Nick, Heidfeld}|1977-05-10|     German|http://en.wikiped...|
|       3|   rosberg|     6| ROS|     {Nico, Rosberg}|1985-06-27|     German|http://en.wikiped...|
|       4|    alonso|    14| ALO|  {Fernando, Alonso}|1981-07-29|    Spanish|http://en.wikiped...|
|       5|kovalainen|  null| KOV|{Heikki, Kovalainen}|1981-10-19|    Finnish|http://en.wikiped...|
|       6|  nakajima|  null| NAK|  {Kazuki, Nakajima}|1985-01-11|   Japanese|http://en.wikiped...|
|       7|  bourdais|  null| BOU|{Sébastien, Bourd...|1979-02-28|     French|http://en.wikiped...|
|       8|

### Rename columns and add new columns

In [26]:
from pyspark.sql.functions import col, concat, current_timestamp,lit

In [27]:
# Rename the camelCase id/ref columns to snake_case, add an ingestion
# timestamp, and replace the nested name struct with a single
# "forename surname" string.
# NOTE(review): concat yields null when either name part is null — confirm
# that is acceptable for this dataset.
drivers_with_columns_df = (
    drivers_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("driverRef", "driver_ref")
    .withColumn("ingestion_date", current_timestamp())
    .withColumn(
        "name",
        concat(col("name.forename"), lit(" "), col("name.surname")),
    )
)

In [28]:
# Preview the renamed columns, the flattened name, and the new ingestion_date column.
drivers_with_columns_df.show()

+---------+----------+------+----+------------------+----------+-----------+--------------------+--------------------+
|driver_id|driver_ref|number|code|              name|       dob|nationality|                 url|      ingestion_date|
+---------+----------+------+----+------------------+----------+-----------+--------------------+--------------------+
|        1|  hamilton|    44| HAM|    Lewis Hamilton|1985-01-07|    British|http://en.wikiped...|2022-09-06 14:01:...|
|        2|  heidfeld|  null| HEI|     Nick Heidfeld|1977-05-10|     German|http://en.wikiped...|2022-09-06 14:01:...|
|        3|   rosberg|     6| ROS|      Nico Rosberg|1985-06-27|     German|http://en.wikiped...|2022-09-06 14:01:...|
|        4|    alonso|    14| ALO|   Fernando Alonso|1981-07-29|    Spanish|http://en.wikiped...|2022-09-06 14:01:...|
|        5|kovalainen|  null| KOV| Heikki Kovalainen|1981-10-19|    Finnish|http://en.wikiped...|2022-09-06 14:01:...|
|        6|  nakajima|  null| NAK|   Kazuki Naka

In [31]:
# Remove the url column from the transformed frame, then preview the result.
drivers_final_df= drivers_with_columns_df.drop(col("url"))
drivers_final_df.show()

+---------+----------+------+----+------------------+----------+-----------+--------------------+
|driver_id|driver_ref|number|code|              name|       dob|nationality|      ingestion_date|
+---------+----------+------+----+------------------+----------+-----------+--------------------+
|        1|  hamilton|    44| HAM|    Lewis Hamilton|1985-01-07|    British|2022-09-06 14:08:...|
|        2|  heidfeld|  null| HEI|     Nick Heidfeld|1977-05-10|     German|2022-09-06 14:08:...|
|        3|   rosberg|     6| ROS|      Nico Rosberg|1985-06-27|     German|2022-09-06 14:08:...|
|        4|    alonso|    14| ALO|   Fernando Alonso|1981-07-29|    Spanish|2022-09-06 14:08:...|
|        5|kovalainen|  null| KOV| Heikki Kovalainen|1981-10-19|    Finnish|2022-09-06 14:08:...|
|        6|  nakajima|  null| NAK|   Kazuki Nakajima|1985-01-11|   Japanese|2022-09-06 14:08:...|
|        7|  bourdais|  null| BOU|Sébastien Bourdais|1979-02-28|     French|2022-09-06 14:08:...|
|        8| raikkone

### Write output to the processed container in Parquet format

In [33]:
# Persist the final drivers frame as Parquet; "overwrite" mode replaces any
# output that already exists at the target path.
(
    drivers_final_df.write
    .mode("overwrite")
    .parquet('C:/Users/ACER/Documents/Burgo Juan/Udemi - Azure Databricks/drivers')
)

In [34]:
# Read the Parquet output back from the same path it was written to, as a check on the write.
df= spark.read.parquet('C:/Users/ACER/Documents/Burgo Juan/Udemi - Azure Databricks/drivers')

In [35]:
# Preview the rows read back from Parquet.
df.show()

+---------+----------+------+----+------------------+----------+-----------+--------------------+
|driver_id|driver_ref|number|code|              name|       dob|nationality|      ingestion_date|
+---------+----------+------+----+------------------+----------+-----------+--------------------+
|        1|  hamilton|    44| HAM|    Lewis Hamilton|1985-01-07|    British|2022-09-06 14:10:...|
|        2|  heidfeld|  null| HEI|     Nick Heidfeld|1977-05-10|     German|2022-09-06 14:10:...|
|        3|   rosberg|     6| ROS|      Nico Rosberg|1985-06-27|     German|2022-09-06 14:10:...|
|        4|    alonso|    14| ALO|   Fernando Alonso|1981-07-29|    Spanish|2022-09-06 14:10:...|
|        5|kovalainen|  null| KOV| Heikki Kovalainen|1981-10-19|    Finnish|2022-09-06 14:10:...|
|        6|  nakajima|  null| NAK|   Kazuki Nakajima|1985-01-11|   Japanese|2022-09-06 14:10:...|
|        7|  bourdais|  null| BOU|Sébastien Bourdais|1979-02-28|     French|2022-09-06 14:10:...|
|        8| raikkone