In [1]:
import findspark

# Locate the local Spark installation and make PySpark importable.
# This has to be a call: the bare attribute reference `findspark.init`
# evaluates the function object and does nothing.
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse the active SparkSession if there is one; otherwise create it.
spark = SparkSession.builder.getOrCreate()

### Read JSON files using the DataFrame reader API

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [5]:
# Explicit schema for the qualifying records. Only qualifyId is declared
# non-nullable; q1/q2/q3 are kept as strings (values look like "1:26.572").
qualifying_schema = (
    StructType()
    .add("qualifyId", IntegerType(), False)
    .add("raceId", IntegerType(), True)
    .add("driverId", IntegerType(), True)
    .add("constructorId", IntegerType(), True)
    .add("number", IntegerType(), True)
    .add("position", IntegerType(), True)
    .add("q1", StringType(), True)
    .add("q2", StringType(), True)
    .add("q3", StringType(), True)
)

In [6]:
# Read the qualifying JSON data with the explicit schema defined above.
# multiLine is enabled so that a single JSON record may span several lines.
# NOTE(review): the path has no file extension, so it is presumably a folder
# of JSON files -- confirm against the source data.
qualifying_df = (
    spark.read
    .schema(qualifying_schema)
    .option("multiLine", True)
    .json('C:/Users/ACER/Documents/Burgo Juan/Udemi - Azure Databricks/qualifying')
)

In [7]:
# Preview the loaded rows (defaults spelled out: first 20 rows, long cells truncated).
qualifying_df.show(n=20, truncate=True)

+---------+------+--------+-------------+------+--------+--------+--------+--------+
|qualifyId|raceId|driverId|constructorId|number|position|      q1|      q2|      q3|
+---------+------+--------+-------------+------+--------+--------+--------+--------+
|        1|    18|       1|            1|    22|       1|1:26.572|1:25.187|1:26.714|
|        2|    18|       9|            2|     4|       2|1:26.103|1:25.315|1:26.869|
|        3|    18|       5|            1|    23|       3|1:25.664|1:25.452|1:27.079|
|        4|    18|      13|            6|     2|       4|1:25.994|1:25.691|1:27.178|
|        5|    18|       2|            2|     3|       5|1:25.960|1:25.518|1:27.236|
|        6|    18|      15|            7|    11|       6|1:26.427|1:26.101|1:28.527|
|        7|    18|       3|            3|     7|       7|1:26.295|1:26.059|1:28.687|
|        8|    18|      14|            9|     9|       8|1:26.381|1:26.063|1:29.041|
|        9|    18|      10|            7|    12|       9|1:26.919

### Rename columns and add the ingestion date column

In [8]:
from pyspark.sql.functions import current_timestamp, col

In [9]:
# Switch the camelCase id columns to snake_case and stamp each row with the
# time of ingestion.
final_df = (
    qualifying_df
    .withColumnRenamed("qualifyId", "qualify_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumn("ingestion_date", current_timestamp())
)

In [10]:
# Preview the renamed frame (defaults spelled out: first 20 rows, long cells truncated).
final_df.show(n=20, truncate=True)

+----------+-------+---------+--------------+------+--------+--------+--------+--------+--------------------+
|qualify_id|race_id|driver_id|constructor_id|number|position|      q1|      q2|      q3|      ingestion_date|
+----------+-------+---------+--------------+------+--------+--------+--------+--------+--------------------+
|         1|     18|        1|             1|    22|       1|1:26.572|1:25.187|1:26.714|2022-09-08 12:32:...|
|         2|     18|        9|             2|     4|       2|1:26.103|1:25.315|1:26.869|2022-09-08 12:32:...|
|         3|     18|        5|             1|    23|       3|1:25.664|1:25.452|1:27.079|2022-09-08 12:32:...|
|         4|     18|       13|             6|     2|       4|1:25.994|1:25.691|1:27.178|2022-09-08 12:32:...|
|         5|     18|        2|             2|     3|       5|1:25.960|1:25.518|1:27.236|2022-09-08 12:32:...|
|         6|     18|       15|             7|    11|       6|1:26.427|1:26.101|1:28.527|2022-09-08 12:32:...|
|         

### Write the output to the processed container in Parquet format

In [11]:
# Persist the result as Parquet; "overwrite" replaces any existing output at
# the target path, so re-running this cell does not fail on existing data.
(
    final_df.write
    .mode("overwrite")
    .parquet('C:/Users/ACER/Documents/Burgo Juan/Udemi - Azure Databricks/qualifying_parquet')
)

In [12]:
# Read the Parquet output back and preview it to confirm the write
# (defaults spelled out: first 20 rows, long cells truncated).
(
    spark.read
    .parquet('C:/Users/ACER/Documents/Burgo Juan/Udemi - Azure Databricks/qualifying_parquet')
    .show(n=20, truncate=True)
)

+----------+-------+---------+--------------+------+--------+--------+--------+--------+--------------------+
|qualify_id|race_id|driver_id|constructor_id|number|position|      q1|      q2|      q3|      ingestion_date|
+----------+-------+---------+--------------+------+--------+--------+--------+--------+--------------------+
|         1|     18|        1|             1|    22|       1|1:26.572|1:25.187|1:26.714|2022-09-08 12:33:...|
|         2|     18|        9|             2|     4|       2|1:26.103|1:25.315|1:26.869|2022-09-08 12:33:...|
|         3|     18|        5|             1|    23|       3|1:25.664|1:25.452|1:27.079|2022-09-08 12:33:...|
|         4|     18|       13|             6|     2|       4|1:25.994|1:25.691|1:27.178|2022-09-08 12:33:...|
|         5|     18|        2|             2|     3|       5|1:25.960|1:25.518|1:27.236|2022-09-08 12:33:...|
|         6|     18|       15|             7|    11|       6|1:26.427|1:26.101|1:28.527|2022-09-08 12:33:...|
|         