In [1]:
# Initialise findspark first; the pyspark imports below are only meant to run
# after findspark.init() has been called.
import findspark
findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
# Obtain the session used by every later cell: getOrCreate() returns an
# existing SparkSession if there is one, otherwise it builds a new one.
spark = SparkSession.builder.getOrCreate()

In [2]:
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,DateType

In [20]:
# Explicit schema for races.csv: column names and types are declared up front.
# The third StructField argument is the nullable flag; False declares that the
# column must not contain null values.
# NOTE(review): Spark's file-based readers may relax a user-supplied schema to
# nullable on read, so this flag might not actually be enforced - confirm.
races_schema = StructType(
    fields=[
        StructField("raceId", IntegerType(), False),
        StructField("year", IntegerType(), True),
        StructField("round", IntegerType(), True),
        StructField("circuitId", IntegerType(), True),
        StructField("name", StringType(), True),
        StructField("date", DateType(), True),
        StructField("time", StringType(), True),
        StructField("url", StringType(), True),
    ]
)
# Another way to declare the schema is a DDL-style string:
# races_schema = "raceId INT,year INT,round INT, circuitId INT,name STRING, date DATE, time STRING, url STRING"
                               

In [21]:
# Load races.csv using the explicit schema; the "header" option is enabled so
# the first line of the file is treated as column names rather than data.
# NOTE(review): the path is an absolute, machine-specific location - consider
# moving it to a configurable data directory.
races_df = (
    spark.read
    .option("header", True)
    .schema(races_schema)
    .csv('C:/Users/ACER/Documents/Burgo Juan/Udemi - Azure Databricks/races.csv')
)

In [22]:
# Inspect races_df; in this session the call rendered only the DataFrame's
# column names and types, not its rows.
display(races_df)

DataFrame[raceId: int, year: int, round: int, circuitId: int, name: string, date: date, time: string, url: string]

In [5]:
# Print a preview of the loaded rows as a text table (long values are truncated).
races_df.show()

+------+----+-----+---------+--------------------+----------+--------+--------------------+
|raceId|year|round|circuitId|                name|      date|    time|                 url|
+------+----+-----+---------+--------------------+----------+--------+--------------------+
|     1|2009|    1|        1|Australian Grand ...|2009-03-29|06:00:00|http://en.wikiped...|
|     2|2009|    2|        2|Malaysian Grand Prix|2009-04-05|09:00:00|http://en.wikiped...|
|     3|2009|    3|       17|  Chinese Grand Prix|2009-04-19|07:00:00|http://en.wikiped...|
|     4|2009|    4|        3|  Bahrain Grand Prix|2009-04-26|12:00:00|http://en.wikiped...|
|     5|2009|    5|        4|  Spanish Grand Prix|2009-05-10|12:00:00|http://en.wikiped...|
|     6|2009|    6|        6|   Monaco Grand Prix|2009-05-24|12:00:00|http://en.wikiped...|
|     7|2009|    7|        5|  Turkish Grand Prix|2009-06-07|12:00:00|http://en.wikiped...|
|     8|2009|    8|        9|  British Grand Prix|2009-06-21|12:00:00|http://en.

In [6]:
from pyspark.sql.functions import current_timestamp, to_timestamp,col,lit,concat

In [7]:
# Join the date and time columns into a single "date time" string so it can be
# parsed as one timestamp.
race_datetime_text = concat(col("date"), lit(" "), col("time"))

# Add two columns:
#   ingestion_date - the timestamp at which this load was executed
#   race_timestamp - the race start, parsed with the "yyyy-MM-dd HH:mm:ss" pattern
# NOTE(review): the Parquet read-back at the end of the notebook shows a null
# race_timestamp for race_id 1054 - presumably its date/time text did not match
# the pattern; confirm against the CSV.
races_with_timestamp_df = (
    races_df
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("race_timestamp", to_timestamp(race_datetime_text, "yyyy-MM-dd HH:mm:ss"))
)

In [8]:
# Inspect the enriched DataFrame; in this session the call rendered only the
# column names and types, which confirms both new columns are timestamps.
display(races_with_timestamp_df)

DataFrame[raceId: int, year: int, round: int, circuitId: int, name: string, date: date, time: string, url: string, ingestion_date: timestamp, race_timestamp: timestamp]

In [9]:
# Print a preview of the rows, including the new ingestion_date and
# race_timestamp columns (long values are truncated).
races_with_timestamp_df.show()

+------+----+-----+---------+--------------------+----------+--------+--------------------+--------------------+-------------------+
|raceId|year|round|circuitId|                name|      date|    time|                 url|      ingestion_date|     race_timestamp|
+------+----+-----+---------+--------------------+----------+--------+--------------------+--------------------+-------------------+
|     1|2009|    1|        1|Australian Grand ...|2009-03-29|06:00:00|http://en.wikiped...|2022-08-26 11:57:...|2009-03-29 06:00:00|
|     2|2009|    2|        2|Malaysian Grand Prix|2009-04-05|09:00:00|http://en.wikiped...|2022-08-26 11:57:...|2009-04-05 09:00:00|
|     3|2009|    3|       17|  Chinese Grand Prix|2009-04-19|07:00:00|http://en.wikiped...|2022-08-26 11:57:...|2009-04-19 07:00:00|
|     4|2009|    4|        3|  Bahrain Grand Prix|2009-04-26|12:00:00|http://en.wikiped...|2022-08-26 11:57:...|2009-04-26 12:00:00|
|     5|2009|    5|        4|  Spanish Grand Prix|2009-05-10|12:00:00

In [10]:
# Keep only the columns needed downstream and rename the camelCase / ambiguous
# ones to snake_case; date, time and url are dropped by not selecting them.
races_selected_df = races_with_timestamp_df.select(
    col("raceId").alias("race_id"),
    col("year").alias("race_year"),
    col("round"),
    col("circuitId").alias("circuit_id"),
    col("name"),
    col("ingestion_date"),
    col("race_timestamp"),
)

In [11]:
# Print a preview of the final column selection with the renamed columns.
races_selected_df.show()

+-------+---------+-----+----------+--------------------+--------------------+-------------------+
|race_id|race_year|round|circuit_id|                name|      ingestion_date|     race_timestamp|
+-------+---------+-----+----------+--------------------+--------------------+-------------------+
|      1|     2009|    1|         1|Australian Grand ...|2022-08-26 11:57:...|2009-03-29 06:00:00|
|      2|     2009|    2|         2|Malaysian Grand Prix|2022-08-26 11:57:...|2009-04-05 09:00:00|
|      3|     2009|    3|        17|  Chinese Grand Prix|2022-08-26 11:57:...|2009-04-19 07:00:00|
|      4|     2009|    4|         3|  Bahrain Grand Prix|2022-08-26 11:57:...|2009-04-26 12:00:00|
|      5|     2009|    5|         4|  Spanish Grand Prix|2022-08-26 11:57:...|2009-05-10 12:00:00|
|      6|     2009|    6|         6|   Monaco Grand Prix|2022-08-26 11:57:...|2009-05-24 12:00:00|
|      7|     2009|    7|         5|  Turkish Grand Prix|2022-08-26 11:57:...|2009-06-07 12:00:00|
|      8| 

In [12]:
# Write the selected races as Parquet, partitioned by race_year. The
# "overwrite" mode replaces any output a previous run left at the target path.
# NOTE(review): the target is an absolute, machine-specific path - consider
# moving it to a configurable data directory.
(
    races_selected_df.write
    .mode("overwrite")
    .partitionBy("race_year")
    .parquet('C:/Users/ACER/Documents/Burgo Juan/Udemi - Azure Databricks/races')
)

In [13]:
# Read the Parquet output back from the same location it was written to, as a
# check that the write succeeded.
df = spark.read.parquet(
    'C:/Users/ACER/Documents/Burgo Juan/Udemi - Azure Databricks/races'
)

In [14]:
# Print a preview of the data read back from Parquet; in the output the
# partition column race_year appears as the last column.
df.show()

+-------+-----+----------+--------------------+--------------------+-------------------+---------+
|race_id|round|circuit_id|                name|      ingestion_date|     race_timestamp|race_year|
+-------+-----+----------+--------------------+--------------------+-------------------+---------+
|   1053|    2|        21|Emilia Romagna Gr...|2022-08-26 11:57:...|2021-04-18 13:00:00|     2021|
|   1052|    1|         3|  Bahrain Grand Prix|2022-08-26 11:57:...|2021-03-28 15:00:00|     2021|
|   1051|   21|         1|Australian Grand ...|2022-08-26 11:57:...|2021-11-21 06:00:00|     2021|
|   1054|    3|        20|                 TBC|2022-08-26 11:57:...|               null|     2021|
|   1055|    4|         4|  Spanish Grand Prix|2022-08-26 11:57:...|2021-05-09 13:00:00|     2021|
|   1056|    5|         6|   Monaco Grand Prix|2022-08-26 11:57:...|2021-05-23 13:00:00|     2021|
|   1057|    6|        73|Azerbaijan Grand ...|2022-08-26 11:57:...|2021-06-06 12:00:00|     2021|
|   1058| 