## Import Libraries

In [16]:
# Import Necessary Libraries
import os
import hashlib
import urllib.request
import json
from datetime import timedelta, date

from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *

from delta.tables import DeltaTable

## Initiate Spark Session

In [17]:
# NOTE(review): SparkSession is already imported in the first cell; this
# repeated import is redundant and is kept only so the cell runs on its own.
from pyspark.sql import SparkSession

# Build the notebook's Spark session, or reuse one that already exists.
# The fs.s3a.impl setting selects the Hadoop S3A filesystem class for the
# s3a:// paths used by the later cells.
spark = (
    SparkSession.builder
    .appName("DeltaLake")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)


## Data Lake Paths

In [18]:
# Root S3 location (s3a scheme) of the raw layer; source files such as
# races.csv are read from underneath this prefix.
raw_layer = "s3a://oasiscorp-raw/formula-oasis"

## Define Schema

In [19]:
# Explicit schema for races.csv. Every column is read as a string; only
# raceID is declared non-nullable.
# NOTE(review): the printSchema output further down reports race_id as
# nullable = true, so this flag does not appear to be enforced on read —
# confirm whether null IDs need an explicit validation step.
race_column_specs = [
    # (column name, nullable)
    ("raceID", False),
    ("year", True),
    ("round", True),
    ("circuitId", True),
    ("name", True),
    ("date", True),
    ("time", True),
    ("url", True),
]

races_schema = StructType(
    fields=[
        StructField(column_name, StringType(), is_nullable)
        for column_name, is_nullable in race_column_specs
    ]
)

## Read the Data, Specifying the Schema via the DataFrameReader API

In [20]:
# Load races.csv from the raw layer. The first line of the file is treated as
# a header row, and the explicit races_schema is applied rather than letting
# Spark infer column types.
races_csv_path = f"{raw_layer}/races.csv"
races_reader = spark.read.option("header", "true").schema(races_schema)
races_sdf = races_reader.csv(races_csv_path)

In [21]:
# Preview the first 10 rows to confirm the CSV parsed into the expected columns.
races_sdf.show(10)

+------+----+-----+---------+--------------------+----------+--------+--------------------+
|raceID|year|round|circuitId|                name|      date|    time|                 url|
+------+----+-----+---------+--------------------+----------+--------+--------------------+
|     1|2009|    1|        1|Australian Grand ...|2009-03-29|06:00:00|http://en.wikiped...|
|     2|2009|    2|        2|Malaysian Grand Prix|2009-04-05|09:00:00|http://en.wikiped...|
|     3|2009|    3|       17|  Chinese Grand Prix|2009-04-19|07:00:00|http://en.wikiped...|
|     4|2009|    4|        3|  Bahrain Grand Prix|2009-04-26|12:00:00|http://en.wikiped...|
|     5|2009|    5|        4|  Spanish Grand Prix|2009-05-10|12:00:00|http://en.wikiped...|
|     6|2009|    6|        6|   Monaco Grand Prix|2009-05-24|12:00:00|http://en.wikiped...|
|     7|2009|    7|        5|  Turkish Grand Prix|2009-06-07|12:00:00|http://en.wikiped...|
|     8|2009|    8|        9|  British Grand Prix|2009-06-21|12:00:00|http://en.

                                                                                

In [22]:
# List the column names of the raw DataFrame (they are renamed in a later cell).
races_sdf.columns

['raceID', 'year', 'round', 'circuitId', 'name', 'date', 'time', 'url']

In [23]:
# Add two derived columns to the raw races data:
#   ingestion_timestamp - current_timestamp() of the query that evaluates it
#   race_timestamp      - "date" and "time" joined by a single space and
#                         parsed with the pattern yyyy-MM-dd HH:mm:ss
# NOTE(review): the displayed ingestion values differ between the show() in
# this notebook run and the data read back from Parquet later on, which
# suggests the timestamp is re-evaluated per action — confirm this is intended.
race_datetime_text = concat(col("date"), lit(" "), col("time"))

races_sdf_curated = (
    races_sdf
    .withColumn("ingestion_timestamp", current_timestamp())
    .withColumn("race_timestamp", to_timestamp(race_datetime_text, 'yyyy-MM-dd HH:mm:ss'))
)

races_sdf_curated.show(10)

+------+----+-----+---------+--------------------+----------+--------+--------------------+--------------------+-------------------+
|raceID|year|round|circuitId|                name|      date|    time|                 url| ingestion_timestamp|     race_timestamp|
+------+----+-----+---------+--------------------+----------+--------+--------------------+--------------------+-------------------+
|     1|2009|    1|        1|Australian Grand ...|2009-03-29|06:00:00|http://en.wikiped...|2023-10-10 14:52:...|2009-03-29 06:00:00|
|     2|2009|    2|        2|Malaysian Grand Prix|2009-04-05|09:00:00|http://en.wikiped...|2023-10-10 14:52:...|2009-04-05 09:00:00|
|     3|2009|    3|       17|  Chinese Grand Prix|2009-04-19|07:00:00|http://en.wikiped...|2023-10-10 14:52:...|2009-04-19 07:00:00|
|     4|2009|    4|        3|  Bahrain Grand Prix|2009-04-26|12:00:00|http://en.wikiped...|2023-10-10 14:52:...|2009-04-26 12:00:00|
|     5|2009|    5|        4|  Spanish Grand Prix|2009-05-10|12:00:00

                                                                                

In [24]:
# NOTE(review): col is already available from the wildcard import in the
# first cell; this repeated import is redundant and is kept only so the cell
# runs on its own.
from pyspark.sql.functions import col

# Source column -> output column, in output order. The date, time and url
# columns are intentionally absent, so they are dropped by the select below.
selected_column_aliases = {
    "raceID": "race_id",
    "year": "race_year",
    "circuitId": "circuit_id",
    "round": "round",
    "name": "name",
    "ingestion_timestamp": "ingested_date",
    "race_timestamp": "race_timestamp",
}

races_sdf_select = races_sdf_curated.select(
    [
        col(source_name).alias(output_name)
        for source_name, output_name in selected_column_aliases.items()
    ]
)

races_sdf_select.show(10)

[Stage 12:>                                                         (0 + 1) / 1]

+-------+---------+----------+-----+--------------------+--------------------+-------------------+
|race_id|race_year|circuit_id|round|                name|       ingested_date|     race_timestamp|
+-------+---------+----------+-----+--------------------+--------------------+-------------------+
|      1|     2009|         1|    1|Australian Grand ...|2023-10-10 14:52:...|2009-03-29 06:00:00|
|      2|     2009|         2|    2|Malaysian Grand Prix|2023-10-10 14:52:...|2009-04-05 09:00:00|
|      3|     2009|        17|    3|  Chinese Grand Prix|2023-10-10 14:52:...|2009-04-19 07:00:00|
|      4|     2009|         3|    4|  Bahrain Grand Prix|2023-10-10 14:52:...|2009-04-26 12:00:00|
|      5|     2009|         4|    5|  Spanish Grand Prix|2023-10-10 14:52:...|2009-05-10 12:00:00|
|      6|     2009|         6|    6|   Monaco Grand Prix|2023-10-10 14:52:...|2009-05-24 12:00:00|
|      7|     2009|         5|    7|  Turkish Grand Prix|2023-10-10 14:52:...|2009-06-07 12:00:00|
|      8| 

                                                                                

In [25]:
# Print the schema of the selected DataFrame to check the final column names,
# data types and nullability before writing it out.

races_sdf_select.printSchema()

root
 |-- race_id: string (nullable = true)
 |-- race_year: string (nullable = true)
 |-- circuit_id: string (nullable = true)
 |-- round: string (nullable = true)
 |-- name: string (nullable = true)
 |-- ingested_date: timestamp (nullable = false)
 |-- race_timestamp: timestamp (nullable = true)



## Write Data to S3 in Parquet Format

In [26]:
# Root S3 location (s3a scheme) of the processed layer; the Parquet output of
# this notebook is written to and read back from underneath this prefix.
processed_layer = "s3a://oasiscorp-curated/formula-oasis"

In [27]:
# Persist the selected race columns as Parquet under the processed layer.
# "overwrite" mode replaces whatever an earlier run left at the same path.
races_output_path = f"{processed_layer}/races"
races_writer = races_sdf_select.write.mode("overwrite")
races_writer.parquet(races_output_path)

                                                                                

## Read Data Back From S3

In [28]:
# Load the Parquet dataset written above back from the processed layer and
# preview the first 10 rows as a check on the write.
races_parquet_path = f"{processed_layer}/races"
races_df = spark.read.parquet(races_parquet_path)

races_df.show(10)

[Stage 15:>                                                         (0 + 1) / 1]

+-------+---------+----------+-----+--------------------+--------------------+-------------------+
|race_id|race_year|circuit_id|round|                name|       ingested_date|     race_timestamp|
+-------+---------+----------+-----+--------------------+--------------------+-------------------+
|      1|     2009|         1|    1|Australian Grand ...|2023-10-10 15:08:...|2009-03-29 06:00:00|
|      2|     2009|         2|    2|Malaysian Grand Prix|2023-10-10 15:08:...|2009-04-05 09:00:00|
|      3|     2009|        17|    3|  Chinese Grand Prix|2023-10-10 15:08:...|2009-04-19 07:00:00|
|      4|     2009|         3|    4|  Bahrain Grand Prix|2023-10-10 15:08:...|2009-04-26 12:00:00|
|      5|     2009|         4|    5|  Spanish Grand Prix|2023-10-10 15:08:...|2009-05-10 12:00:00|
|      6|     2009|         6|    6|   Monaco Grand Prix|2023-10-10 15:08:...|2009-05-24 12:00:00|
|      7|     2009|         5|    7|  Turkish Grand Prix|2023-10-10 15:08:...|2009-06-07 12:00:00|
|      8| 

                                                                                

In [29]:
# List the distinct race years present in the processed data. show() is called
# without a row count, and the output below is cut off at the top 20 rows; no
# ordering is applied to the result.
distinct_race_years = races_df.select("race_year").distinct()
distinct_race_years.show()

[Stage 16:>                                                         (0 + 1) / 1]

+---------+
|race_year|
+---------+
|     1953|
|     1957|
|     1987|
|     1956|
|     2016|
|     2012|
|     2020|
|     1958|
|     1972|
|     1988|
|     2019|
|     2017|
|     1977|
|     1971|
|     2014|
|     1984|
|     1982|
|     2013|
|     2005|
|     2000|
+---------+
only showing top 20 rows



                                                                                