## Import Libraries

In [1]:
# Import Necessary Libraries
import os
import hashlib
import urllib.request
import json
from datetime import timedelta, date

from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *

from delta.tables import DeltaTable

## Initiate Spark Session

In [2]:
from pyspark.sql import SparkSession

# Build the notebook's Spark session, or reuse one that is already running.
# The S3A filesystem implementation is configured explicitly; the s3a://
# paths used later in this notebook go through it.
spark = (
    SparkSession.builder
    .appName("DeltaLake")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)


23/10/10 11:58:14 WARN Utils: Your hostname, OASIS-CORP.local resolves to a loopback address: 127.0.0.1; using 192.168.225.160 instead (on interface en0)
23/10/10 11:58:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/oasis/sources/spark-3.2.0/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/oasis/.ivy2/cache
The jars for the packages stored in: /Users/oasis/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5b1ec802-ad70-45d5-9e07-aba01db19981;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.0.0 in central
	found io.delta#delta-storage;2.0.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 239ms :: artifacts dl 10ms
	:: modules in use:
	io.delta#delta-core_2.12;2.0.0 from central in [default]
	io.delta#delta-storage;2.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|ev

## Data Lake Paths

In [3]:
# Base s3a URI of the raw layer; source files are read from under this prefix.
raw_layer = "s3a://oasiscorp-raw/formula-oasis"

## Define Schema

In [6]:
# Schema of the nested "name" object inside each drivers JSON record
name_schema = (
    StructType()
    .add("forename", StringType(), False)
    .add("surname", StringType(), True)
)

# Schema of one record in drivers.json; "name" is the nested struct above.
drivers_schema = (
    StructType()
    .add("driverId", IntegerType(), True)
    .add("driverRef", StringType(), True)
    .add("code", StringType(), True)
    .add("name", name_schema)
    .add("dob", DateType(), True)
    .add("nationality", StringType(), True)
    .add("url", StringType(), True)
)

## Read the data, specifying the schema via the DataFrameReader API

In [7]:
# Load drivers.json from the raw layer, applying the explicit schema
# instead of letting Spark infer one.
drivers_sdf = spark.read.json(
    f"{raw_layer}/drivers.json",
    schema=drivers_schema,
)

23/10/10 12:02:54 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


In [9]:
# Preview the first 10 rows of the raw drivers data.
drivers_sdf.show(n=10)

[Stage 0:>                                                          (0 + 1) / 1]

+--------+----------+----+--------------------+----------+-----------+--------------------+
|driverId| driverRef|code|                name|       dob|nationality|                 url|
+--------+----------+----+--------------------+----------+-----------+--------------------+
|       1|  hamilton| HAM|   {Lewis, Hamilton}|1985-01-07|    British|http://en.wikiped...|
|       2|  heidfeld| HEI|    {Nick, Heidfeld}|1977-05-10|     German|http://en.wikiped...|
|       3|   rosberg| ROS|     {Nico, Rosberg}|1985-06-27|     German|http://en.wikiped...|
|       4|    alonso| ALO|  {Fernando, Alonso}|1981-07-29|    Spanish|http://en.wikiped...|
|       5|kovalainen| KOV|{Heikki, Kovalainen}|1981-10-19|    Finnish|http://en.wikiped...|
|       6|  nakajima| NAK|  {Kazuki, Nakajima}|1985-01-11|   Japanese|http://en.wikiped...|
|       7|  bourdais| BOU|{Sébastien, Bourd...|1979-02-28|     French|http://en.wikiped...|
|       8| raikkonen| RAI|   {Kimi, Räikkönen}|1979-10-17|    Finnish|http://en.

                                                                                

In [11]:
## Rename columns to snake_case, flatten the name struct, add an ingestion timestamp

# NOTE(review): this cell rebinds drivers_sdf, so it is not re-runnable on its
# own -- after one run "name" is a plain string and name.forename / name.surname
# can no longer be resolved. Re-run the read cell first.
drivers_sdf = (
    drivers_sdf
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("driverRef", "driver_ref")
    # Replace the {forename, surname} struct with a single "forename surname" string.
    .withColumn("name", concat(col("name.forename"), lit(" "), col("name.surname")))
    # New column, appended after the existing ones.
    .withColumn("ingested_date", current_timestamp())
)
drivers_sdf.show(n=10)

[Stage 1:>                                                          (0 + 1) / 1]

+---------+----------+----+------------------+----------+-----------+--------------------+--------------------+
|driver_id|driver_ref|code|              name|       dob|nationality|                 url|       ingested_date|
+---------+----------+----+------------------+----------+-----------+--------------------+--------------------+
|        1|  hamilton| HAM|    Lewis Hamilton|1985-01-07|    British|http://en.wikiped...|2023-10-10 12:07:...|
|        2|  heidfeld| HEI|     Nick Heidfeld|1977-05-10|     German|http://en.wikiped...|2023-10-10 12:07:...|
|        3|   rosberg| ROS|      Nico Rosberg|1985-06-27|     German|http://en.wikiped...|2023-10-10 12:07:...|
|        4|    alonso| ALO|   Fernando Alonso|1981-07-29|    Spanish|http://en.wikiped...|2023-10-10 12:07:...|
|        5|kovalainen| KOV| Heikki Kovalainen|1981-10-19|    Finnish|http://en.wikiped...|2023-10-10 12:07:...|
|        6|  nakajima| NAK|   Kazuki Nakajima|1985-01-11|   Japanese|http://en.wikiped...|2023-10-10 12:

                                                                                

In [12]:
# List the column names of drivers_sdf at this point in the notebook.
drivers_sdf.columns

['driver_id',
 'driver_ref',
 'code',
 'name',
 'dob',
 'nationality',
 'url',
 'ingested_date']

In [13]:
# Drop the url column; the result is the frame that gets written out below.
drivers_sdf_final = drivers_sdf.drop("url")

In [14]:
# Preview the final frame (url column removed) before writing it out.
drivers_sdf_final.show(n=10)

[Stage 2:>                                                          (0 + 1) / 1]

+---------+----------+----+------------------+----------+-----------+--------------------+
|driver_id|driver_ref|code|              name|       dob|nationality|       ingested_date|
+---------+----------+----+------------------+----------+-----------+--------------------+
|        1|  hamilton| HAM|    Lewis Hamilton|1985-01-07|    British|2023-10-10 12:08:...|
|        2|  heidfeld| HEI|     Nick Heidfeld|1977-05-10|     German|2023-10-10 12:08:...|
|        3|   rosberg| ROS|      Nico Rosberg|1985-06-27|     German|2023-10-10 12:08:...|
|        4|    alonso| ALO|   Fernando Alonso|1981-07-29|    Spanish|2023-10-10 12:08:...|
|        5|kovalainen| KOV| Heikki Kovalainen|1981-10-19|    Finnish|2023-10-10 12:08:...|
|        6|  nakajima| NAK|   Kazuki Nakajima|1985-01-11|   Japanese|2023-10-10 12:08:...|
|        7|  bourdais| BOU|Sébastien Bourdais|1979-02-28|     French|2023-10-10 12:08:...|
|        8| raikkonen| RAI|    Kimi Räikkönen|1979-10-17|    Finnish|2023-10-10 12:08:...|

                                                                                

## Write Data to S3 in parquet format

In [15]:
# Base s3a URI of the processed layer; Parquet output is written under this prefix.
processed_layer = "s3a://oasiscorp-curated/formula-oasis"

In [16]:
# Write the final drivers frame as Parquet under the processed layer,
# overwriting whatever already exists at that path.
drivers_sdf_final.write.parquet(
    f"{processed_layer}/drivers",
    mode="overwrite",
)

                                                                                

## Read Data Back From S3

In [17]:
# Read the drivers dataset back from Parquet to verify the write above.
drivers_processed_sdf = spark.read.parquet(f"{processed_layer}/drivers")

# Backward-compatible alias: this cell originally bound the drivers data to
# the misleading name `races_df`. Keep it in case later cells still use it.
races_df = drivers_processed_sdf

drivers_processed_sdf.show(10)

[Stage 5:>                                                          (0 + 1) / 1]

+---------+----------+----+------------------+----------+-----------+--------------------+
|driver_id|driver_ref|code|              name|       dob|nationality|       ingested_date|
+---------+----------+----+------------------+----------+-----------+--------------------+
|        1|  hamilton| HAM|    Lewis Hamilton|1985-01-07|    British|2023-10-10 12:11:...|
|        2|  heidfeld| HEI|     Nick Heidfeld|1977-05-10|     German|2023-10-10 12:11:...|
|        3|   rosberg| ROS|      Nico Rosberg|1985-06-27|     German|2023-10-10 12:11:...|
|        4|    alonso| ALO|   Fernando Alonso|1981-07-29|    Spanish|2023-10-10 12:11:...|
|        5|kovalainen| KOV| Heikki Kovalainen|1981-10-19|    Finnish|2023-10-10 12:11:...|
|        6|  nakajima| NAK|   Kazuki Nakajima|1985-01-11|   Japanese|2023-10-10 12:11:...|
|        7|  bourdais| BOU|Sébastien Bourdais|1979-02-28|     French|2023-10-10 12:11:...|
|        8| raikkonen| RAI|    Kimi Räikkönen|1979-10-17|    Finnish|2023-10-10 12:11:...|

                                                                                