## Import Libraries

In [1]:
# Import Necessary Libraries
import os
import hashlib
import urllib.request
import json
from datetime import timedelta, date

from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *

from delta.tables import DeltaTable

## Initiate Spark Session

In [2]:
from pyspark.sql import SparkSession

# Build the notebook's Spark session (or reuse one that already exists).
# The s3a:// scheme is bound to Hadoop's S3AFileSystem implementation so the
# s3a:// paths used further down can be resolved.
spark = (
    SparkSession.builder
    .appName("DeltaLake")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)




:: loading settings :: url = jar:file:/Users/oasis/sources/spark-3.2.0/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/oasis/.ivy2/cache
The jars for the packages stored in: /Users/oasis/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ef230768-ec38-4527-85b8-297dfa613df1;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.0.0 in central
	found io.delta#delta-storage;2.0.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 242ms :: artifacts dl 10ms
	:: modules in use:
	io.delta#delta-core_2.12;2.0.0 from central in [default]
	io.delta#delta-storage;2.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|ev

## Data Lake Paths

In [3]:
# Base s3a:// prefix of the raw layer; the constructors JSON is read from under it.
raw_layer = "s3a://oasiscorp-raw/formula-oasis"

## Define Schema

In [4]:
# Explicit schema for the constructors JSON file: an integer id followed by
# four string columns, all declared nullable.
cons_schema = StructType([
    StructField("constructorId", IntegerType(), True),
    StructField("constructorRef", StringType(), True),
    StructField("name", StringType(), True),
    StructField("nationality", StringType(), True),
    StructField("url", StringType(), True),
])

## Read the data, specifying the schema through the DataFrameReader API

In [5]:
# Load constructors.json from the raw layer, applying the explicit schema
# defined above instead of letting Spark infer one.
cons_sdf = (
    spark.read
    .schema(cons_schema)
    .json(f"{raw_layer}/constructors.json")
)

23/10/10 11:45:34 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


In [6]:
# Preview the first 10 rows of the freshly loaded DataFrame.
cons_sdf.show(10)

[Stage 0:>                                                          (0 + 1) / 1]

+-------------+--------------+-----------+-----------+--------------------+
|constructorId|constructorRef|       name|nationality|                 url|
+-------------+--------------+-----------+-----------+--------------------+
|            1|       mclaren|    McLaren|    British|http://en.wikiped...|
|            2|    bmw_sauber| BMW Sauber|     German|http://en.wikiped...|
|            3|      williams|   Williams|    British|http://en.wikiped...|
|            4|       renault|    Renault|     French|http://en.wikiped...|
|            5|    toro_rosso| Toro Rosso|    Italian|http://en.wikiped...|
|            6|       ferrari|    Ferrari|    Italian|http://en.wikiped...|
|            7|        toyota|     Toyota|   Japanese|http://en.wikiped...|
|            8|   super_aguri|Super Aguri|   Japanese|http://en.wikiped...|
|            9|      red_bull|   Red Bull|   Austrian|http://en.wikiped...|
|           10|   force_india|Force India|     Indian|http://en.wikiped...|
+-----------

                                                                                

In [7]:
# List the current column names (still in the source's camelCase form here).
cons_sdf.columns

['constructorId', 'constructorRef', 'name', 'nationality', 'url']

In [9]:
# Convert the camelCase column names to snake_case, then discard the url
# column, all in a single transformation chain.
cons_sdf = (
    cons_sdf
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumnRenamed("constructorRef", "constructor_ref")
    .drop(col("url"))
)

[Stage 2:>                                                          (0 + 1) / 1]

+--------------+---------------+-----------+-----------+-------------+
|constructor_id|constructor_ref|       name|nationality|ingested_date|
+--------------+---------------+-----------+-----------+-------------+
|             1|        mclaren|    McLaren|    British|   2023-10-10|
|             2|     bmw_sauber| BMW Sauber|     German|   2023-10-10|
|             3|       williams|   Williams|    British|   2023-10-10|
|             4|        renault|    Renault|     French|   2023-10-10|
|             5|     toro_rosso| Toro Rosso|    Italian|   2023-10-10|
|             6|        ferrari|    Ferrari|    Italian|   2023-10-10|
|             7|         toyota|     Toyota|   Japanese|   2023-10-10|
|             8|    super_aguri|Super Aguri|   Japanese|   2023-10-10|
|             9|       red_bull|   Red Bull|   Austrian|   2023-10-10|
|            10|    force_india|Force India|     Indian|   2023-10-10|
+--------------+---------------+-----------+-----------+-------------+
only s

                                                                                

In [11]:
# Add an ingested_date column holding the timestamp at which the query runs.
cons_sdf = cons_sdf.withColumn(
    "ingested_date",
    current_timestamp(),
)

# Preview the first 10 rows, now including the new column.
cons_sdf.show(n=10)

[Stage 3:>                                                          (0 + 1) / 1]

+--------------+---------------+-----------+-----------+--------------------+
|constructor_id|constructor_ref|       name|nationality|       ingested_date|
+--------------+---------------+-----------+-----------+--------------------+
|             1|        mclaren|    McLaren|    British|2023-10-10 11:54:...|
|             2|     bmw_sauber| BMW Sauber|     German|2023-10-10 11:54:...|
|             3|       williams|   Williams|    British|2023-10-10 11:54:...|
|             4|        renault|    Renault|     French|2023-10-10 11:54:...|
|             5|     toro_rosso| Toro Rosso|    Italian|2023-10-10 11:54:...|
|             6|        ferrari|    Ferrari|    Italian|2023-10-10 11:54:...|
|             7|         toyota|     Toyota|   Japanese|2023-10-10 11:54:...|
|             8|    super_aguri|Super Aguri|   Japanese|2023-10-10 11:54:...|
|             9|       red_bull|   Red Bull|   Austrian|2023-10-10 11:54:...|
|            10|    force_india|Force India|     Indian|2023-10-

                                                                                

## Write Data to S3 in Parquet Format

In [12]:
# Base s3a:// prefix under which the transformed constructors data is written.
# NOTE(review): the bucket is named "curated" while the variable says
# "processed" — confirm which layer name is intended.
processed_layer = "s3a://oasiscorp-curated/formula-oasis"

In [13]:
# Persist the transformed constructors DataFrame as Parquet under the
# processed layer; "overwrite" mode replaces any data already at that path.
(
    cons_sdf.write
    .mode("overwrite")
    .parquet(f"{processed_layer}/constructors")
)

                                                                                

## Read Data Back From S3

In [14]:
# Read the constructors Parquet data back from the processed layer to verify
# the write. The DataFrame holds constructors, so it gets a matching name.
cons_processed_sdf = spark.read.parquet(f"{processed_layer}/constructors")

# `races_df` was the original (copy-pasted, misleading) name for this
# DataFrame; it is kept as an alias so any cell still using it keeps working.
races_df = cons_processed_sdf

# Preview the first 10 rows that were read back.
cons_processed_sdf.show(10)

[Stage 6:>                                                          (0 + 1) / 1]

+--------------+---------------+-----------+-----------+--------------------+
|constructor_id|constructor_ref|       name|nationality|       ingested_date|
+--------------+---------------+-----------+-----------+--------------------+
|             1|        mclaren|    McLaren|    British|2023-10-10 11:55:...|
|             2|     bmw_sauber| BMW Sauber|     German|2023-10-10 11:55:...|
|             3|       williams|   Williams|    British|2023-10-10 11:55:...|
|             4|        renault|    Renault|     French|2023-10-10 11:55:...|
|             5|     toro_rosso| Toro Rosso|    Italian|2023-10-10 11:55:...|
|             6|        ferrari|    Ferrari|    Italian|2023-10-10 11:55:...|
|             7|         toyota|     Toyota|   Japanese|2023-10-10 11:55:...|
|             8|    super_aguri|Super Aguri|   Japanese|2023-10-10 11:55:...|
|             9|       red_bull|   Red Bull|   Austrian|2023-10-10 11:55:...|
|            10|    force_india|Force India|     Indian|2023-10-

                                                                                