## Import Libraries

In [2]:
# Import Necessary Libraries
import os
import hashlib
import urllib.request
import json
from datetime import timedelta, date

from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *

from delta.tables import DeltaTable

## Initiate Spark Session

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("DeltaLake") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .getOrCreate()




:: loading settings :: url = jar:file:/Users/oasis/sources/spark-3.2.0/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/oasis/.ivy2/cache
The jars for the packages stored in: /Users/oasis/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-35cbb979-1b78-4cf9-b9fb-8868c1aa28cf;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.0.0 in central
	found io.delta#delta-storage;2.0.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 292ms :: artifacts dl 9ms
	:: modules in use:
	io.delta#delta-core_2.12;2.0.0 from central in [default]
	io.delta#delta-storage;2.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evi

## Data Lake Paths

In [4]:
raw_layer = "s3a://oasiscorp-raw/formula-oasis"

## Deifine Schema

In [5]:
circ_schema = StructType(fields=[StructField("circuitId", IntegerType(), False),
                            StructField("circuitRef", StringType(), True),
                            StructField("name", StringType(), True),
                            StructField("location", StringType(), True),
                            StructField("country", StringType(), True),
                            StructField("lat", FloatType(), True),
                            StructField("lng", FloatType(), True),
                            StructField("alt", IntegerType(), True),
                            StructField("url", StringType(), True)])

## Read the data and specify the schema in datareader API

In [6]:
# Read the data from the raw layer
circuits_sdf = spark.read \
                .option("header", "true") \
                .schema(circ_schema) \
                .csv(f"{raw_layer}/circuits.csv")

23/10/10 11:08:21 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


In [7]:
circuits_sdf.show(10)

[Stage 0:>                                                          (0 + 1) / 1]

+---------+--------------+--------------------+------------+---------+--------+--------+---+--------------------+
|circuitId|    circuitRef|                name|    location|  country|     lat|     lng|alt|                 url|
+---------+--------------+--------------------+------------+---------+--------+--------+---+--------------------+
|        1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497| 144.968| 10|http://en.wikiped...|
|        2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083| 101.738| 18|http://en.wikiped...|
|        3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325| 50.5106|  7|http://en.wikiped...|
|        4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57| 2.26111|109|http://en.wikiped...|
|        5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|  29.405|130|http://en.wikiped...|
|        6|        monaco|   Circuit de Monaco| Monte-Carlo|   Monaco| 43.7347| 7.42056|

                                                                                

In [8]:
circuits_sdf.columns

['circuitId',
 'circuitRef',
 'name',
 'location',
 'country',
 'lat',
 'lng',
 'alt',
 'url']

In [9]:
## Select All columns besides the url 

from pyspark.sql.functions import col, lit, current_timestamp

circuits_sdf = circuits_sdf.select(
    col("circuitId").alias("circuit_id"),
    col("circuitRef").alias("circuit_ref"), 
    col("name").alias("circuit_name"), 
    col("location").alias("location"), 
    col("lat").alias("latitude"), 
    col("lng").alias("longitude"), 
    col("alt").alias("altitude"))

circuits_sdf.show(10)

[Stage 1:>                                                          (0 + 1) / 1]

+----------+--------------+--------------------+------------+--------+---------+--------+
|circuit_id|   circuit_ref|        circuit_name|    location|latitude|longitude|altitude|
+----------+--------------+--------------------+------------+--------+---------+--------+
|         1|   albert_park|Albert Park Grand...|   Melbourne|-37.8497|  144.968|      10|
|         2|        sepang|Sepang Internatio...|Kuala Lumpur| 2.76083|  101.738|      18|
|         3|       bahrain|Bahrain Internati...|      Sakhir| 26.0325|  50.5106|       7|
|         4|     catalunya|Circuit de Barcel...|    Montmeló|   41.57|  2.26111|     109|
|         5|      istanbul|       Istanbul Park|    Istanbul| 40.9517|   29.405|     130|
|         6|        monaco|   Circuit de Monaco| Monte-Carlo| 43.7347|  7.42056|       7|
|         7|    villeneuve|Circuit Gilles Vi...|    Montreal|    45.5| -73.5228|      13|
|         8|   magny_cours|Circuit de Nevers...| Magny Cours| 46.8642|  3.16361|     228|
|         

                                                                                

In [10]:
# add an ingestion date column
circuits_sdf = circuits_sdf.withColumn("ingested_date", lit(current_timestamp()))
circuits_sdf.show(10)

[Stage 2:>                                                          (0 + 1) / 1]

+----------+--------------+--------------------+------------+--------+---------+--------+--------------------+
|circuit_id|   circuit_ref|        circuit_name|    location|latitude|longitude|altitude|       ingested_date|
+----------+--------------+--------------------+------------+--------+---------+--------+--------------------+
|         1|   albert_park|Albert Park Grand...|   Melbourne|-37.8497|  144.968|      10|2023-10-10 11:08:...|
|         2|        sepang|Sepang Internatio...|Kuala Lumpur| 2.76083|  101.738|      18|2023-10-10 11:08:...|
|         3|       bahrain|Bahrain Internati...|      Sakhir| 26.0325|  50.5106|       7|2023-10-10 11:08:...|
|         4|     catalunya|Circuit de Barcel...|    Montmeló|   41.57|  2.26111|     109|2023-10-10 11:08:...|
|         5|      istanbul|       Istanbul Park|    Istanbul| 40.9517|   29.405|     130|2023-10-10 11:08:...|
|         6|        monaco|   Circuit de Monaco| Monte-Carlo| 43.7347|  7.42056|       7|2023-10-10 11:08:...|
|

                                                                                

In [11]:
# print Schema 

circuits_sdf.printSchema()

root
 |-- circuit_id: integer (nullable = true)
 |-- circuit_ref: string (nullable = true)
 |-- circuit_name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- altitude: integer (nullable = true)
 |-- ingested_date: timestamp (nullable = false)



## Write Data to S3 in parquet format

In [12]:
processed_layer = "s3a://oasiscorp-curated/formula-oasis"

In [13]:
# write in parquet format

circuits_sdf.write \
                .mode("overwrite") \
                    .parquet(f"{processed_layer}/circuits")

                                                                                

## Read Data Back From S3

In [14]:
# Read From Parquet

circuits_sdfs = spark.read.parquet(f"{processed_layer}/circuits")
circuits_sdfs.show(10)

[Stage 5:>                                                          (0 + 1) / 1]

+----------+--------------+--------------------+------------+--------+---------+--------+--------------------+
|circuit_id|   circuit_ref|        circuit_name|    location|latitude|longitude|altitude|       ingested_date|
+----------+--------------+--------------------+------------+--------+---------+--------+--------------------+
|         1|   albert_park|Albert Park Grand...|   Melbourne|-37.8497|  144.968|      10|2023-10-10 11:08:...|
|         2|        sepang|Sepang Internatio...|Kuala Lumpur| 2.76083|  101.738|      18|2023-10-10 11:08:...|
|         3|       bahrain|Bahrain Internati...|      Sakhir| 26.0325|  50.5106|       7|2023-10-10 11:08:...|
|         4|     catalunya|Circuit de Barcel...|    Montmeló|   41.57|  2.26111|     109|2023-10-10 11:08:...|
|         5|      istanbul|       Istanbul Park|    Istanbul| 40.9517|   29.405|     130|2023-10-10 11:08:...|
|         6|        monaco|   Circuit de Monaco| Monte-Carlo| 43.7347|  7.42056|       7|2023-10-10 11:08:...|
|

                                                                                

In [15]:
## Create a View For SQL Queries

circuits_sdfs.createOrReplaceTempView("circuits")

spark.sql("select * from circuits").show(10)

[Stage 6:>                                                          (0 + 1) / 1]

+----------+--------------+--------------------+------------+--------+---------+--------+--------------------+
|circuit_id|   circuit_ref|        circuit_name|    location|latitude|longitude|altitude|       ingested_date|
+----------+--------------+--------------------+------------+--------+---------+--------+--------------------+
|         1|   albert_park|Albert Park Grand...|   Melbourne|-37.8497|  144.968|      10|2023-10-10 11:08:...|
|         2|        sepang|Sepang Internatio...|Kuala Lumpur| 2.76083|  101.738|      18|2023-10-10 11:08:...|
|         3|       bahrain|Bahrain Internati...|      Sakhir| 26.0325|  50.5106|       7|2023-10-10 11:08:...|
|         4|     catalunya|Circuit de Barcel...|    Montmeló|   41.57|  2.26111|     109|2023-10-10 11:08:...|
|         5|      istanbul|       Istanbul Park|    Istanbul| 40.9517|   29.405|     130|2023-10-10 11:08:...|
|         6|        monaco|   Circuit de Monaco| Monte-Carlo| 43.7347|  7.42056|       7|2023-10-10 11:08:...|
|

                                                                                