## Import Libraries

In [1]:
# Import Necessary Libraries
import os
import hashlib
import urllib.request
import json
from datetime import timedelta, date

from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *

from delta.tables import DeltaTable

## Initialize Spark Session

In [2]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that is already running.
# The only explicit setting maps the s3a:// scheme to Hadoop's S3A filesystem class.
spark = (
    SparkSession.builder
    .appName("DeltaLake")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)




:: loading settings :: url = jar:file:/Users/oasis/sources/spark-3.2.0/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/oasis/.ivy2/cache
The jars for the packages stored in: /Users/oasis/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d23d120a-b5fb-40ce-a7f6-e4e2782e4f04;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.0.0 in central
	found io.delta#delta-storage;2.0.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 270ms :: artifacts dl 9ms
	:: modules in use:
	io.delta#delta-core_2.12;2.0.0 from central in [default]
	io.delta#delta-storage;2.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evi

## Data Lake Paths

In [3]:
# Root URI of the raw (landing) layer that the source files are read from.
# It can be overridden with the FORMULA_OASIS_RAW_LAYER environment variable,
# e.g. to point at a dev bucket; an unset or empty variable falls back to the
# default bucket. A trailing "/" is stripped because later cells build paths
# as f"{raw_layer}/<file>".
raw_layer = (
    os.environ.get("FORMULA_OASIS_RAW_LAYER") or "s3a://oasiscorp-raw/formula-oasis"
).rstrip("/")

## Read the Data with an Explicit Schema via the DataFrameReader API

In [4]:
# Explicit schema for pit_stops.json, so column types are declared rather than inferred.
# NOTE(review): "lap" is the only field declared nullable=False; file-based readers
# may relax user-supplied schemas to nullable, so this is probably not enforced
# on read - confirm before relying on it.
pit_schema = (
    StructType()
    .add("raceId", IntegerType(), nullable=True)
    .add("driverId", IntegerType(), nullable=True)
    .add("stop", StringType(), nullable=True)
    .add("lap", IntegerType(), nullable=False)
    .add("time", StringType(), nullable=True)
    .add("duration", StringType(), nullable=True)
    .add("milliseconds", IntegerType(), nullable=True)
)

# Load the raw pit-stop file with the schema above.
# "multiline" is enabled, presumably because records in the file span several
# lines rather than one JSON object per line - confirm against the source file.
pits_sdf = (
    spark.read
    .schema(pit_schema)
    .option("multiline", True)
    .json(f"{raw_layer}/pit_stops.json")
)

23/10/10 13:24:32 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


In [5]:
# Sanity check: list the column names of the raw pit-stop DataFrame.
pits_sdf.columns

['raceId', 'driverId', 'stop', 'lap', 'time', 'duration', 'milliseconds']

In [6]:
# Print a preview of the raw pit-stop rows to confirm the schema was applied.
pits_sdf.show()

[Stage 0:>                                                          (0 + 1) / 1]

+------+--------+----+---+--------+--------+------------+
|raceId|driverId|stop|lap|    time|duration|milliseconds|
+------+--------+----+---+--------+--------+------------+
|   841|     153|   1|  1|17:05:23|  26.898|       26898|
|   841|      30|   1|  1|17:05:52|  25.021|       25021|
|   841|      17|   1| 11|17:20:48|  23.426|       23426|
|   841|       4|   1| 12|17:22:34|  23.251|       23251|
|   841|      13|   1| 13|17:24:10|  23.842|       23842|
|   841|      22|   1| 13|17:24:29|  23.643|       23643|
|   841|      20|   1| 14|17:25:17|  22.603|       22603|
|   841|     814|   1| 14|17:26:03|  24.863|       24863|
|   841|     816|   1| 14|17:26:50|  25.259|       25259|
|   841|      67|   1| 15|17:27:34|  25.342|       25342|
|   841|       2|   1| 15|17:27:41|  22.994|       22994|
|   841|       1|   1| 16|17:28:24|  23.227|       23227|
|   841|     808|   1| 16|17:28:39|  24.535|       24535|
|   841|       3|   1| 16|17:29:00|  23.716|       23716|
|   841|     1

                                                                                

In [7]:
# Build the curated DataFrame: rename the camelCase id columns to snake_case
# and stamp every row with the ingestion timestamp.
pits_sdf_curated = (
    pits_sdf
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("ingested_date", current_timestamp())
)

In [8]:
# Print a preview of the curated rows to verify the renamed columns and the ingested_date column.
pits_sdf_curated.show()

[Stage 1:>                                                          (0 + 1) / 1]

+-------+---------+----+---+--------+--------+------------+--------------------+
|race_id|driver_id|stop|lap|    time|duration|milliseconds|       ingested_date|
+-------+---------+----+---+--------+--------+------------+--------------------+
|    841|      153|   1|  1|17:05:23|  26.898|       26898|2023-10-10 13:25:...|
|    841|       30|   1|  1|17:05:52|  25.021|       25021|2023-10-10 13:25:...|
|    841|       17|   1| 11|17:20:48|  23.426|       23426|2023-10-10 13:25:...|
|    841|        4|   1| 12|17:22:34|  23.251|       23251|2023-10-10 13:25:...|
|    841|       13|   1| 13|17:24:10|  23.842|       23842|2023-10-10 13:25:...|
|    841|       22|   1| 13|17:24:29|  23.643|       23643|2023-10-10 13:25:...|
|    841|       20|   1| 14|17:25:17|  22.603|       22603|2023-10-10 13:25:...|
|    841|      814|   1| 14|17:26:03|  24.863|       24863|2023-10-10 13:25:...|
|    841|      816|   1| 14|17:26:50|  25.259|       25259|2023-10-10 13:25:...|
|    841|       67|   1| 15|

                                                                                

## Write Data to S3 in Parquet Format

In [10]:
# Root URI of the curated (processed) layer that the output is written to.
# It can be overridden with the FORMULA_OASIS_PROCESSED_LAYER environment
# variable, e.g. to write to a dev bucket; an unset or empty variable falls
# back to the default bucket. A trailing "/" is stripped because later cells
# build paths as f"{processed_layer}/<dataset>".
processed_layer = (
    os.environ.get("FORMULA_OASIS_PROCESSED_LAYER")
    or "s3a://oasiscorp-curated/formula-oasis"
).rstrip("/")

In [11]:
# Persist the curated pit stops as Parquet under the processed layer.
# mode="overwrite" replaces whatever a previous run wrote to this path.
pits_sdf_curated.write.parquet(f"{processed_layer}/pit_stops", mode="overwrite")

                                                                                

## Read Data Back From S3

In [12]:
# Read the Parquet output back from the processed layer to verify the write.
# NOTE(review): despite its name, races_df holds the pit-stop rows loaded from
# the pit_stops path; the name is kept in case later cells reference it.
races_df = spark.read.format("parquet").load(f"{processed_layer}/pit_stops")
races_df.show(n=10)

[Stage 4:>                                                          (0 + 1) / 1]

+-------+---------+----+---+--------+--------+------------+--------------------+
|race_id|driver_id|stop|lap|    time|duration|milliseconds|       ingested_date|
+-------+---------+----+---+--------+--------+------------+--------------------+
|    841|      153|   1|  1|17:05:23|  26.898|       26898|2023-10-10 13:25:...|
|    841|       30|   1|  1|17:05:52|  25.021|       25021|2023-10-10 13:25:...|
|    841|       17|   1| 11|17:20:48|  23.426|       23426|2023-10-10 13:25:...|
|    841|        4|   1| 12|17:22:34|  23.251|       23251|2023-10-10 13:25:...|
|    841|       13|   1| 13|17:24:10|  23.842|       23842|2023-10-10 13:25:...|
|    841|       22|   1| 13|17:24:29|  23.643|       23643|2023-10-10 13:25:...|
|    841|       20|   1| 14|17:25:17|  22.603|       22603|2023-10-10 13:25:...|
|    841|      814|   1| 14|17:26:03|  24.863|       24863|2023-10-10 13:25:...|
|    841|      816|   1| 14|17:26:50|  25.259|       25259|2023-10-10 13:25:...|
|    841|       67|   1| 15|

                                                                                