## Import Libraries

In [1]:
# Import Necessary Libraries
import os
import hashlib
import urllib.request
import json
from datetime import timedelta, date

from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *

from delta.tables import DeltaTable

## Initiate Spark Session

In [2]:
# SparkSession is also imported in the imports cell above; the import is
# repeated here so this cell can be run on its own.
from pyspark.sql import SparkSession

# Build the notebook's Spark session, or reuse one that is already active.
spark = (
    SparkSession.builder
    .appName("DeltaLake")
    # Route s3a:// paths through the Hadoop S3A filesystem implementation.
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)




:: loading settings :: url = jar:file:/Users/oasis/sources/spark-3.2.0/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/oasis/.ivy2/cache
The jars for the packages stored in: /Users/oasis/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f645ac7f-67ca-4666-964e-0989f89e845c;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.0.0 in central
	found io.delta#delta-storage;2.0.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 319ms :: artifacts dl 9ms
	:: modules in use:
	io.delta#delta-core_2.12;2.0.0 from central in [default]
	io.delta#delta-storage;2.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evi

## Data Lake Paths

In [3]:
# Base s3a:// prefix of the raw layer; results.json is read from under this path.
raw_layer = "s3a://oasiscorp-raw/formula-oasis"

## Define Schema

In [4]:
# Explicit schema for results.json, built from StructType/StructField and
# passed to the reader so column types are declared rather than inferred.
#
# Every field is declared nullable. Spark's file-based readers load columns as
# nullable, so a lone nullable=False (previously set on constructorId only)
# would not be enforced on read and only made the schema inconsistent.
results_schema = StructType([
    StructField("resultId", IntegerType(), nullable=True),
    StructField("raceId", IntegerType(), nullable=True),
    StructField("driverId", IntegerType(), nullable=True),
    StructField("constructorId", IntegerType(), nullable=True),
    StructField("number", IntegerType(), nullable=True),
    StructField("grid", IntegerType(), nullable=True),
    StructField("position", IntegerType(), nullable=True),
    StructField("positionText", StringType(), nullable=True),
    StructField("positionOrder", IntegerType(), nullable=True),
    StructField("points", FloatType(), nullable=True),
    StructField("laps", IntegerType(), nullable=True),
    StructField("time", StringType(), nullable=True),
    StructField("milliseconds", IntegerType(), nullable=True),
    StructField("fastestLap", IntegerType(), nullable=True),
    StructField("rank", StringType(), nullable=True),
    StructField("fastestLapTime", StringType(), nullable=True),
    StructField("fastestLapSpeed", FloatType(), nullable=True),
    StructField("statusId", IntegerType(), nullable=True),
])

## Read the Data with an Explicit Schema (DataFrameReader API)

In [5]:
# Load results.json from the raw layer, applying the explicit results_schema
# instead of letting Spark infer the column types.
results_df = (
    spark.read
    .schema(results_schema)
    .json(f"{raw_layer}/results.json")
)

23/10/10 12:26:08 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


In [6]:
# Print the summary table produced by describe() for the loaded results DataFrame.
results_df.describe().show()



+-------+------------------+-----------------+------------------+-----------------+------------------+------------------+-----------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+--------------+------------------+-----------------+
|summary|          resultId|           raceId|          driverId|    constructorId|            number|              grid|         position|      positionText|    positionOrder|            points|              laps|              time|      milliseconds|        fastestLap|              rank|fastestLapTime|   fastestLapSpeed|         statusId|
+-------+------------------+-----------------+------------------+-----------------+------------------+------------------+-----------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+--------------+---------

                                                                                

In [7]:
# List the column names of the loaded DataFrame (still the camelCase source names).
results_df.columns

['resultId',
 'raceId',
 'driverId',
 'constructorId',
 'number',
 'grid',
 'position',
 'positionText',
 'positionOrder',
 'points',
 'laps',
 'time',
 'milliseconds',
 'fastestLap',
 'rank',
 'fastestLapTime',
 'fastestLapSpeed',
 'statusId']

In [8]:
# Rename the camelCase source columns to snake_case and add an ingestion
# timestamp column.
#
# Columns whose names are already single lowercase words (number, grid,
# position, points, laps, time, milliseconds, rank) are left as they are;
# the former "time" -> "time" rename was a no-op and has been removed.
# statusId is not renamed here.
results_sdf_curated = (
    results_df
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("fastestLap", "fastest_lap")
    .withColumnRenamed("fastestLapSpeed", "fastest_lap_speed")
    .withColumnRenamed("fastestLapTime", "fastest_lap_time")
    .withColumnRenamed("positionOrder", "position_order")
    .withColumnRenamed("positionText", "position_text")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("resultId", "result_id")
    # Record when each row was ingested.
    .withColumn("ingested_date", current_timestamp())
)

In [9]:
# Remove the statusId column so it is not part of the curated DataFrame
# that is displayed and written out below.
results_sdf_curated = results_sdf_curated.drop("statusId")

In [10]:
# Preview the curated DataFrame to check the renamed columns and ingested_date.
results_sdf_curated.show()

[Stage 3:>                                                          (0 + 1) / 1]

+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+--------------------+
|result_id|race_id|driver_id|constructor_id|number|grid|position|position_text|position_order|points|laps|       time|milliseconds|fastest_lap|rank|fastest_lap_time|fastest_lap_speed|       ingested_date|
+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+--------------------+
|        1|     18|        1|             1|    22|   1|       1|            1|             1|  10.0|  58|1:34:50.616|     5690616|         39|   2|        1:27.452|            218.3|2023-10-10 12:32:...|
|        2|     18|        2|             2|     3|   5|       2|            2|             2|   8.0|  58|     +5.478|     5696094|         41|   3|        1:27.739|          217.5

                                                                                

## Write Data to S3 in parquet format

In [11]:
# Base s3a:// prefix of the curated bucket; the results dataset is written
# under this path and read back from it.
processed_layer = "s3a://oasiscorp-curated/formula-oasis"

In [12]:
# Persist the curated results as Parquet under the processed layer.
# "overwrite" mode replaces any data already present at the target path.
(
    results_sdf_curated.write
    .mode("overwrite")
    .parquet(f"{processed_layer}/results")
)

                                                                                

## Read Data Back From S3

In [13]:
# Read the curated results back from the Parquet location written above and
# preview the first 10 rows to verify the write.
# NOTE(review): this DataFrame holds results data, not races; the name
# `races_df` looks like a leftover. Consider renaming it once any later
# cells that use it have been checked.

races_df = spark.read.parquet(f"{processed_layer}/results")
races_df.show(10)

[Stage 6:>                                                          (0 + 1) / 1]

+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+--------------------+
|result_id|race_id|driver_id|constructor_id|number|grid|position|position_text|position_order|points|laps|       time|milliseconds|fastest_lap|rank|fastest_lap_time|fastest_lap_speed|       ingested_date|
+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+--------------------+
|        1|     18|        1|             1|    22|   1|       1|            1|             1|  10.0|  58|1:34:50.616|     5690616|         39|   2|        1:27.452|            218.3|2023-10-10 12:32:...|
|        2|     18|        2|             2|     3|   5|       2|            2|             2|   8.0|  58|     +5.478|     5696094|         41|   3|        1:27.739|          217.5

                                                                                