## Import Libraries

In [1]:
# Import Spark libraries
from pyspark.sql import SparkSession
# For Schema 
from pyspark.sql.types import *
# For Column
from pyspark.sql.functions import col
# For Timestamp
from pyspark.sql.functions import current_timestamp
# Creates a Column Object From a Literal Value 
from pyspark.sql.functions import lit
# To Timestamp & Concatenate
from pyspark.sql.functions import to_timestamp, concat 
# Distinct Counts
from pyspark.sql.functions import countDistinct 

## Initialize Spark Session

In [2]:
# SparkSession is also imported in the imports cell; the import is repeated
# here so this cell can be run on its own.
from pyspark.sql import SparkSession

# Build a new session (or reuse the active one) named "DeltaLake".
# The fs.s3a.impl setting tells Hadoop which filesystem class handles
# s3a:// paths.
spark = (
    SparkSession.builder
    .appName("DeltaLake")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)




:: loading settings :: url = jar:file:/Users/oasis/sources/spark-3.2.0/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/oasis/.ivy2/cache
The jars for the packages stored in: /Users/oasis/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-3acb5504-1a93-42f4-9d5d-2a4f20e78792;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.0.0 in central
	found io.delta#delta-storage;2.0.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 283ms :: artifacts dl 21ms
	:: modules in use:
	io.delta#delta-core_2.12;2.0.0 from central in [default]
	io.delta#delta-storage;2.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|ev

## Data Lake Paths

In [3]:
# Base S3 (s3a://) prefix of the raw layer; datasets are read from sub-paths of it.
raw_layer = "s3a://oasiscorp-raw/formula-oasis"

## Read the Data with an Explicit Schema Using the DataFrameReader API

In [4]:
# Explicit schema for the lap-times data so that column types are declared
# rather than inferred. Fields are listed in the same order as the columns
# shown in the preview output.
# NOTE(review): raceId is declared non-nullable; file-based readers may relax
# this to nullable on load -- confirm if the constraint matters downstream.
lap_times_schema = StructType(
    fields=[
        StructField("raceId", IntegerType(), False),
        StructField("driverId", IntegerType(), True),
        StructField("lap", IntegerType(), True),
        StructField("position", IntegerType(), True),
        StructField("time", StringType(), True),
        StructField("milliseconds", IntegerType(), True),
    ]
)

# Read the CSV data at the raw-layer lap_times path, applying the schema above.
lap_times_sdf = spark.read.csv(f"{raw_layer}/lap_times", schema=lap_times_schema)

23/10/10 13:30:59 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


In [6]:
# List the column names of the DataFrame read from CSV.
lap_times_sdf.columns

['raceId', 'driverId', 'lap', 'position', 'time', 'milliseconds']

In [5]:
# Preview the raw lap-times rows as read from the CSV source.
lap_times_sdf.show()

[Stage 0:>                                                          (0 + 1) / 1]

+------+--------+---+--------+--------+------------+
|raceId|driverId|lap|position|    time|milliseconds|
+------+--------+---+--------+--------+------------+
|   841|      20|  1|       1|1:38.109|       98109|
|   841|      20|  2|       1|1:33.006|       93006|
|   841|      20|  3|       1|1:32.713|       92713|
|   841|      20|  4|       1|1:32.803|       92803|
|   841|      20|  5|       1|1:32.342|       92342|
|   841|      20|  6|       1|1:32.605|       92605|
|   841|      20|  7|       1|1:32.502|       92502|
|   841|      20|  8|       1|1:32.537|       92537|
|   841|      20|  9|       1|1:33.240|       93240|
|   841|      20| 10|       1|1:32.572|       92572|
|   841|      20| 11|       1|1:32.669|       92669|
|   841|      20| 12|       1|1:32.902|       92902|
|   841|      20| 13|       1|1:33.698|       93698|
|   841|      20| 14|       3|1:52.075|      112075|
|   841|      20| 15|       4|1:38.385|       98385|
|   841|      20| 16|       2|1:31.548|       

                                                                                

In [7]:
# Rename the camelCase id columns to snake_case and add an ingested_date
# column holding the current timestamp.
laps_sdf_curated = (
    lap_times_sdf
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("ingested_date", current_timestamp())
)

In [8]:
# Preview the curated DataFrame to check the renamed columns and the new
# ingested_date column.
laps_sdf_curated.show()

[Stage 1:>                                                          (0 + 1) / 1]

+-------+---------+---+--------+--------+------------+--------------------+
|race_id|driver_id|lap|position|    time|milliseconds|       ingested_date|
+-------+---------+---+--------+--------+------------+--------------------+
|    841|       20|  1|       1|1:38.109|       98109|2023-10-10 13:31:...|
|    841|       20|  2|       1|1:33.006|       93006|2023-10-10 13:31:...|
|    841|       20|  3|       1|1:32.713|       92713|2023-10-10 13:31:...|
|    841|       20|  4|       1|1:32.803|       92803|2023-10-10 13:31:...|
|    841|       20|  5|       1|1:32.342|       92342|2023-10-10 13:31:...|
|    841|       20|  6|       1|1:32.605|       92605|2023-10-10 13:31:...|
|    841|       20|  7|       1|1:32.502|       92502|2023-10-10 13:31:...|
|    841|       20|  8|       1|1:32.537|       92537|2023-10-10 13:31:...|
|    841|       20|  9|       1|1:33.240|       93240|2023-10-10 13:31:...|
|    841|       20| 10|       1|1:32.572|       92572|2023-10-10 13:31:...|
|    841|   

                                                                                

## Write Data to S3 in Parquet Format

In [9]:
# Base S3 (s3a://) prefix of the curated layer; datasets are written to and read from sub-paths of it.
processed_layer = "s3a://oasiscorp-curated/formula-oasis"

In [10]:
# Persist the curated lap times as Parquet under the curated layer,
# overwriting whatever already exists at the target path.

laps_sdf_curated.write.parquet(f"{processed_layer}/lap_times", mode="overwrite")

                                                                                

## Read Data Back From S3

In [11]:
# Load the Parquet data back from the curated layer and preview 10 rows.
# NOTE(review): the variable is named races_df but holds lap-times data; the
# name is kept unchanged here.

races_df = spark.read.format("parquet").load(f"{processed_layer}/lap_times")
races_df.show(n=10)

[Stage 4:>                                                          (0 + 1) / 1]

+-------+---------+---+--------+--------+------------+--------------------+
|race_id|driver_id|lap|position|    time|milliseconds|       ingested_date|
+-------+---------+---+--------+--------+------------+--------------------+
|     67|       14| 26|      13|1:25.802|       85802|2023-10-10 13:32:...|
|     67|       14| 27|      13|1:25.338|       85338|2023-10-10 13:32:...|
|     67|       14| 28|      13|1:25.395|       85395|2023-10-10 13:32:...|
|     67|       14| 29|      12|1:26.191|       86191|2023-10-10 13:32:...|
|     67|       14| 30|      11|1:25.439|       85439|2023-10-10 13:32:...|
|     67|       14| 31|      10|1:25.375|       85375|2023-10-10 13:32:...|
|     67|       14| 32|      12|1:28.219|       88219|2023-10-10 13:32:...|
|     67|       14| 33|      13|1:49.156|      109156|2023-10-10 13:32:...|
|     67|       14| 34|      13|1:25.128|       85128|2023-10-10 13:32:...|
|     67|       14| 35|      13|1:25.351|       85351|2023-10-10 13:32:...|
+-------+---

                                                                                