## Import Libraries

In [1]:
# Import Spark libraries
from pyspark.sql import SparkSession
# For Schema 
from pyspark.sql.types import *
# For Column
from pyspark.sql.functions import col
# For Timestamp
from pyspark.sql.functions import current_timestamp
# Creates a Column Object From a Literal Value 
from pyspark.sql.functions import lit
# To Timestamp & Concatenate
from pyspark.sql.functions import to_timestamp, concat 
# Distinct Counts
from pyspark.sql.functions import countDistinct 

## Initiate Spark Session

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession for this notebook. The s3a filesystem
# implementation is set explicitly so that s3a:// URIs are handled by
# Hadoop's S3AFileSystem.
spark = (
    SparkSession.builder
    .appName("DeltaLake")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)




:: loading settings :: url = jar:file:/Users/oasis/sources/spark-3.2.0/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/oasis/.ivy2/cache
The jars for the packages stored in: /Users/oasis/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c7324cd6-4c38-4963-abe0-08bdb05ebdb3;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.0.0 in central
	found io.delta#delta-storage;2.0.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 248ms :: artifacts dl 11ms
	:: modules in use:
	io.delta#delta-core_2.12;2.0.0 from central in [default]
	io.delta#delta-storage;2.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|ev

## Data Lake Paths

In [3]:
# Base S3 (s3a) URI of the raw layer; dataset folders such as "qualifying"
# are appended to this prefix when reading.
raw_layer = "s3a://oasiscorp-raw/formula-oasis"

## Read the Data with an Explicit Schema Using the DataFrameReader API

In [4]:
# Explicit schema for the qualifying dataset, declared as a compact
# (column name, Spark type, nullable) table. The identifier and ranking
# columns are integers; q1-q3 are kept as strings. Only qualifyId is
# declared non-nullable.
qualifying_schema = StructType(fields=[
    StructField(field_name, field_type(), is_nullable)
    for field_name, field_type, is_nullable in (
        ("qualifyId", IntegerType, False),
        ("raceId", IntegerType, True),
        ("driverId", IntegerType, True),
        ("constructorId", IntegerType, True),
        ("number", IntegerType, True),
        ("position", IntegerType, True),
        ("q1", StringType, True),
        ("q2", StringType, True),
        ("q3", StringType, True),
    )
])

# Read the qualifying JSON data from the raw layer using the schema defined
# above. multiLine is enabled so that JSON records spanning several lines
# are parsed instead of requiring one record per line.
qual_sdf = (
    spark.read
    .schema(qualifying_schema)
    .option("multiLine", True)
    .json(f"{raw_layer}/qualifying")
)

23/10/10 13:42:33 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


In [5]:
# List the column names of the DataFrame as read from the raw layer.
qual_sdf.columns

['qualifyId',
 'raceId',
 'driverId',
 'constructorId',
 'number',
 'position',
 'q1',
 'q2',
 'q3']

## Rename Columns & Add New Columns

In [6]:
# Rename the camelCase identifier columns to snake_case and add an
# ingestion_date column populated from current_timestamp().
# The number, position and q1-q3 columns keep their original names.
qual_sdf_curated = (
    qual_sdf
    .withColumnRenamed("qualifyId", "qualify_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumn("ingestion_date", current_timestamp())
)

In [7]:
# Preview the curated DataFrame (show() prints the first 20 rows by default).
qual_sdf_curated.show()

[Stage 0:>                                                          (0 + 1) / 1]

+----------+-------+---------+--------------+------+--------+--------+--------+--------+--------------------+
|qualify_id|race_id|driver_id|constructor_id|number|position|      q1|      q2|      q3|      ingestion_date|
+----------+-------+---------+--------------+------+--------+--------+--------+--------+--------------------+
|         1|     18|        1|             1|    22|       1|1:26.572|1:25.187|1:26.714|2023-10-10 13:44:...|
|         2|     18|        9|             2|     4|       2|1:26.103|1:25.315|1:26.869|2023-10-10 13:44:...|
|         3|     18|        5|             1|    23|       3|1:25.664|1:25.452|1:27.079|2023-10-10 13:44:...|
|         4|     18|       13|             6|     2|       4|1:25.994|1:25.691|1:27.178|2023-10-10 13:44:...|
|         5|     18|        2|             2|     3|       5|1:25.960|1:25.518|1:27.236|2023-10-10 13:44:...|
|         6|     18|       15|             7|    11|       6|1:26.427|1:26.101|1:28.527|2023-10-10 13:44:...|
|         

                                                                                

## Write Data to S3 in Parquet Format

In [8]:
# Base S3 (s3a) URI of the output layer; the curated qualifying data is
# written to and read back from the "qualifying" folder under this prefix.
processed_layer = "s3a://oasiscorp-curated/formula-oasis"

In [9]:
# Persist the curated DataFrame as Parquet under the processed layer.
# "overwrite" mode replaces any data already present at the target path.
(
    qual_sdf_curated.write
    .mode("overwrite")
    .parquet(f"{processed_layer}/qualifying")
)

                                                                                

## Read Data Back From S3

In [10]:
# Read the Parquet output back from the processed layer and preview the
# first 10 rows to confirm the write succeeded.
# NOTE(review): despite its name, races_df holds the *qualifying* dataset;
# the name is left unchanged here in case later cells reference it.
races_df = spark.read.format("parquet").load(f"{processed_layer}/qualifying")
races_df.show(10)

[Stage 3:>                                                          (0 + 1) / 1]

+----------+-------+---------+--------------+------+--------+--------+--------+--------+--------------------+
|qualify_id|race_id|driver_id|constructor_id|number|position|      q1|      q2|      q3|      ingestion_date|
+----------+-------+---------+--------------+------+--------+--------+--------+--------+--------------------+
|         1|     18|        1|             1|    22|       1|1:26.572|1:25.187|1:26.714|2023-10-10 13:45:...|
|         2|     18|        9|             2|     4|       2|1:26.103|1:25.315|1:26.869|2023-10-10 13:45:...|
|         3|     18|        5|             1|    23|       3|1:25.664|1:25.452|1:27.079|2023-10-10 13:45:...|
|         4|     18|       13|             6|     2|       4|1:25.994|1:25.691|1:27.178|2023-10-10 13:45:...|
|         5|     18|        2|             2|     3|       5|1:25.960|1:25.518|1:27.236|2023-10-10 13:45:...|
|         6|     18|       15|             7|    11|       6|1:26.427|1:26.101|1:28.527|2023-10-10 13:45:...|
|         

                                                                                