In [2]:
import os
import findspark
import boto3
# Tell findspark which JDK and which Spark distribution to use.
# SPARK_HOME is a relative path, so it is resolved against the directory the
# notebook kernel was started in.
os.environ.update(
    {
        "JAVA_HOME": "/Library/Java/JavaVirtualMachines/temurin-22.jdk/Contents/Home",
        "SPARK_HOME": "spark-3.5.1-bin-hadoop3",
    }
)
findspark.init()
# Last expression of the cell: shows the Spark home findspark resolved
# ('spark-3.5.1-bin-hadoop3' in the recorded run).
findspark.find()

'spark-3.5.1-bin-hadoop3'

In [3]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Set the executor memory as a system property before any context exists.
SparkContext.setSystemProperty('spark.executor.memory', '2g')

# Describe the session first, then create (or reuse) it.
builder = SparkSession.builder.master("local").appName("Colab")
builder = builder.config("spark.ui.port", "4050")
spark = builder.getOrCreate()

# Keep a handle on the underlying SparkContext and display it as the cell result.
sc = spark.sparkContext
sc

24/05/04 00:29:19 WARN Utils: Your hostname, Joses-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 172.20.10.4 instead (on interface en0)
24/05/04 00:29:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/04 00:29:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Credentials are read from the environment; a missing variable raises KeyError.
cubbit_credentials = {
    "aws_access_key_id": os.environ["CUBBIT_ACCESS_KEY_ID"],
    "aws_secret_access_key": os.environ["CUBBIT_SECRET_ACCESS_KEY"],
}
session = boto3.Session(region_name="eu-west-1", **cubbit_credentials)

# S3 client that talks to the Cubbit endpoint rather than the default AWS one.
s3 = session.client(service_name="s3", endpoint_url="https://s3.cubbit.eu")

In [5]:
# The download is written under ./data, so make sure that directory exists
# first; on a fresh checkout the transfer would otherwise fail.
os.makedirs("./data", exist_ok=True)

# NOTE: download_file writes to `Filename` and returns None, so
# `drivers_parquet` is always None. The name is kept only so that any cell
# referencing it keeps working; the data itself is read from the file path.
drivers_parquet = s3.download_file(Bucket="f1-bucket", Key="drivers.parquet", Filename="./data/drivers.parquet")

In [6]:
# Load the local drivers parquet file into a Spark DataFrame.
df = spark.read.load('./data/drivers.parquet', format="parquet")

                                                                                

In [7]:
# Collapse the drivers table to one row per driver_number.
# NOTE(review): Spark does not promise which of the duplicate rows is kept,
# so the other columns may vary between runs - confirm this is acceptable.
non_duplicates = df.drop_duplicates(subset=['driver_number'])

# Print up to 47 rows of the de-duplicated table.
non_duplicates.show(n=47)

                                                                                

+-------------+-----------------+------------+---------------+------------+-----------+-----------+
|driver_number|        full_name|name_acronym|      team_name|country_code|session_key|meeting_key|
+-------------+-----------------+------------+---------------+------------+-----------+-----------+
|            1|   Max VERSTAPPEN|         VER|Red Bull Racing|         NED|       7763|       1140|
|            2|   Logan SARGEANT|         SAR|       Williams|         USA|       7763|       1140|
|            3| Daniel RICCIARDO|         RIC|     AlphaTauri|         AUS|       9127|       1215|
|            4|     Lando NORRIS|         NOR|        McLaren|         GBR|       7763|       1140|
|            5|Gabriel BORTOLETO|         BOR|           NULL|        NULL|       9223|       1215|
|            6|    Oliver GOETHE|         GOE|           NULL|        NULL|       9223|       1215|
|            7| Kaylen FREDERICK|         FRE|           NULL|        NULL|       9223|       1215|


In [8]:
# Fetch pit.parquet from the bucket into the local data directory.
s3.download_file(
    Filename="./data/pit.parquet",
    Key="pit.parquet",
    Bucket="f1-bucket",
)

# Read the downloaded file and preview the first rows.
df_pit = spark.read.load('./data/pit.parquet', format="parquet")
df_pit.show()

+------------+----------+-------------+--------------------+-----------+-----------+
|pit_duration|lap_number|driver_number|                date|session_key|meeting_key|
+------------+----------+-------------+--------------------+-----------+-----------+
|        35.3|         3|           14|2023-06-02T11:33:...|       9095|       1211|
|        40.8|         4|           18|2023-06-02T11:35:...|       9095|       1211|
|       155.2|         3|           21|2023-06-02T11:36:...|       9095|       1211|
|       277.0|         2|            2|2023-06-02T11:37:...|       9095|       1211|
|        27.6|         5|           16|2023-06-02T11:37:...|       9095|       1211|
|       343.6|         2|           81|2023-06-02T11:38:...|       9095|       1211|
|        24.9|         6|           11|2023-06-02T11:38:...|       9095|       1211|
|        47.4|         6|           20|2023-06-02T11:38:...|       9095|       1211|
|       381.5|         2|            4|2023-06-02T11:39:...|     

In [9]:
# Columns that are removed from the pit data before further use.
cols_to_drop = ['date', 'meeting_key']

# Drop the unwanted columns one at a time; rows are left untouched.
filtered_df_pit = df_pit
for unwanted_column in cols_to_drop:
    filtered_df_pit = filtered_df_pit.drop(unwanted_column)

filtered_df_pit.show()

+------------+----------+-------------+-----------+
|pit_duration|lap_number|driver_number|session_key|
+------------+----------+-------------+-----------+
|        35.3|         3|           14|       9095|
|        40.8|         4|           18|       9095|
|       155.2|         3|           21|       9095|
|       277.0|         2|            2|       9095|
|        27.6|         5|           16|       9095|
|       343.6|         2|           81|       9095|
|        24.9|         6|           11|       9095|
|        47.4|         6|           20|       9095|
|       381.5|         2|            4|       9095|
|        23.5|         6|            1|       9095|
|        40.8|         6|           27|       9095|
|       517.4|         3|           77|       9095|
|       298.0|         5|           18|       9095|
|        25.1|         9|           23|       9095|
|       623.1|         3|           24|       9095|
|       245.8|         8|           22|       9095|
|        23.

In [21]:
# Download sessions.parquet only when it is not already on disk. The download
# used to be commented out, which made this cell depend on a file left over
# from an earlier run and fail on a fresh checkout.
if not os.path.exists("./data/sessions.parquet"):
    os.makedirs("./data", exist_ok=True)
    s3.download_file(Bucket="f1-bucket", Key="sessions.parquet", Filename="./data/sessions.parquet")

# Read the sessions table and preview the first rows.
df_sessions = spark.read.parquet('./data/sessions.parquet')
df_sessions.show()

+---------+-----------+------------+------------+-----------+------------------+------------+------------+--------------------+--------------------+----------+-----------+-----------+----+
| location|country_key|country_code|country_name|circuit_key|circuit_short_name|session_type|session_name|          date_start|            date_end|gmt_offset|session_key|meeting_key|year|
+---------+-----------+------------+------------+-----------+------------------+------------+------------+--------------------+--------------------+----------+-----------+-----------+----+
|   Sakhir|         36|         BRN|     Bahrain|         63|            Sakhir|    Practice|  Practice 1|2023-02-23T07:00:...|2023-02-23T16:30:...|  03:00:00|       9222|       1140|2023|
|   Sakhir|         36|         BRN|     Bahrain|         63|            Sakhir|    Practice|  Practice 2|2023-02-24T07:00:...|2023-02-24T16:30:...|  03:00:00|       7763|       1140|2023|
|   Sakhir|         36|         BRN|     Bahrain|      

In [22]:
# Fetch position.parquet (arguments are bucket, key, local filename).
s3.download_file("f1-bucket", "position.parquet", "./data/position.parquet")

# Read the downloaded file, preview it, then report the total row count.
df_positions = spark.read.load('./data/position.parquet', format="parquet")
df_positions.show()
row_total = df_positions.count()
display(row_total)

+--------+-------------+--------------------+-----------+-----------+
|position|driver_number|                date|session_key|meeting_key|
+--------+-------------+--------------------+-----------+-----------+
|       1|            1|2023-02-23T06:54:...|       9222|       1140|
|       2|            1|2023-02-23T07:00:...|       9222|       1140|
|       3|            1|2023-02-23T07:00:...|       9222|       1140|
|       4|            1|2023-02-23T07:00:...|       9222|       1140|
|       5|            1|2023-02-23T07:00:...|       9222|       1140|
|       6|            1|2023-02-23T07:00:...|       9222|       1140|
|       1|            1|2023-02-23T07:04:...|       9222|       1140|
|       2|            1|2023-02-23T07:33:...|       9222|       1140|
|       1|            1|2023-02-23T07:34:...|       9222|       1140|
|       2|            1|2023-02-23T07:56:...|       9222|       1140|
|       3|            1|2023-02-23T08:09:...|       9222|       1140|
|       4|          

104587