### Ingest pit_stops.json file

In [13]:
from pandas import read_csv,read_json
from lib import configuration
from lib import common_functions

##### Step 1 - Read the JSON file using the spark dataframe reader API

In [14]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [15]:
spark = common_functions.get_spark_session()

In [16]:
pit_stops_schema = StructType(fields=[StructField("raceId", IntegerType(), False),
                                      StructField("driverId", IntegerType(), True),
                                      StructField("stop", StringType(), True),
                                      StructField("lap", IntegerType(), True),
                                      StructField("time", StringType(), True),
                                      StructField("duration", StringType(), True),
                                      StructField("milliseconds", IntegerType(), True)
                                     ])

In [23]:
pit_stops_df = spark.read \
.schema(pit_stops_schema) \
.json(f"{configuration.bronze_folder_path}/pit_stops.json")

pit_stops_df = read_json(f'{configuration.bronze_folder_path}/pit_stops.json')
pit_stops_df

Unnamed: 0,raceId,driverId,stop,lap,time,duration,milliseconds
0,841,153,1,1,17:05:23,26.898,26898
1,841,30,1,1,17:05:52,25.021,25021
2,841,17,1,11,17:20:48,23.426,23426
3,841,4,1,12,17:22:34,23.251,23251
4,841,13,1,13,17:24:10,23.842,23842
...,...,...,...,...,...,...,...
8025,1047,20,1,35,18:17:15,22.04,22040
8026,1047,849,2,35,18:17:41,22.384,22384
8027,1047,817,1,39,18:23:37,22.123,22123
8028,1047,825,2,47,18:39:11,23.098,23098


##### Step 2 - Rename columns and add new columns
1. Rename driverId and raceId
1. Add ingestion_date with current timestamp

In [24]:
pit_stops_df["ingestion_date"] = common_functions.get_ingestion_date()
pit_stops_df["data_source"] = configuration.v_data_source
pit_stops_df

Unnamed: 0,raceId,driverId,stop,lap,time,duration,milliseconds,ingestion_date,data_source
0,841,153,1,1,17:05:23,26.898,26898,10/05/2024 16:04:37,dev
1,841,30,1,1,17:05:52,25.021,25021,10/05/2024 16:04:37,dev
2,841,17,1,11,17:20:48,23.426,23426,10/05/2024 16:04:37,dev
3,841,4,1,12,17:22:34,23.251,23251,10/05/2024 16:04:37,dev
4,841,13,1,13,17:24:10,23.842,23842,10/05/2024 16:04:37,dev
...,...,...,...,...,...,...,...,...,...
8025,1047,20,1,35,18:17:15,22.04,22040,10/05/2024 16:04:37,dev
8026,1047,849,2,35,18:17:41,22.384,22384,10/05/2024 16:04:37,dev
8027,1047,817,1,39,18:23:37,22.123,22123,10/05/2024 16:04:37,dev
8028,1047,825,2,47,18:39:11,23.098,23098,10/05/2024 16:04:37,dev


In [26]:
final_df = pit_stops_df.rename(columns={"raceId":"race_id"})
final_df

Unnamed: 0,race_id,driverId,stop,lap,time,duration,milliseconds,ingestion_date,data_source
0,841,153,1,1,17:05:23,26.898,26898,10/05/2024 16:04:37,dev
1,841,30,1,1,17:05:52,25.021,25021,10/05/2024 16:04:37,dev
2,841,17,1,11,17:20:48,23.426,23426,10/05/2024 16:04:37,dev
3,841,4,1,12,17:22:34,23.251,23251,10/05/2024 16:04:37,dev
4,841,13,1,13,17:24:10,23.842,23842,10/05/2024 16:04:37,dev
...,...,...,...,...,...,...,...,...,...
8025,1047,20,1,35,18:17:15,22.04,22040,10/05/2024 16:04:37,dev
8026,1047,849,2,35,18:17:41,22.384,22384,10/05/2024 16:04:37,dev
8027,1047,817,1,39,18:23:37,22.123,22123,10/05/2024 16:04:37,dev
8028,1047,825,2,47,18:39:11,23.098,23098,10/05/2024 16:04:37,dev


##### Step 3 - Write to output to processed container in parquet format

In [27]:
final_df.to_csv(f"{configuration.silver_folder_path}/pit_stops.csv", index=False)

In [28]:
df_parquet = read_csv(f'{configuration.silver_folder_path}/pit_stops.csv')
df_parquet

Unnamed: 0,race_id,driverId,stop,lap,time,duration,milliseconds,ingestion_date,data_source
0,841,153,1,1,17:05:23,26.898,26898,10/05/2024 16:04:37,dev
1,841,30,1,1,17:05:52,25.021,25021,10/05/2024 16:04:37,dev
2,841,17,1,11,17:20:48,23.426,23426,10/05/2024 16:04:37,dev
3,841,4,1,12,17:22:34,23.251,23251,10/05/2024 16:04:37,dev
4,841,13,1,13,17:24:10,23.842,23842,10/05/2024 16:04:37,dev
...,...,...,...,...,...,...,...,...,...
8025,1047,20,1,35,18:17:15,22.04,22040,10/05/2024 16:04:37,dev
8026,1047,849,2,35,18:17:41,22.384,22384,10/05/2024 16:04:37,dev
8027,1047,817,1,39,18:23:37,22.123,22123,10/05/2024 16:04:37,dev
8028,1047,825,2,47,18:39:11,23.098,23098,10/05/2024 16:04:37,dev
