### Ingest results json files

##### Step 1 - Read the JSON files using the pandas `read_json` API

In [1]:
from pandas import read_csv,read_json,concat
from glob import glob
from lib import configuration
from lib import common_functions

In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

In [3]:
# Obtain a Spark session from the project's shared helper module.
# NOTE(review): none of the visible cells use `spark` (the ingestion below is
# done with pandas) — confirm whether this session is still required.
spark = common_functions.get_spark_session()

In [4]:
# Spark schema for the results records, built from a compact
# (column name, Spark type, nullable) table. Only resultId is non-nullable.
# NOTE(review): no visible cell applies this schema — the files are loaded
# with pandas below — confirm whether it is still needed.
results_schema = StructType(fields=[
    StructField(column_name, spark_type(), is_nullable)
    for column_name, spark_type, is_nullable in (
        ("resultId", IntegerType, False),
        ("raceId", IntegerType, True),
        ("driverId", IntegerType, True),
        ("constructorId", IntegerType, True),
        ("number", IntegerType, True),
        ("grid", IntegerType, True),
        ("position", IntegerType, True),
        ("positionText", StringType, True),
        ("positionOrder", IntegerType, True),
        ("points", FloatType, True),
        ("laps", IntegerType, True),
        ("time", StringType, True),
        ("milliseconds", IntegerType, True),
        ("fastestLap", IntegerType, True),
        ("rank", IntegerType, True),
        ("fastestLapTime", StringType, True),
        ("fastestLapSpeed", FloatType, True),
        ("statusId", StringType, True),
    )
])

In [5]:
# Collect every JSON file in the results folder. glob() returns paths in an
# arbitrary, filesystem-dependent order, so sort them to keep the concatenated
# row order (and the regenerated index) reproducible between runs.
folder_path = f"{configuration.api_folder_path}/results"
all_files = sorted(glob(folder_path + "/*.json"))

# Fail early with an actionable message: concat() on an empty list only
# raises a generic "No objects to concatenate" error.
if not all_files:
    raise FileNotFoundError(f"No JSON files found in {folder_path}")

# Each file is parsed as line-delimited JSON (lines=True), then all frames are
# stacked into one with a fresh 0..n-1 index.
dfs = [read_json(json_path, lines=True) for json_path in all_files]
results_df = concat(dfs, ignore_index=True)
results_df

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,statusId
0,1,British Grand Prix1950,farina,alfa,2,1,1,1,positionOrder,9.0,70,Finished
1,2,British Grand Prix1950,fagioli,alfa,3,2,2,2,positionOrder,6.0,70,Finished
2,3,British Grand Prix1950,reg_parnell,alfa,4,4,3,3,positionOrder,4.0,70,Finished
3,4,British Grand Prix1950,cabantous,lago,14,6,4,4,positionOrder,3.0,68,+2 Laps
4,5,British Grand Prix1950,rosier,lago,15,9,5,5,positionOrder,2.0,68,+2 Laps
...,...,...,...,...,...,...,...,...,...,...,...,...
26434,9996,Dutch Grand Prix1984,piquet,brabham,1,2,25,R,positionOrder,0.0,10,Oil pressure
26435,9997,Dutch Grand Prix1984,ghinzani,osella,24,21,26,R,positionOrder,0.0,8,Fuel pump
26436,9998,Dutch Grand Prix1984,alboreto,ferrari,27,9,27,R,positionOrder,0.0,7,Engine
26437,9999,Italian Grand Prix1984,lauda,mclaren,8,4,1,1,positionOrder,9.0,51,Finished


##### Step 2 - Rename columns and add new columns
1. Rename resultId, raceId, driverId, constructorId, positionText, positionOrder and statusId
1. Add data_source, and ingestion_date with current timestamp

In [6]:
# Tag every row with the configured data source and the ingestion date
# supplied by the shared helper; both are appended as trailing columns.
results_df = results_df.assign(
    data_source=configuration.v_data_source,
    ingestion_date=common_functions.get_ingestion_date(),
)
results_df

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,statusId,data_source,ingestion_date
0,1,British Grand Prix1950,farina,alfa,2,1,1,1,positionOrder,9.0,70,Finished,dev,2024-05-29
1,2,British Grand Prix1950,fagioli,alfa,3,2,2,2,positionOrder,6.0,70,Finished,dev,2024-05-29
2,3,British Grand Prix1950,reg_parnell,alfa,4,4,3,3,positionOrder,4.0,70,Finished,dev,2024-05-29
3,4,British Grand Prix1950,cabantous,lago,14,6,4,4,positionOrder,3.0,68,+2 Laps,dev,2024-05-29
4,5,British Grand Prix1950,rosier,lago,15,9,5,5,positionOrder,2.0,68,+2 Laps,dev,2024-05-29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26434,9996,Dutch Grand Prix1984,piquet,brabham,1,2,25,R,positionOrder,0.0,10,Oil pressure,dev,2024-05-29
26435,9997,Dutch Grand Prix1984,ghinzani,osella,24,21,26,R,positionOrder,0.0,8,Fuel pump,dev,2024-05-29
26436,9998,Dutch Grand Prix1984,alboreto,ferrari,27,9,27,R,positionOrder,0.0,7,Engine,dev,2024-05-29
26437,9999,Italian Grand Prix1984,lauda,mclaren,8,4,1,1,positionOrder,9.0,51,Finished,dev,2024-05-29


In [7]:
# Convert the camelCase source column names to snake_case; statusId becomes
# plain "status". Columns not listed here keep their current names.
results_final_df = results_df.rename(
    columns=dict(
        resultId="result_id",
        raceId="race_id",
        driverId="driver_id",
        constructorId="constructor_id",
        positionText="position_text",
        positionOrder="position_order",
        statusId="status",
    )
)
results_final_df

Unnamed: 0,result_id,race_id,driver_id,constructor_id,number,grid,position,position_text,position_order,points,laps,status,data_source,ingestion_date
0,1,British Grand Prix1950,farina,alfa,2,1,1,1,positionOrder,9.0,70,Finished,dev,2024-05-29
1,2,British Grand Prix1950,fagioli,alfa,3,2,2,2,positionOrder,6.0,70,Finished,dev,2024-05-29
2,3,British Grand Prix1950,reg_parnell,alfa,4,4,3,3,positionOrder,4.0,70,Finished,dev,2024-05-29
3,4,British Grand Prix1950,cabantous,lago,14,6,4,4,positionOrder,3.0,68,+2 Laps,dev,2024-05-29
4,5,British Grand Prix1950,rosier,lago,15,9,5,5,positionOrder,2.0,68,+2 Laps,dev,2024-05-29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26434,9996,Dutch Grand Prix1984,piquet,brabham,1,2,25,R,positionOrder,0.0,10,Oil pressure,dev,2024-05-29
26435,9997,Dutch Grand Prix1984,ghinzani,osella,24,21,26,R,positionOrder,0.0,8,Fuel pump,dev,2024-05-29
26436,9998,Dutch Grand Prix1984,alboreto,ferrari,27,9,27,R,positionOrder,0.0,7,Engine,dev,2024-05-29
26437,9999,Italian Grand Prix1984,lauda,mclaren,8,4,1,1,positionOrder,9.0,51,Finished,dev,2024-05-29


##### Step 3 - Write the output to the processed container in CSV format

In [8]:
# Persist the renamed frame as CSV in the silver folder; index=False keeps
# the row index out of the file.
results_final_df.to_csv(
    path_or_buf=f"{configuration.silver_api_folder_path}/results.csv",
    index=False,
)

In [9]:
# Read the file written above back in to inspect the round trip.
# NOTE(review): despite its name, df_parquet holds data loaded from a CSV file.
df_parquet = read_csv(
    filepath_or_buffer=f'{configuration.silver_api_folder_path}/results.csv'
)
df_parquet

Unnamed: 0,result_id,race_id,driver_id,constructor_id,number,grid,position,position_text,position_order,points,laps,status,data_source,ingestion_date
0,1,British Grand Prix1950,farina,alfa,2.0,1,1,1,positionOrder,9.0,70,Finished,dev,2024-05-29
1,2,British Grand Prix1950,fagioli,alfa,3.0,2,2,2,positionOrder,6.0,70,Finished,dev,2024-05-29
2,3,British Grand Prix1950,reg_parnell,alfa,4.0,4,3,3,positionOrder,4.0,70,Finished,dev,2024-05-29
3,4,British Grand Prix1950,cabantous,lago,14.0,6,4,4,positionOrder,3.0,68,+2 Laps,dev,2024-05-29
4,5,British Grand Prix1950,rosier,lago,15.0,9,5,5,positionOrder,2.0,68,+2 Laps,dev,2024-05-29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26434,9996,Dutch Grand Prix1984,piquet,brabham,1.0,2,25,R,positionOrder,0.0,10,Oil pressure,dev,2024-05-29
26435,9997,Dutch Grand Prix1984,ghinzani,osella,24.0,21,26,R,positionOrder,0.0,8,Fuel pump,dev,2024-05-29
26436,9998,Dutch Grand Prix1984,alboreto,ferrari,27.0,9,27,R,positionOrder,0.0,7,Engine,dev,2024-05-29
26437,9999,Italian Grand Prix1984,lauda,mclaren,8.0,4,1,1,positionOrder,9.0,51,Finished,dev,2024-05-29
