### Ingest qualifying json files

##### Step 1 - Read the JSON files using the pandas `read_json` API

In [1]:
from pandas import read_csv,read_json,concat
from glob import glob
from lib import configuration
from lib import common_functions

In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [3]:
# Obtain the Spark session from the project's shared helper module.
# NOTE(review): no cell visible in this notebook uses `spark` (the data is
# loaded with pandas) — confirm whether the session is still needed.
spark = common_functions.get_spark_session()

In [4]:
# Spark schema describing one qualifying record. Only the qualifyId key is
# declared non-nullable; the q1/q2/q3 session times are kept as strings.
qualifying_schema = (
    StructType()
    .add("qualifyId", IntegerType(), False)
    .add("raceId", IntegerType(), True)
    .add("driverId", IntegerType(), True)
    .add("constructorId", IntegerType(), True)
    .add("number", IntegerType(), True)
    .add("position", IntegerType(), True)
    .add("q1", StringType(), True)
    .add("q2", StringType(), True)
    .add("q3", StringType(), True)
)

In [5]:
# Collect every qualifying JSON split from the archive bronze folder.
folder_path = f"{configuration.archive_bronze_folder_path}/qualifying"

# glob() returns matches in arbitrary, OS-dependent order; sorting keeps the
# row order of the concatenated frame reproducible across runs and machines.
all_files = sorted(glob(folder_path + "/*.json"))
if not all_files:
    # Without this guard concat() fails with an opaque
    # "No objects to concatenate" error that does not mention the folder.
    raise FileNotFoundError(f"No qualifying JSON files found in {folder_path}")

# Load each split with pandas and stack them into a single frame with a fresh
# 0..n-1 index.
dfs = [read_json(file) for file in all_files]
qualifying_df = concat(dfs, ignore_index=True)
qualifying_df

Unnamed: 0,qualifyId,raceId,driverId,constructorId,number,position,q1,q2,q3
0,1,18,1,1,22,1,1:26.572,1:25.187,1:26.714
1,2,18,9,2,4,2,1:26.103,1:25.315,1:26.869
2,3,18,5,1,23,3,1:25.664,1:25.452,1:27.079
3,4,18,13,6,2,4,1:25.994,1:25.691,1:27.178
4,5,18,2,2,3,5,1:25.960,1:25.518,1:27.236
...,...,...,...,...,...,...,...,...,...
8689,8730,1047,8,51,7,16,1:37.555,\N,\N
8690,8731,1047,825,210,20,17,1:37.863,\N,\N
8691,8732,1047,847,3,63,18,1:38.045,\N,\N
8692,8733,1047,850,210,51,19,1:38.173,\N,\N


##### Step 2 - Rename columns and add new columns
1. Rename qualifyId, driverId, constructorId and raceId to snake_case
1. Add data_source and ingestion_date (the date of ingestion)

In [6]:
from pyspark.sql.functions import current_timestamp

In [7]:
# Switch the camelCase id columns to snake_case, then tag every row with the
# configured data source label and the ingestion date from the shared helper.
snake_case_names = {
    "qualifyId": "qualify_id",
    "driverId": "driver_id",
    "raceId": "race_id",
    "constructorId": "constructor_id",
}
qualifying_df = qualifying_df.rename(columns=snake_case_names).assign(
    data_source=configuration.v_archive_data_source,
    ingestion_date=common_functions.get_ingestion_date(),
)
qualifying_df

Unnamed: 0,qualify_id,race_id,driver_id,constructor_id,number,position,q1,q2,q3,data_source,ingestion_date
0,1,18,1,1,22,1,1:26.572,1:25.187,1:26.714,dev,2024-06-12
1,2,18,9,2,4,2,1:26.103,1:25.315,1:26.869,dev,2024-06-12
2,3,18,5,1,23,3,1:25.664,1:25.452,1:27.079,dev,2024-06-12
3,4,18,13,6,2,4,1:25.994,1:25.691,1:27.178,dev,2024-06-12
4,5,18,2,2,3,5,1:25.960,1:25.518,1:27.236,dev,2024-06-12
...,...,...,...,...,...,...,...,...,...,...,...
8689,8730,1047,8,51,7,16,1:37.555,\N,\N,dev,2024-06-12
8690,8731,1047,825,210,20,17,1:37.863,\N,\N,dev,2024-06-12
8691,8732,1047,847,3,63,18,1:38.045,\N,\N,dev,2024-06-12
8692,8733,1047,850,210,51,19,1:38.173,\N,\N,dev,2024-06-12


##### Step 3 - Write the output to the silver folder in CSV format

In [8]:
# Persist the enriched qualifying data to the silver folder as CSV, leaving
# the DataFrame's row index out of the file.
silver_csv_path = f"{configuration.silver_folder_path}/qualifying.csv"
qualifying_df.to_csv(silver_csv_path, index=False)

In [9]:
# Read the exported file back from the silver folder and display it as a
# check on the export. Despite its name, `df_parquet` holds data loaded
# from CSV.
silver_csv_path = f"{configuration.silver_folder_path}/qualifying.csv"
df_parquet = read_csv(silver_csv_path)
df_parquet

Unnamed: 0,qualify_id,race_id,driver_id,constructor_id,number,position,q1,q2,q3,data_source,ingestion_date
0,1,18,1,1,22,1,1:26.572,1:25.187,1:26.714,dev,2024-06-12
1,2,18,9,2,4,2,1:26.103,1:25.315,1:26.869,dev,2024-06-12
2,3,18,5,1,23,3,1:25.664,1:25.452,1:27.079,dev,2024-06-12
3,4,18,13,6,2,4,1:25.994,1:25.691,1:27.178,dev,2024-06-12
4,5,18,2,2,3,5,1:25.960,1:25.518,1:27.236,dev,2024-06-12
...,...,...,...,...,...,...,...,...,...,...,...
8689,8730,1047,8,51,7,16,1:37.555,\N,\N,dev,2024-06-12
8690,8731,1047,825,210,20,17,1:37.863,\N,\N,dev,2024-06-12
8691,8732,1047,847,3,63,18,1:38.045,\N,\N,dev,2024-06-12
8692,8733,1047,850,210,51,19,1:38.173,\N,\N,dev,2024-06-12
