### Ingest lap_times folder

In [1]:
from pandas import read_csv,read_json,concat
from glob import glob
from lib import configuration
from lib import common_functions

##### Step 1 - Read the CSV file using the spark dataframe reader API

In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [3]:
spark = common_functions.get_spark_session()

In [4]:
lap_times_schema = StructType(fields=[StructField("raceId", IntegerType(), False),
                                      StructField("driverId", IntegerType(), True),
                                      StructField("lap", IntegerType(), True),
                                      StructField("position", IntegerType(), True),
                                      StructField("time", StringType(), True),
                                      StructField("milliseconds", IntegerType(), True)
                                     ])

In [5]:
folder_path = f"{configuration.bronze_folder_path}/lap_times"
all_files = glob(folder_path + "/*.csv")

# dfs = spark.read \
# .option("header", True) \
# .schema(lap_times_schema) \
# .csv(f"{configuration.bronze_folder_path}/lap_times/lap_times_split_1.csv")

dfs = [read_csv(file) for file in all_files]
lap_times_df = concat(dfs, ignore_index=True)
lap_times_df

Unnamed: 0,raceId,driverId,lap,position,time,milliseconds
0,841,20,1,1,1:38.109,98109
1,841,20,2,1,1:33.006,93006
2,841,20,3,1,1:32.713,92713
3,841,20,4,1,1:32.803,92803
4,841,20,5,1,1:32.342,92342
...,...,...,...,...,...,...
490899,1047,815,4,17,1:44.762,104762
490900,1047,815,5,16,1:43.786,103786
490901,1047,815,6,15,1:43.263,103263
490902,1047,815,7,15,1:45.123,105123


##### Step 2 - Rename columns and add new columns
1. Rename driverId and raceId
1. Add ingestion_date with current timestamp

In [6]:
lap_times_df = lap_times_df.rename(columns={"raceId":"race_id","driverId":"driver_Id"})
lap_times_df["data_source"] = configuration.v_data_source
lap_times_df["ingestion_date"] = common_functions.get_ingestion_date()
lap_times_df

Unnamed: 0,race_id,driver_Id,lap,position,time,milliseconds,data_source,ingestion_date
0,841,20,1,1,1:38.109,98109,dev,2024-05-16
1,841,20,2,1,1:33.006,93006,dev,2024-05-16
2,841,20,3,1,1:32.713,92713,dev,2024-05-16
3,841,20,4,1,1:32.803,92803,dev,2024-05-16
4,841,20,5,1,1:32.342,92342,dev,2024-05-16
...,...,...,...,...,...,...,...,...
490899,1047,815,4,17,1:44.762,104762,dev,2024-05-16
490900,1047,815,5,16,1:43.786,103786,dev,2024-05-16
490901,1047,815,6,15,1:43.263,103263,dev,2024-05-16
490902,1047,815,7,15,1:45.123,105123,dev,2024-05-16


##### Step 3 - Write to output to processed container in parquet format

In [7]:
lap_times_df.to_csv(f"{configuration.silver_folder_path}/lap_times.csv", index=False)

In [8]:
df_parquet = read_csv(f'{configuration.silver_folder_path}/lap_times.csv')
df_parquet

Unnamed: 0,race_id,driver_Id,lap,position,time,milliseconds,data_source,ingestion_date
0,841,20,1,1,1:38.109,98109,dev,2024-05-16
1,841,20,2,1,1:33.006,93006,dev,2024-05-16
2,841,20,3,1,1:32.713,92713,dev,2024-05-16
3,841,20,4,1,1:32.803,92803,dev,2024-05-16
4,841,20,5,1,1:32.342,92342,dev,2024-05-16
...,...,...,...,...,...,...,...,...
490899,1047,815,4,17,1:44.762,104762,dev,2024-05-16
490900,1047,815,5,16,1:43.786,103786,dev,2024-05-16
490901,1047,815,6,15,1:43.263,103263,dev,2024-05-16
490902,1047,815,7,15,1:45.123,105123,dev,2024-05-16
