### Ingest races folder

In [1]:
from pandas import read_csv,concat
from glob import glob
from lib import configuration
from lib import common_functions

##### Step 1 - Read the CSV files from the races folder using pandas `read_csv`

In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [3]:
# Obtain the session object from the project's common_functions helper.
# NOTE(review): none of the cells visible in this notebook use `spark`; the
# data is loaded and written with pandas — confirm whether this is still needed.
spark = common_functions.get_spark_session()

In [4]:
# Column layout for the races data: (name, Spark type, nullable).
# raceId is the only column declared non-nullable.
# NOTE(review): this schema is not referenced by the pandas-based cells visible
# below, and circuitId is declared as an integer while the displayed output
# shows circuit names in that column — confirm before relying on it.
races_schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("year", IntegerType(), True)
    .add("round", IntegerType(), True)
    .add("circuitId", IntegerType(), True)
    .add("name", StringType(), True)
    .add("date", DateType(), True)
    .add("time", StringType(), True)
    .add("url", StringType(), True)
)

In [5]:
# Collect every CSV under the bronze "races" folder. glob() returns paths in
# arbitrary, filesystem-dependent order, so sort them to keep the row order of
# the concatenated frame reproducible across runs and machines.
folder_path = f"{configuration.bronze_folder_path}/races"
all_files = sorted(glob(folder_path + "/*.csv"))

# concat() fails with an opaque "No objects to concatenate" on an empty list;
# stop early with a message that names the folder that was searched.
if not all_files:
    raise FileNotFoundError(f"No CSV files found in {folder_path}")

# One frame per file, stacked into a single frame with a fresh 0..n-1 index.
dfs = [read_csv(file) for file in all_files]
races_df = concat(dfs, ignore_index=True)
races_df

Unnamed: 0,raceId,year,round,circuitId,name,date,time,url
0,1,1950,1,Silverstone Circuit,British Grand Prix,1950-05-13,06:00:00,http://en.wikipedia.org/wiki/1950_British_Gran...
1,2,1950,2,Circuit de Monaco,Monaco Grand Prix,1950-05-21,06:00:00,http://en.wikipedia.org/wiki/1950_Monaco_Grand...
2,3,1950,3,Indianapolis Motor Speedway,Indianapolis 500,1950-05-30,06:00:00,http://en.wikipedia.org/wiki/1950_Indianapolis...
3,4,1950,4,Circuit Bremgarten,Swiss Grand Prix,1950-06-04,06:00:00,http://en.wikipedia.org/wiki/1950_Swiss_Grand_...
4,5,1950,5,Circuit de Spa-Francorchamps,Belgian Grand Prix,1950-06-18,06:00:00,http://en.wikipedia.org/wiki/1950_Belgian_Gran...
...,...,...,...,...,...,...,...,...
1120,1121,2024,20,Autódromo Hermanos Rodríguez,Mexico City Grand Prix,2024-10-27,06:00:00,https://en.wikipedia.org/wiki/2024_Mexico_City...
1121,1122,2024,21,Autódromo José Carlos Pace,São Paulo Grand Prix,2024-11-03,06:00:00,https://en.wikipedia.org/wiki/2024_S%C3%A3o_Pa...
1122,1123,2024,22,Las Vegas Strip Street Circuit,Las Vegas Grand Prix,2024-11-23,06:00:00,https://en.wikipedia.org/wiki/2024_Las_Vegas_G...
1123,1124,2024,23,Losail International Circuit,Qatar Grand Prix,2024-12-01,06:00:00,https://en.wikipedia.org/wiki/2024_Qatar_Grand...


##### Step 2 - Select only the required columns

In [6]:
# Keep only the columns needed downstream, in this order; the url column
# shown in the load output is left out.
races_selected_df = races_df.loc[
    :,
    ["raceId", "year", "round", "circuitId", "name", "date", "time"],
]
display(races_selected_df)

Unnamed: 0,raceId,year,round,circuitId,name,date,time
0,1,1950,1,Silverstone Circuit,British Grand Prix,1950-05-13,06:00:00
1,2,1950,2,Circuit de Monaco,Monaco Grand Prix,1950-05-21,06:00:00
2,3,1950,3,Indianapolis Motor Speedway,Indianapolis 500,1950-05-30,06:00:00
3,4,1950,4,Circuit Bremgarten,Swiss Grand Prix,1950-06-04,06:00:00
4,5,1950,5,Circuit de Spa-Francorchamps,Belgian Grand Prix,1950-06-18,06:00:00
...,...,...,...,...,...,...,...
1120,1121,2024,20,Autódromo Hermanos Rodríguez,Mexico City Grand Prix,2024-10-27,06:00:00
1121,1122,2024,21,Autódromo José Carlos Pace,São Paulo Grand Prix,2024-11-03,06:00:00
1122,1123,2024,22,Las Vegas Strip Street Circuit,Las Vegas Grand Prix,2024-11-23,06:00:00
1123,1124,2024,23,Losail International Circuit,Qatar Grand Prix,2024-12-01,06:00:00


##### Step 3 - Rename columns and add new columns
1. Rename raceId, circuitId and name to race_id, circuit_id and race_name
1. Add ingestion_date with current timestamp

In [7]:
# Switch the camelCase source column names to snake_case; `name` becomes
# `race_name`. Columns not listed in the mapping keep their names.
races_renamed_df = races_selected_df.rename(
    {"raceId": "race_id", "circuitId": "circuit_id", "name": "race_name"},
    axis="columns",
)
display(races_renamed_df)

Unnamed: 0,race_id,year,round,circuit_id,race_name,date,time
0,1,1950,1,Silverstone Circuit,British Grand Prix,1950-05-13,06:00:00
1,2,1950,2,Circuit de Monaco,Monaco Grand Prix,1950-05-21,06:00:00
2,3,1950,3,Indianapolis Motor Speedway,Indianapolis 500,1950-05-30,06:00:00
3,4,1950,4,Circuit Bremgarten,Swiss Grand Prix,1950-06-04,06:00:00
4,5,1950,5,Circuit de Spa-Francorchamps,Belgian Grand Prix,1950-06-18,06:00:00
...,...,...,...,...,...,...,...
1120,1121,2024,20,Autódromo Hermanos Rodríguez,Mexico City Grand Prix,2024-10-27,06:00:00
1121,1122,2024,21,Autódromo José Carlos Pace,São Paulo Grand Prix,2024-11-03,06:00:00
1122,1123,2024,22,Las Vegas Strip Street Circuit,Las Vegas Grand Prix,2024-11-23,06:00:00
1123,1124,2024,23,Losail International Circuit,Qatar Grand Prix,2024-12-01,06:00:00


##### Step 4 - Add the join key, race timestamp, data source and ingestion date to the dataframe

In [8]:
# New columns, listed in the order they are appended to the frame.
derived_columns = {
    # Race name immediately followed by the year, with no separator.
    "race_id_join": races_selected_df["name"] + races_selected_df["year"].astype(str),
    # Date and time joined by a single space; the result is the combined
    # value of the two source columns, not a parsed datetime.
    "race_timestamp": races_selected_df["date"] + ' ' + races_selected_df["time"],
    # Same value on every row, taken from the project configuration.
    "data_source": configuration.v_data_source,
    # Same value on every row, supplied by the project helper.
    "ingestion_date": common_functions.get_ingestion_date(),
}
for column_name, column_values in derived_columns.items():
    races_renamed_df[column_name] = column_values
races_renamed_df

Unnamed: 0,race_id,year,round,circuit_id,race_name,date,time,race_id_join,race_timestamp,data_source,ingestion_date
0,1,1950,1,Silverstone Circuit,British Grand Prix,1950-05-13,06:00:00,British Grand Prix1950,1950-05-13 06:00:00,api,2024-06-12
1,2,1950,2,Circuit de Monaco,Monaco Grand Prix,1950-05-21,06:00:00,Monaco Grand Prix1950,1950-05-21 06:00:00,api,2024-06-12
2,3,1950,3,Indianapolis Motor Speedway,Indianapolis 500,1950-05-30,06:00:00,Indianapolis 5001950,1950-05-30 06:00:00,api,2024-06-12
3,4,1950,4,Circuit Bremgarten,Swiss Grand Prix,1950-06-04,06:00:00,Swiss Grand Prix1950,1950-06-04 06:00:00,api,2024-06-12
4,5,1950,5,Circuit de Spa-Francorchamps,Belgian Grand Prix,1950-06-18,06:00:00,Belgian Grand Prix1950,1950-06-18 06:00:00,api,2024-06-12
...,...,...,...,...,...,...,...,...,...,...,...
1120,1121,2024,20,Autódromo Hermanos Rodríguez,Mexico City Grand Prix,2024-10-27,06:00:00,Mexico City Grand Prix2024,2024-10-27 06:00:00,api,2024-06-12
1121,1122,2024,21,Autódromo José Carlos Pace,São Paulo Grand Prix,2024-11-03,06:00:00,São Paulo Grand Prix2024,2024-11-03 06:00:00,api,2024-06-12
1122,1123,2024,22,Las Vegas Strip Street Circuit,Las Vegas Grand Prix,2024-11-23,06:00:00,Las Vegas Grand Prix2024,2024-11-23 06:00:00,api,2024-06-12
1123,1124,2024,23,Losail International Circuit,Qatar Grand Prix,2024-12-01,06:00:00,Qatar Grand Prix2024,2024-12-01 06:00:00,api,2024-06-12


##### Step 5 - Write the output to the silver folder in CSV format

In [9]:
# Write the enriched races frame to the silver folder as a CSV file.
# index=False leaves the pandas row index out of the file.
races_output_path = f"{configuration.silver_folder_path}/races.csv"
races_renamed_df.to_csv(races_output_path, index=False)

In [10]:
# Load races.csv from the silver folder and display it as a check of what
# is stored there.
# NOTE(review): despite the variable name, this frame is loaded from a CSV
# file, not from parquet.
silver_races_csv = f'{configuration.silver_folder_path}/races.csv'
df_parquet = read_csv(silver_races_csv)
df_parquet

Unnamed: 0,race_id,year,round,circuit_id,race_name,date,time,race_id_join,race_timestamp,data_source,ingestion_date
0,1,1950,1,Silverstone Circuit,British Grand Prix,1950-05-13,06:00:00,British Grand Prix1950,1950-05-13 06:00:00,api,2024-06-12
1,2,1950,2,Circuit de Monaco,Monaco Grand Prix,1950-05-21,06:00:00,Monaco Grand Prix1950,1950-05-21 06:00:00,api,2024-06-12
2,3,1950,3,Indianapolis Motor Speedway,Indianapolis 500,1950-05-30,06:00:00,Indianapolis 5001950,1950-05-30 06:00:00,api,2024-06-12
3,4,1950,4,Circuit Bremgarten,Swiss Grand Prix,1950-06-04,06:00:00,Swiss Grand Prix1950,1950-06-04 06:00:00,api,2024-06-12
4,5,1950,5,Circuit de Spa-Francorchamps,Belgian Grand Prix,1950-06-18,06:00:00,Belgian Grand Prix1950,1950-06-18 06:00:00,api,2024-06-12
...,...,...,...,...,...,...,...,...,...,...,...
1120,1121,2024,20,Autódromo Hermanos Rodríguez,Mexico City Grand Prix,2024-10-27,06:00:00,Mexico City Grand Prix2024,2024-10-27 06:00:00,api,2024-06-12
1121,1122,2024,21,Autódromo José Carlos Pace,São Paulo Grand Prix,2024-11-03,06:00:00,São Paulo Grand Prix2024,2024-11-03 06:00:00,api,2024-06-12
1122,1123,2024,22,Las Vegas Strip Street Circuit,Las Vegas Grand Prix,2024-11-23,06:00:00,Las Vegas Grand Prix2024,2024-11-23 06:00:00,api,2024-06-12
1123,1124,2024,23,Losail International Circuit,Qatar Grand Prix,2024-12-01,06:00:00,Qatar Grand Prix2024,2024-12-01 06:00:00,api,2024-06-12
