### Ingest races.csv file

##### Step 1 - Read the CSV file into a pandas DataFrame

In [1]:
from pandas import read_csv
from lib import configuration
from lib import common_functions

In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [3]:
# Obtain the session object from the project's shared helper module
# (lib.common_functions).
# NOTE(review): presumably a SparkSession, judging by the helper's name - confirm.
spark = common_functions.get_spark_session()

In [4]:
# Column layout of races.csv expressed as a Spark schema.
# Each tuple is (column name, Spark type, nullable); raceId is the only
# column declared non-nullable.
races_schema = StructType(
    fields=[
        StructField(column_name, column_type, is_nullable)
        for column_name, column_type, is_nullable in (
            ("raceId", IntegerType(), False),
            ("year", IntegerType(), True),
            ("round", IntegerType(), True),
            ("circuitId", IntegerType(), True),
            ("name", StringType(), True),
            ("date", DateType(), True),
            ("time", StringType(), True),
            ("url", StringType(), True),
        )
    ]
)

In [5]:
# Load the bronze-layer races file into a pandas DataFrame.
# The later cells rely on pandas-only APIs (`.rename(columns=...)`, `.to_csv`),
# so the file is read once with pandas; `races_df` is bound exactly one time
# instead of being assigned from one reader and then overwritten by another.
races_df = read_csv(f'{configuration.bronze_folder_path}/races.csv')
display(races_df)

Unnamed: 0,raceId,year,round,circuitId,name,date,time,url
0,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,http://en.wikipedia.org/wiki/2009_Australian_G...
1,2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,http://en.wikipedia.org/wiki/2009_Malaysian_Gr...
2,3,2009,3,17,Chinese Grand Prix,2009-04-19,07:00:00,http://en.wikipedia.org/wiki/2009_Chinese_Gran...
3,4,2009,4,3,Bahrain Grand Prix,2009-04-26,12:00:00,http://en.wikipedia.org/wiki/2009_Bahrain_Gran...
4,5,2009,5,4,Spanish Grand Prix,2009-05-10,12:00:00,http://en.wikipedia.org/wiki/2009_Spanish_Gran...
...,...,...,...,...,...,...,...,...
1053,1069,2021,18,69,United States Grand Prix,2021-10-24,19:00:00,http://en.wikipedia.org/wiki/2021_United_State...
1054,1070,2021,19,32,Mexico City Grand Prix,2021-10-31,19:00:00,http://en.wikipedia.org/wiki/2021_Mexican_Gran...
1055,1071,2021,20,18,Brazilian Grand Prix,2021-11-07,17:00:00,http://en.wikipedia.org/wiki/2021_Brazilian_Gr...
1056,1072,2021,22,77,Saudi Arabian Grand Prix,2021-12-05,16:00:00,http://en.wikipedia.org/wiki/2021_Saudi_Arabia...


##### Step 2 - Select only the required columns

In [6]:
# Keep only the columns needed downstream; "url" is the one column left out.
races_selected_df = races_df.loc[
    :,
    [
        "raceId",
        "year",
        "round",
        "circuitId",
        "name",
        "date",
        "time",
    ],
]
display(races_selected_df)

Unnamed: 0,raceId,year,round,circuitId,name,date,time
0,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00
1,2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00
2,3,2009,3,17,Chinese Grand Prix,2009-04-19,07:00:00
3,4,2009,4,3,Bahrain Grand Prix,2009-04-26,12:00:00
4,5,2009,5,4,Spanish Grand Prix,2009-05-10,12:00:00
...,...,...,...,...,...,...,...
1053,1069,2021,18,69,United States Grand Prix,2021-10-24,19:00:00
1054,1070,2021,19,32,Mexico City Grand Prix,2021-10-31,19:00:00
1055,1071,2021,20,18,Brazilian Grand Prix,2021-11-07,17:00:00
1056,1072,2021,22,77,Saudi Arabian Grand Prix,2021-12-05,16:00:00


##### Step 3 - Rename the columns as required

In [7]:
# Map the camelCase source headers to snake_case names; columns that are not
# listed in the mapping keep their original names.
column_renames = {
    "raceId": "race_id",
    "circuitId": "circuit_id",
    "name": "race_name",
}
races_renamed_df = races_selected_df.rename(columns=column_renames)
display(races_renamed_df)

Unnamed: 0,race_id,year,round,circuit_id,race_name,date,time
0,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00
1,2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00
2,3,2009,3,17,Chinese Grand Prix,2009-04-19,07:00:00
3,4,2009,4,3,Bahrain Grand Prix,2009-04-26,12:00:00
4,5,2009,5,4,Spanish Grand Prix,2009-05-10,12:00:00
...,...,...,...,...,...,...,...
1053,1069,2021,18,69,United States Grand Prix,2021-10-24,19:00:00
1054,1070,2021,19,32,Mexico City Grand Prix,2021-10-31,19:00:00
1055,1071,2021,20,18,Brazilian Grand Prix,2021-11-07,17:00:00
1056,1072,2021,22,77,Saudi Arabian Grand Prix,2021-12-05,16:00:00


##### Step - Add race timestamp, data source and ingestion date to the dataframe

In [8]:
# Combine the date and time text into one "<date> <time>" string per race.
# The pieces come from races_selected_df; the assignment lines them up with
# races_renamed_df by row index.
race_date = races_selected_df["date"]
race_time = races_selected_df["time"]
races_renamed_df["race_timestamp"] = race_date + ' ' + race_time

# Audit columns: the configured data source label and the value returned by
# the project's ingestion-date helper, broadcast to every row.
races_renamed_df["data_source"] = configuration.v_data_source
races_renamed_df["ingestion_date"] = common_functions.get_ingestion_date()

# Show the enriched frame as the cell output.
races_renamed_df

Unnamed: 0,race_id,year,round,circuit_id,race_name,date,time,race_timestamp,data_source,ingestion_date
0,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,2009-03-29 06:00:00,dev,2024-05-15
1,2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,2009-04-05 09:00:00,dev,2024-05-15
2,3,2009,3,17,Chinese Grand Prix,2009-04-19,07:00:00,2009-04-19 07:00:00,dev,2024-05-15
3,4,2009,4,3,Bahrain Grand Prix,2009-04-26,12:00:00,2009-04-26 12:00:00,dev,2024-05-15
4,5,2009,5,4,Spanish Grand Prix,2009-05-10,12:00:00,2009-05-10 12:00:00,dev,2024-05-15
...,...,...,...,...,...,...,...,...,...,...
1053,1069,2021,18,69,United States Grand Prix,2021-10-24,19:00:00,2021-10-24 19:00:00,dev,2024-05-15
1054,1070,2021,19,32,Mexico City Grand Prix,2021-10-31,19:00:00,2021-10-31 19:00:00,dev,2024-05-15
1055,1071,2021,20,18,Brazilian Grand Prix,2021-11-07,17:00:00,2021-11-07 17:00:00,dev,2024-05-15
1056,1072,2021,22,77,Saudi Arabian Grand Prix,2021-12-05,16:00:00,2021-12-05 16:00:00,dev,2024-05-15


##### Step 4 - Write data to datalake as CSV

In [9]:
# Persist the enriched races frame to the silver folder as CSV.
# index=False keeps the pandas row index out of the written file.
silver_races_path = f"{configuration.silver_folder_path}/races.csv"
races_renamed_df.to_csv(silver_races_path, index=False)

In [10]:
# Read the silver-layer file back to check what was written.
# NOTE: despite its name, `df_parquet` holds data loaded from a CSV file.
silver_races_file = f'{configuration.silver_folder_path}/races.csv'
df_parquet = read_csv(silver_races_file)
df_parquet

Unnamed: 0,race_id,year,round,circuit_id,race_name,date,time,race_timestamp,data_source,ingestion_date
0,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,2009-03-29 06:00:00,dev,2024-05-15
1,2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,2009-04-05 09:00:00,dev,2024-05-15
2,3,2009,3,17,Chinese Grand Prix,2009-04-19,07:00:00,2009-04-19 07:00:00,dev,2024-05-15
3,4,2009,4,3,Bahrain Grand Prix,2009-04-26,12:00:00,2009-04-26 12:00:00,dev,2024-05-15
4,5,2009,5,4,Spanish Grand Prix,2009-05-10,12:00:00,2009-05-10 12:00:00,dev,2024-05-15
...,...,...,...,...,...,...,...,...,...,...
1053,1069,2021,18,69,United States Grand Prix,2021-10-24,19:00:00,2021-10-24 19:00:00,dev,2024-05-15
1054,1070,2021,19,32,Mexico City Grand Prix,2021-10-31,19:00:00,2021-10-31 19:00:00,dev,2024-05-15
1055,1071,2021,20,18,Brazilian Grand Prix,2021-11-07,17:00:00,2021-11-07 17:00:00,dev,2024-05-15
1056,1072,2021,22,77,Saudi Arabian Grand Prix,2021-12-05,16:00:00,2021-12-05 16:00:00,dev,2024-05-15
