### Ingest drivers.json file

##### Step 1 - Read the JSON file using the spark dataframe reader API

In [1]:
from pandas import read_csv,read_json
from lib import configuration
from lib import common_functions

In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [3]:
# Get a session object from the project's helper module
# (presumably a SparkSession, judging by the helper's name — confirm in lib/common_functions).
spark = common_functions.get_spark_session()

In [4]:
# Schema for the nested "name" object: two string fields, both declared nullable.
name_schema = StructType(
    fields=[
        StructField("forename", StringType(), True),
        StructField("surname", StringType(), True),
    ]
)

In [5]:
# Schema for one driver record. Only driverId is declared non-nullable.
drivers_schema = StructType(
    fields=[
        StructField("driverId", IntegerType(), False),
        StructField("driverRef", StringType(), True),
        StructField("number", IntegerType(), True),
        StructField("code", StringType(), True),
        # Nested struct; no nullable flag is passed, so the StructField default applies.
        StructField("name", name_schema),
        StructField("dob", DateType(), True),
        StructField("nationality", StringType(), True),
        StructField("url", StringType(), True),
    ]
)

In [6]:
# Load drivers.json from the bronze folder into a pandas DataFrame.
# lines=True tells pandas to parse the file as one JSON object per line.
#
# A Spark read of the same file used to precede this statement, but its result
# was bound to the same name and overwritten immediately, and every later cell
# relies on pandas-only APIs (drop(columns=...), rename(columns=...), to_csv),
# so only the pandas read is kept.
drivers_df = read_json(f'{configuration.bronze_folder_path}/drivers.json', lines=True)
display(drivers_df)

Unnamed: 0,driverId,driverRef,number,code,name,dob,nationality,url
0,1,abate,999,CODE,Carlo Abate,1932-07-10,Italian,http://en.wikipedia.org/wiki/Carlo_Mario_Abate
1,2,abecassis,999,CODE,George Abecassis,1913-03-21,British,http://en.wikipedia.org/wiki/George_Abecassis
2,3,acheson,999,CODE,Kenny Acheson,1957-11-27,British,http://en.wikipedia.org/wiki/Kenny_Acheson
3,4,adams,999,CODE,Philippe Adams,1969-11-19,Belgian,http://en.wikipedia.org/wiki/Philippe_Adams
4,5,ader,999,CODE,Walt Ader,1913-12-15,American,http://en.wikipedia.org/wiki/Walt_Ader
...,...,...,...,...,...,...,...,...
854,855,zapico,999,CODE,Emilio Zapico,1944-05-27,Spanish,http://en.wikipedia.org/wiki/Emilio_Zapico
855,856,zhou,999,CODE,Guanyu Zhou,1999-05-30,Chinese,http://en.wikipedia.org/wiki/Zhou_Guanyu
856,857,zonta,999,CODE,Ricardo Zonta,1976-03-23,Brazilian,http://en.wikipedia.org/wiki/Ricardo_Zonta
857,858,zorzi,999,CODE,Renzo Zorzi,1946-12-12,Italian,http://en.wikipedia.org/wiki/Renzo_Zorzi


##### Step 2 - Rename columns and add new columns
1. driverId renamed to driver_id  
1. driverRef renamed to driver_ref  
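1. url column dropped; data_source and ingestion date added
1. name added with concatenation of forename and surname (currently not applied in the code below; in the displayed data, name is already a single string)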

In [7]:
# Drop the url column and stamp every row with its data source and ingestion date.
#
# errors="ignore" together with assign() keeps this cell safe to re-run: the name
# drivers_df is rebound to the url-less frame, so a plain drop(columns='url')
# would raise KeyError the second time the cell is executed.
#
# NOTE(review): the forename/surname concatenation is intentionally not applied
# here; the displayed data shows name already as a single string — confirm the
# upstream file format before re-enabling it.
drivers_df = (
    drivers_df
    .drop(columns="url", errors="ignore")
    .assign(
        data_source=configuration.v_data_source,
        ingestion_date=common_functions.get_ingestion_date(),
    )
)
drivers_df

Unnamed: 0,driverId,driverRef,number,code,name,dob,nationality,data_source,ingestion_date
0,1,abate,999,CODE,Carlo Abate,1932-07-10,Italian,dev,2024-06-12
1,2,abecassis,999,CODE,George Abecassis,1913-03-21,British,dev,2024-06-12
2,3,acheson,999,CODE,Kenny Acheson,1957-11-27,British,dev,2024-06-12
3,4,adams,999,CODE,Philippe Adams,1969-11-19,Belgian,dev,2024-06-12
4,5,ader,999,CODE,Walt Ader,1913-12-15,American,dev,2024-06-12
...,...,...,...,...,...,...,...,...,...
854,855,zapico,999,CODE,Emilio Zapico,1944-05-27,Spanish,dev,2024-06-12
855,856,zhou,999,CODE,Guanyu Zhou,1999-05-30,Chinese,dev,2024-06-12
856,857,zonta,999,CODE,Ricardo Zonta,1976-03-23,Brazilian,dev,2024-06-12
857,858,zorzi,999,CODE,Renzo Zorzi,1946-12-12,Italian,dev,2024-06-12


In [8]:
# Switch the two camelCase identifier columns to snake_case; all other columns
# keep their names. rename() returns a new frame, so drivers_df is left untouched.
drivers_final_df = drivers_df.rename(
    columns={
        "driverId": "driver_id",
        "driverRef": "driver_ref",
    }
)
drivers_final_df

Unnamed: 0,driver_id,driver_ref,number,code,name,dob,nationality,data_source,ingestion_date
0,1,abate,999,CODE,Carlo Abate,1932-07-10,Italian,dev,2024-06-12
1,2,abecassis,999,CODE,George Abecassis,1913-03-21,British,dev,2024-06-12
2,3,acheson,999,CODE,Kenny Acheson,1957-11-27,British,dev,2024-06-12
3,4,adams,999,CODE,Philippe Adams,1969-11-19,Belgian,dev,2024-06-12
4,5,ader,999,CODE,Walt Ader,1913-12-15,American,dev,2024-06-12
...,...,...,...,...,...,...,...,...,...
854,855,zapico,999,CODE,Emilio Zapico,1944-05-27,Spanish,dev,2024-06-12
855,856,zhou,999,CODE,Guanyu Zhou,1999-05-30,Chinese,dev,2024-06-12
856,857,zonta,999,CODE,Ricardo Zonta,1976-03-23,Brazilian,dev,2024-06-12
857,858,zorzi,999,CODE,Renzo Zorzi,1946-12-12,Italian,dev,2024-06-12


##### Step 3 - Write the output to the silver folder in CSV format

In [9]:
# Persist the final frame as drivers.csv in the silver folder.
# index=False keeps the pandas row index out of the written file.
drivers_final_df.to_csv(f"{configuration.silver_folder_path}/drivers.csv", index=False)

In [10]:
# Read back the drivers.csv file from the silver folder (presumably to verify the write).
# NOTE(review): the name df_parquet is misleading — the data is loaded from a
# CSV file, not parquet. Kept as-is in case later cells reference it.
df_parquet = read_csv(
    f'{configuration.silver_folder_path}/drivers.csv'
)
df_parquet

Unnamed: 0,driver_id,driver_ref,number,code,name,dob,nationality,data_source,ingestion_date
0,1,abate,999,CODE,Carlo Abate,1932-07-10,Italian,dev,2024-06-12
1,2,abecassis,999,CODE,George Abecassis,1913-03-21,British,dev,2024-06-12
2,3,acheson,999,CODE,Kenny Acheson,1957-11-27,British,dev,2024-06-12
3,4,adams,999,CODE,Philippe Adams,1969-11-19,Belgian,dev,2024-06-12
4,5,ader,999,CODE,Walt Ader,1913-12-15,American,dev,2024-06-12
...,...,...,...,...,...,...,...,...,...
854,855,zapico,999,CODE,Emilio Zapico,1944-05-27,Spanish,dev,2024-06-12
855,856,zhou,999,CODE,Guanyu Zhou,1999-05-30,Chinese,dev,2024-06-12
856,857,zonta,999,CODE,Ricardo Zonta,1976-03-23,Brazilian,dev,2024-06-12
857,858,zorzi,999,CODE,Renzo Zorzi,1946-12-12,Italian,dev,2024-06-12
