#### Read the CSV file using the Spark DataFrame reader API. This notebook also runs a separate notebook to load the configuration, and calls a function that adds a new column containing the current timestamp.

#### Step 1. Load the CSV file

In [0]:
%run "../includes/configuration"

In [0]:
# Declare a text widget named "p_source_file" (empty string as its default) so the
# source-file value can be supplied as a notebook parameter.
dbutils.widgets.text("p_source_file", "")


In [0]:
# Read the current value of the "p_source_file" widget; it is later written into
# the "source_data" column of the output.
v_source_file = dbutils.widgets.get("p_source_file")

In [0]:
# First attempt: read circuits.csv with a header row and let Spark infer the column types.
# NOTE: circuits_df is reassigned further down by a read that uses an explicit schema.
circuits_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(f"{raw_folder_path}/circuits.csv")
)

# OR

# circuits_df = spark.read.options(header="true", inferSchema="true").csv("/mnt/dldatabrickscoursedev001/raw/circuits.csv")

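#### Define an explicit schema
- Apply our own explicitly defined schema to the data rather than inferring it. Inferring the schema means Spark decides the column types from whatever the input file happens to contain, so a change in the input data would silently change the schema we consume. With an explicit schema, the column names and types are fixed by us and do not depend on the contents of the input file.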

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType


In [0]:
# Explicit schema for circuits.csv: one nullable field per column that is read.
circuits_schema = StructType(
    fields=[
        StructField("circuitId", IntegerType(), nullable=True),
        StructField("circuitRef", StringType(), nullable=True),
        StructField("name", StringType(), nullable=True),
        StructField("location", StringType(), nullable=True),
        StructField("country", StringType(), nullable=True),
        StructField("lat", DoubleType(), nullable=True),
        StructField("lng", DoubleType(), nullable=True),
        StructField("alt", IntegerType(), nullable=True),
    ]
)

In [0]:
# Re-read circuits.csv, this time applying circuits_schema instead of inferring the types.
circuits_csv_path = f"{raw_folder_path}/circuits.csv"
circuits_df = (
    spark.read
    .option("header", "true")
    .schema(circuits_schema)
    .csv(circuits_csv_path)
)

#### Step 2 - Select only the required columns

In [0]:
from pyspark.sql.functions import col

In [0]:
# Select the required columns from the DataFrame by wrapping each name in col() (option 1).
required_columns = [
    "circuitId",
    "circuitRef",
    "name",
    "location",
    "country",
    "lat",
    "lng",
    "alt",
]
circuits_selected_df = circuits_df.select(*[col(column_name) for column_name in required_columns])

#### Step 3 - Rename the columns

In [0]:
from pyspark.sql.functions import current_timestamp , lit

In [0]:
# Rename the camelCase / abbreviated columns to snake_case names, then tag every row
# with the source-file value taken from the notebook widget.
circuits_renamed_df = (
    circuits_selected_df
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
    .withColumn("source_data", lit(v_source_file))
)


#### Step 4 - Add the ingestion date column

In [0]:
%run "../functions/common_functions"

In [0]:
# add_ingest_date is not defined in this notebook; presumably it comes from the
# "../functions/common_functions" notebook run in the previous cell and adds a
# current-timestamp ingestion column - TODO confirm against that notebook.
circuits_final_df = add_ingest_date(circuits_renamed_df)

#### Step 5 - Write the data to parquet file

In [0]:
# Persist the final DataFrame as Parquet under the processed folder,
# overwriting any output that already exists at that path.
circuits_final_df.write.parquet(f"{processed_folder_path}/circuits", mode="overwrite")


In [0]:
# Load the Parquet output back from the processed folder to check what was written.
df = spark.read.format("parquet").load(f"{processed_folder_path}/circuits")

In [0]:
# Show the DataFrame that was read back from the processed folder.
display(df)

In [0]:
# End the notebook run, handing the string "Success" back as the notebook's exit value.
dbutils.notebook.exit("Success")