### Read the CSV file using the Spark DataFrame reader API

#### Step 1. Load the CSV file

In [0]:
# Load the CSV file from the mounted storage into a dataframe named circuits_df.
# dbutils.fs.mounts() can be used to check the mounts and their locations.
circuits_df = (
    spark.read
    .option("header", "true")
    .csv("dbfs:/mnt/dldatabrickscoursedev001/raw/circuits.csv")
)


In [0]:
# Print the rows of the dataframe. show() prints the first 20 rows unless a
# different row count is passed.
circuits_df.show()

In [0]:
# Alternative to show(): display() renders the CSV data that was loaded into
# the dataframe.
display(circuits_df)

In [0]:
# Check the type of the object; the result is pyspark.sql.dataframe.DataFrame.
type(circuits_df)

In [0]:
# Compute basic summary statistics (count, mean, stddev, min, max) for the
# dataframe's columns and print them.
# Note: describe() does not report column data types; printSchema() does that.
circuits_df.describe().show()

In [0]:
# Print the schema of the dataframe (column names and their data types).
circuits_df.printSchema()


In [0]:
# Re-read the file, this time asking Spark to infer the column types.
# Two equivalent spellings are shown; each one assigns circuits_df.

# Form 1: one option() call per reader setting.
circuits_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("dbfs:/mnt/dldatabrickscoursedev001/raw/circuits.csv")
)

# Form 2: all reader settings passed to a single options() call.
circuits_df = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("dbfs:/mnt/dldatabrickscoursedev001/raw/circuits.csv")
)

#### Define an explicit schema
- Apply our own explicitly defined schema instead of inferring it from the data. With an inferred schema, if the input data's schema changes we would simply consume the changed data rather than failing the process because it no longer matches our expected schema.

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType


In [0]:
# Explicit schema for the circuits data: one StructField per column, giving
# the column name, its data type, and whether null values are allowed.
circuits_schema = StructType([
    StructField("circuitId", IntegerType(), nullable=True),
    StructField("circuitRef", StringType(), nullable=True),
    StructField("name", StringType(), nullable=True),
    StructField("location", StringType(), nullable=True),
    StructField("country", StringType(), nullable=True),
    StructField("lat", DoubleType(), nullable=True),
    StructField("lng", DoubleType(), nullable=True),
    StructField("alt", IntegerType(), nullable=True),
])

In [0]:
# Read the CSV again, applying circuits_schema instead of inferring the types.
circuits_df = (
    spark.read
    .option("header", "true")
    .schema(circuits_schema)
    .csv("dbfs:/mnt/dldatabrickscoursedev001/raw/circuits.csv")
)

#### Step 2 - Select only the required columns

In [0]:
from pyspark.sql.functions import col

In [0]:
# Option 1: select the columns by building each reference with col().
circuits_selected_df = circuits_df.select(
    col("circuitId"),
    col("circuitRef"),
    col("name"),
    col("location"),
    col("country"),
    col("lat"),
    col("lng"),
    col("alt"),
)

In [0]:
# Option 2: select the columns by passing their names as plain strings.
circuits_selected_df = circuits_df.select(
    "circuitId",
    "circuitRef",
    "name",
    "location",
    "country",
    "lat",
    "lng",
    "alt",
)

In [0]:
# Option 3: select the columns by indexing the dataframe with each column name.
circuits_selected_df = circuits_df.select(
    circuits_df["circuitId"],
    circuits_df["circuitRef"],
    circuits_df["name"],
    circuits_df["location"],
    circuits_df["country"],
    circuits_df["lat"],
    circuits_df["lng"],
    circuits_df["alt"],
)

In [0]:
# Show the dataframe that holds the selected columns.
display(circuits_selected_df)

#### Step 3 - Rename the columns

In [0]:
# Rename the abbreviated and camelCase columns to descriptive snake_case names.
circuits_renamed_df = (
    circuits_selected_df
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
)


In [0]:
# Show the dataframe after the columns have been renamed.
display(circuits_renamed_df)

#### Step 4 - Add the ingestion date column

In [0]:
from pyspark.sql.functions import current_timestamp , lit


In [0]:
# Add the ingestion timestamp plus an "env" column holding the literal value
# "dev001" on every row. lit() must be imported from pyspark.sql.functions.
circuits_final_df = (
    circuits_renamed_df
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("env", lit("dev001"))
)

In [0]:
# Add only the ingestion timestamp column to the renamed dataframe.
circuits_final_df = circuits_renamed_df.withColumn(
    "ingestion_date",
    current_timestamp(),
)

In [0]:
# Show the final dataframe, including the ingestion_date column.
display(circuits_final_df)

#### Step 5 - Write the data to a Parquet file

In [0]:
# Write the final dataframe in Parquet format, using "overwrite" mode so that
# existing output at the target path is replaced.
(
    circuits_final_df.write
    .mode("overwrite")
    .parquet("/mnt/dldatabrickscoursedev001/processed/circuits")
)
