### Read the CSV file using the Spark DataFrame reader API

In [0]:
# Load circuits.csv from the mounted raw folder into circuits_df.
# dbutils.fs.mounts() can be used to check the mounts and their locations.
circuits_df = (
    spark.read
    .option("header", "true")
    .csv("dbfs:/mnt/dldatabrickscoursedev001/raw/circuits.csv")
)


In [0]:
# Preview the rows of circuits_df as plain text in the cell output.
circuits_df.show()

In [0]:
# Alternative preview: render the loaded CSV data with the notebook's display() helper.
display(circuits_df)

In [0]:
# Check the type of circuits_df; the result is pyspark.sql.dataframe.DataFrame.
type(circuits_df)

In [0]:
# Show the summary produced by describe() for the columns of circuits_df.
circuits_df.describe().show()

In [0]:
# List the current mounts and their locations to confirm where the raw folder lives.
display(dbutils.fs.mounts())

In [0]:
# List the files in the mounted raw folder using the %fs magic.
# Python alternative: display(dbutils.fs.ls('/mnt/dldatabrickscoursedev001/raw'))

%fs
ls /mnt/dldatabrickscoursedev001/raw

In [0]:
# Print the column names and data types of circuits_df as currently loaded.
circuits_df.printSchema()


In [0]:
# Re-read circuits.csv with a header row and with inferSchema enabled, so Spark
# derives the column types from the data.
#
# The file is read only once here: schema inference needs an extra pass over
# the data, so issuing the same read twice just repeats that work and
# overwrites circuits_df with an identical dataframe.
#
# Equivalent chained syntax, shown for reference:
#   spark.read.option("header", "true").option("inferSchema", "true").csv(<path>)
circuits_df = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("dbfs:/mnt/dldatabrickscoursedev001/raw/circuits.csv")
)

### Read the CSV file using the Spark DataFrame reader with an explicit schema

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType


In [0]:
# Explicit schema for circuits.csv, built one column at a time.
# Every column is declared nullable (third argument True).
circuits_schema = (
    StructType()
    .add("circuitId", IntegerType(), True)
    .add("circuitRef", StringType(), True)
    .add("name", StringType(), True)
    .add("location", StringType(), True)
    .add("country", StringType(), True)
    .add("lat", DoubleType(), True)
    .add("lng", DoubleType(), True)
    .add("alt", IntegerType(), True)
)

In [0]:
# Read circuits.csv again, this time applying circuits_schema to the reader.
circuits_df = (
    spark.read
    .option("header", "true")
    .schema(circuits_schema)
    .csv("dbfs:/mnt/dldatabrickscoursedev001/raw/circuits.csv")
)

## Step 2 - Select only the required columns

In [0]:
from pyspark.sql.functions import col

In [0]:
# Option 1: select the columns by wrapping each name in col().
circuits_df_selected = circuits_df.select(
    col("circuitId"),
    col("circuitRef"),
    col("name"),
    col("location"),
    col("country"),
    col("lat"),
    col("lng"),
    col("alt"),
)

In [0]:
# Option 2: select the columns by passing their names as plain strings.
circuits_df_selected = circuits_df.select(
    "circuitId",
    "circuitRef",
    "name",
    "location",
    "country",
    "lat",
    "lng",
    "alt",
)

In [0]:
# Option 3: select the columns by indexing the dataframe with each column name.
circuits_df_selected = circuits_df.select(
    circuits_df["circuitId"],
    circuits_df["circuitRef"],
    circuits_df["name"],
    circuits_df["location"],
    circuits_df["country"],
    circuits_df["lat"],
    circuits_df["lng"],
    circuits_df["alt"],
)

In [0]:
# Render the selected columns to check the result of the select.
display(circuits_df_selected)

In [0]:
# Select the columns again, renaming some of them with alias():
# circuitId -> circuit_id, circuitRef -> circuit_ref,
# lat -> latitude, lng -> longitude, alt -> altitude.
circuits_df_selected = circuits_df.select(
    col("circuitId").alias("circuit_id"),
    col("circuitRef").alias("circuit_ref"),
    col("name"),
    col("location"),
    col("country"),
    col("lat").alias("latitude"),
    col("lng").alias("longitude"),
    col("alt").alias("altitude"),
)

In [0]:
# Render circuits_df_selected to inspect its current contents.
display(circuits_df_selected)

In [0]:
# Print the column names and data types of circuits_df_selected.
circuits_df_selected.printSchema()