In [0]:
from pyspark.sql.functions import to_date, to_timestamp, round, col

In [0]:
# Load the SF fire-calls CSV through the generic format(...).load(...) reader API.
# This form is preferred because the same call shape works for every data source format.
# Note: this read starts a Spark job.
raw_fire_df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv")
)

In [0]:
# Preview 10 rows of the raw data. There are two problems with this DataFrame:
# 1. Column names are not standardized (they contain whitespace)
# 2. The "Available DtTm" date-time field is inferred as a string rather than a timestamp
display(raw_fire_df.limit(10))

Call Number,Unit ID,Incident Number,CallType,Call Date,Watch Date,Call Final Disposition,Available DtTm,Address,City,Zipcode of Incident,Battalion,Station Area,Box,OrigPriority,Priority,Final Priority,ALS Unit,Call Type Group,NumAlarms,UnitType,Unit sequence in call dispatch,Fire Prevention District,Supervisor District,Neighborhood,Location,RowID,Delay
20110014,M29,2003234,Medical Incident,2002-01-11,2002-01-10,Other,01/11/2002 01:58:43 AM,10TH ST/MARKET ST,SF,94103,B02,36,2338,1,1,2,True,,1,MEDIC,1,2,6,Tenderloin,"(37.7765408927183, -122.417501464907)",020110014-M29,5.233333333333333
20110015,M08,2003233,Medical Incident,2002-01-11,2002-01-10,Other,01/11/2002 02:10:17 AM,300 Block of 5TH ST,SF,94107,B03,8,2243,1,1,2,True,,1,MEDIC,1,3,6,South of Market,"(37.7792841462441, -122.402061300134)",020110015-M08,3.083333333333333
20110016,B02,2003235,Structure Fire,2002-01-11,2002-01-10,Other,01/11/2002 01:47:00 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,CHIEF,6,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-B02,3.05
20110016,B04,2003235,Structure Fire,2002-01-11,2002-01-10,Other,01/11/2002 01:51:54 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,CHIEF,3,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-B04,2.316666666666667
20110016,D2,2003235,Structure Fire,2002-01-11,2002-01-10,Other,01/11/2002 01:47:00 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,CHIEF,4,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-D2,3.0166666666666666
20110016,E03,2003235,Structure Fire,2002-01-11,2002-01-10,Other,01/11/2002 01:47:00 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,ENGINE,7,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-E03,2.683333333333333
20110016,E38,2003235,Structure Fire,2002-01-11,2002-01-10,Other,01/11/2002 01:51:17 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,ENGINE,1,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-E38,2.1
20110016,E41,2003235,Structure Fire,2002-01-11,2002-01-10,Other,01/11/2002 01:47:00 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,ENGINE,8,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-E41,2.716666666666667
20110016,M03,2003235,Structure Fire,2002-01-11,2002-01-10,Other,01/11/2002 01:46:38 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,True,,1,MEDIC,10,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-M03,2.7666666666666666
20110016,RS1,2003235,Structure Fire,2002-01-11,2002-01-10,Other,01/11/2002 01:46:57 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,RESCUE SQUAD,9,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-RS1,3.2666666666666666


In [0]:
# Transformation: withColumnRenamed describes a modified DataFrame without activating a Spark job.
# Spark DataFrames are immutable, so each rename returns a new DataFrame that must be assigned to a name.
column_renames = {
    "Call Number": "CallNumber",
    "Unit ID": "UnitID",
    "Incident Number": "IncidentNumber",
    "Call Date": "CallDate",
    "Watch Date": "WatchDate",
    "Call Final Disposition": "CallFinalDisposition",
    "Available DtTm": "AvailableDtTm",
    "Zipcode of Incident": "Zipcode",
    "Station Area": "StationArea",
    "Final Priority": "FinalPriority",
    "ALS Unit": "ALSUnit",
    "Call Type Group": "CallTypeGroup",
    "Unit sequence in call dispatch": "UnitSequenceInCallDispatch",
    "Fire Prevention District": "FirePreventionDistrict",
    "Supervisor District": "SupervisorDistrict",
}

# Always start again from the raw frame so re-running this cell gives the same result.
renamed_fire_df = raw_fire_df
for old_name, new_name in column_renames.items():
    renamed_fire_df = renamed_fire_df.withColumnRenamed(old_name, new_name)

In [0]:
# Preview 10 rows to check that the renamed columns no longer contain whitespace.
display(renamed_fire_df.limit(10))

CallNumber,UnitID,IncidentNumber,CallType,CallDate,WatchDate,CallFinalDisposition,AvailableDtTm,Address,City,Zipcode,Battalion,StationArea,Box,OrigPriority,Priority,FinalPriority,ALSUnit,CallTypeGroup,NumAlarms,UnitType,UnitSequenceInCallDispatch,FirePreventionDistrict,SupervisorDistrict,Neighborhood,Location,RowID,Delay
20110014,M29,2003234,Medical Incident,2002-01-11,2002-01-10,Other,01/11/2002 01:58:43 AM,10TH ST/MARKET ST,SF,94103,B02,36,2338,1,1,2,True,,1,MEDIC,1,2,6,Tenderloin,"(37.7765408927183, -122.417501464907)",020110014-M29,5.233333333333333
20110015,M08,2003233,Medical Incident,2002-01-11,2002-01-10,Other,01/11/2002 02:10:17 AM,300 Block of 5TH ST,SF,94107,B03,8,2243,1,1,2,True,,1,MEDIC,1,3,6,South of Market,"(37.7792841462441, -122.402061300134)",020110015-M08,3.083333333333333
20110016,B02,2003235,Structure Fire,2002-01-11,2002-01-10,Other,01/11/2002 01:47:00 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,CHIEF,6,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-B02,3.05
20110016,B04,2003235,Structure Fire,2002-01-11,2002-01-10,Other,01/11/2002 01:51:54 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,CHIEF,3,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-B04,2.316666666666667
20110016,D2,2003235,Structure Fire,2002-01-11,2002-01-10,Other,01/11/2002 01:47:00 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,CHIEF,4,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-D2,3.0166666666666666
20110016,E03,2003235,Structure Fire,2002-01-11,2002-01-10,Other,01/11/2002 01:47:00 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,ENGINE,7,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-E03,2.683333333333333
20110016,E38,2003235,Structure Fire,2002-01-11,2002-01-10,Other,01/11/2002 01:51:17 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,ENGINE,1,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-E38,2.1
20110016,E41,2003235,Structure Fire,2002-01-11,2002-01-10,Other,01/11/2002 01:47:00 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,ENGINE,8,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-E41,2.716666666666667
20110016,M03,2003235,Structure Fire,2002-01-11,2002-01-10,Other,01/11/2002 01:46:38 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,True,,1,MEDIC,10,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-M03,2.7666666666666666
20110016,RS1,2003235,Structure Fire,2002-01-11,2002-01-10,Other,01/11/2002 01:46:57 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,RESCUE SQUAD,9,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-RS1,3.2666666666666666


In [0]:
# Utility method: printSchema neither transforms the DataFrame nor activates a Spark job.
# It prints each column's name, data type and nullability.
renamed_fire_df.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: date (nullable = true)
 |-- WatchDate: date (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OrigPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 |-- Supe

In [0]:
# withColumn is a common transformation; here it replaces two existing columns in place:
#   - AvailableDtTm: parsed from its string form into a timestamp
#   - Delay: rounded to 2 decimal places
# `round` is pyspark.sql.functions.round (imported at the top), not Python's built-in round.
available_dttm_format = "MM/dd/yyyy hh:mm:ss a"

fire_df = (
    renamed_fire_df
    .withColumn("AvailableDtTm", to_timestamp(col("AvailableDtTm"), available_dttm_format))
    .withColumn("Delay", round(col("Delay"), 2))
)

In [0]:
# Preview 10 rows: AvailableDtTm now shows as a timestamp and Delay is rounded to 2 decimals.
display(fire_df.limit(10))

CallNumber,UnitID,IncidentNumber,CallType,CallDate,WatchDate,CallFinalDisposition,AvailableDtTm,Address,City,Zipcode,Battalion,StationArea,Box,OrigPriority,Priority,FinalPriority,ALSUnit,CallTypeGroup,NumAlarms,UnitType,UnitSequenceInCallDispatch,FirePreventionDistrict,SupervisorDistrict,Neighborhood,Location,RowID,Delay
20110014,M29,2003234,Medical Incident,2002-01-11,2002-01-10,Other,2002-01-11T01:58:43Z,10TH ST/MARKET ST,SF,94103,B02,36,2338,1,1,2,True,,1,MEDIC,1,2,6,Tenderloin,"(37.7765408927183, -122.417501464907)",020110014-M29,5.23
20110015,M08,2003233,Medical Incident,2002-01-11,2002-01-10,Other,2002-01-11T02:10:17Z,300 Block of 5TH ST,SF,94107,B03,8,2243,1,1,2,True,,1,MEDIC,1,3,6,South of Market,"(37.7792841462441, -122.402061300134)",020110015-M08,3.08
20110016,B02,2003235,Structure Fire,2002-01-11,2002-01-10,Other,2002-01-11T01:47:00Z,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,CHIEF,6,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-B02,3.05
20110016,B04,2003235,Structure Fire,2002-01-11,2002-01-10,Other,2002-01-11T01:51:54Z,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,CHIEF,3,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-B04,2.32
20110016,D2,2003235,Structure Fire,2002-01-11,2002-01-10,Other,2002-01-11T01:47:00Z,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,CHIEF,4,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-D2,3.02
20110016,E03,2003235,Structure Fire,2002-01-11,2002-01-10,Other,2002-01-11T01:47:00Z,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,ENGINE,7,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-E03,2.68
20110016,E38,2003235,Structure Fire,2002-01-11,2002-01-10,Other,2002-01-11T01:51:17Z,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,ENGINE,1,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-E38,2.1
20110016,E41,2003235,Structure Fire,2002-01-11,2002-01-10,Other,2002-01-11T01:47:00Z,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,ENGINE,8,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-E41,2.72
20110016,M03,2003235,Structure Fire,2002-01-11,2002-01-10,Other,2002-01-11T01:46:38Z,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,True,,1,MEDIC,10,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-M03,2.77
20110016,RS1,2003235,Structure Fire,2002-01-11,2002-01-10,Other,2002-01-11T01:46:57Z,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,RESCUE SQUAD,9,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-RS1,3.27


In [0]:
# Print the schema to confirm that AvailableDtTm is now a timestamp column.
fire_df.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: date (nullable = true)
 |-- WatchDate: date (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: timestamp (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OrigPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 |-- S