In [1]:
import findspark
findspark.init() 

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [3]:
spark = (SparkSession.builder.appName('readNwrite').getOrCreate())

In [4]:
fire_schema = StructType([StructField('CallNumber', IntegerType(), True),
                        StructField('UnitID', StringType(), True),
                        StructField('IncidentNumber', IntegerType(), True),
                        StructField('CallType', StringType(), True),
                        StructField('CallDate', StringType(), True),
                        StructField('WatchDate', StringType(), True),
                        StructField('CallFinalDisposition', StringType(), True),
                        StructField('AvailableDtTm', StringType(), True),
                        StructField('Address', StringType(), True),
                        StructField('City', StringType(), True),
                        StructField('Zipcode', IntegerType(), True),
                        StructField('Battalion', StringType(), True),
                        StructField('StationArea', StringType(), True),
                        StructField('Box', StringType(), True),
                        StructField('OriginalPriority', StringType(), True),
                        StructField('Priority', StringType(), True),
                        StructField('FinalPriority', IntegerType(), True),
                        StructField('ALSUnit', BooleanType(), True),
                        StructField('CallTypeGroup', StringType(), True),
                        StructField('NumAlarms', IntegerType(), True),
                        StructField('UnitType', StringType(), True),
                        StructField('UnitSequenceInCallDispatch', IntegerType(), True),
                        StructField('FirePreventionDistrict', StringType(), True),
                        StructField('SupervisorDistrict', StringType(), True),
                        StructField('Neighborhood', StringType(), True),
                        StructField('Location', StringType(), True),
                        StructField('RowID', StringType(), True),
                        StructField('Delay', FloatType(), True)])

In [5]:
fire_df = spark.read.csv('sf-fire-calls.csv',header=True, schema=fire_schema)

In [6]:
fire_df.columns

['CallNumber',
 'UnitID',
 'IncidentNumber',
 'CallType',
 'CallDate',
 'WatchDate',
 'CallFinalDisposition',
 'AvailableDtTm',
 'Address',
 'City',
 'Zipcode',
 'Battalion',
 'StationArea',
 'Box',
 'OriginalPriority',
 'Priority',
 'FinalPriority',
 'ALSUnit',
 'CallTypeGroup',
 'NumAlarms',
 'UnitType',
 'UnitSequenceInCallDispatch',
 'FirePreventionDistrict',
 'SupervisorDistrict',
 'Neighborhood',
 'Location',
 'RowID',
 'Delay']

In [6]:
fire_df.select(col('CallDate')).show(10, False)

+----------+
|CallDate  |
+----------+
|01/11/2002|
|01/11/2002|
|01/11/2002|
|01/11/2002|
|01/11/2002|
|01/11/2002|
|01/11/2002|
|01/11/2002|
|01/11/2002|
|01/11/2002|
+----------+
only showing top 10 rows



In [8]:
parquet_path = "C:\Data\PySpark\parquet"
fire_df.write.format("parquet").save(parquet_path)

In [9]:
few_fire_df = (fire_df.select("IncidentNumber","AvailableDtTm","CallType").where(col("CallType")!= "MedicalIncident"))
few_fire_df.show(5, truncate=False)

+--------------+----------------------+----------------+
|IncidentNumber|AvailableDtTm         |CallType        |
+--------------+----------------------+----------------+
|2003235       |01/11/2002 01:51:44 AM|Structure Fire  |
|2003241       |01/11/2002 03:01:18 AM|Medical Incident|
|2003242       |01/11/2002 02:39:50 AM|Medical Incident|
|2003250       |01/11/2002 04:16:46 AM|Vehicle Fire    |
|2003259       |01/11/2002 06:01:58 AM|Alarms          |
+--------------+----------------------+----------------+
only showing top 5 rows



In [10]:
(fire_df
    .select("CallType")
    .where(col("CallType").isNotNull())
    .agg(countDistinct("CallType").alias('DistinctCallType'))
    .show())

+----------------+
|DistinctCallType|
+----------------+
|              30|
+----------------+



In [11]:
(fire_df
    .select("CallType")
    .where(col("CallType").isNotNull())
    .distinct()
    .show(30, False))

+--------------------------------------------+
|CallType                                    |
+--------------------------------------------+
|Elevator / Escalator Rescue                 |
|Marine Fire                                 |
|Aircraft Emergency                          |
|Confined Space / Structure Collapse         |
|Administrative                              |
|Alarms                                      |
|Odor (Strange / Unknown)                    |
|Citizen Assist / Service Call               |
|HazMat                                      |
|Watercraft in Distress                      |
|Explosion                                   |
|Oil Spill                                   |
|Vehicle Fire                                |
|Suspicious Package                          |
|Extrication / Entrapped (Machinery, Vehicle)|
|Other                                       |
|Outside Fire                                |
|Traffic Collision                           |
|Assist Polic

In [12]:
new_df = fire_df.withColumnRenamed('Delay', 'ResponseDelayedMins')
(new_df.select('ResponseDelayedMins').where(col('ResponseDelayedMins')>6).show())

+-------------------+
|ResponseDelayedMins|
+-------------------+
|               6.25|
|               7.25|
|          11.916667|
|           8.633333|
|           95.28333|
|                7.6|
|           6.133333|
|          6.9166665|
|               6.35|
|           7.983333|
|              13.55|
|          13.583333|
|          6.5333333|
|               8.15|
|                6.6|
|          7.0666666|
|               13.4|
|           8.716666|
|          7.9333334|
|          6.4666667|
+-------------------+
only showing top 20 rows



In [13]:
#Convert Datetime
fire_ts_df = (new_df.withColumn('IncidentDate', to_timestamp(col('CallDate'), "MM/dd/yyyy"))
             .drop('CallDate')
             .withColumn('OnWatchDate', to_timestamp(col('WatchDate'), "MM/dd/yyyy"))
             .drop('WatchDate')
             .withColumn('AvailableDtTs', to_timestamp(col('AvailableDtTm'), "MM/dd/yyyy hh:mm:ss a"))
             .drop('AvailableDtTm'))

In [14]:
(fire_ts_df
.select("IncidentDate", "OnWatchDate", "AvailableDtTS")
.show(5, False))

+-------------------+-------------------+-------------------+
|IncidentDate       |OnWatchDate        |AvailableDtTS      |
+-------------------+-------------------+-------------------+
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 01:51:44|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 03:01:18|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 02:39:50|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 04:16:46|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 06:01:58|
+-------------------+-------------------+-------------------+
only showing top 5 rows



In [15]:
(fire_ts_df.select(year('IncidentDate')).distinct().orderBy(year('IncidentDate')).show())

+------------------+
|year(IncidentDate)|
+------------------+
|              2000|
|              2001|
|              2002|
|              2003|
|              2004|
|              2005|
|              2006|
|              2007|
|              2008|
|              2009|
|              2010|
|              2011|
|              2012|
|              2013|
|              2014|
|              2015|
|              2016|
|              2017|
|              2018|
+------------------+



In [16]:
import pyspark.sql.functions as F

In [17]:
(fire_ts_df
    .select(F.sum("NumAlarms"), F.avg("ResponseDelayedMins"), F.min("ResponseDelayedMins"), F.max("ResponseDelayedMins"))
    .show())

+--------------+------------------------+------------------------+------------------------+
|sum(NumAlarms)|avg(ResponseDelayedMins)|min(ResponseDelayedMins)|max(ResponseDelayedMins)|
+--------------+------------------------+------------------------+------------------------+
|        176170|       3.892364154521585|             0.016666668|                 1844.55|
+--------------+------------------------+------------------------+------------------------+



In [18]:
fire_ts_df.write.csv('statistic.csv')

### Aggregation

In [50]:
#What were most commom type of fire call?
most_type_call_fire=(fire_ts_df
    .select('CallType')
    .where(col('CallType').isNotNull())
    .groupBy("CallType")
    .count()
    .orderBy('count', ascending=False)
    )

In [52]:
most_type_call_fire.count()

30

In [20]:
#what zip codes accounted for the most calls
most_zipcode = (fire_ts_df
               .select('Zipcode')
               .where(col('Zipcode').isNotNull())
                .groupBy('Zipcode')
               .count()
               .orderBy('count', ascending=False))

In [21]:
most_zipcode.coalesce(1).write.format('json').save('zipcode')

In [24]:
#What were all the different types of fire calls in 2018?
(fire_ts_df
    .select(col('CallType'))
    .where(year('IncidentDate')==2018)
    .distinct()
    .show(truncate=False))

+-------------------------------+
|CallType                       |
+-------------------------------+
|Elevator / Escalator Rescue    |
|Alarms                         |
|Odor (Strange / Unknown)       |
|Citizen Assist / Service Call  |
|HazMat                         |
|Explosion                      |
|Vehicle Fire                   |
|Suspicious Package             |
|Other                          |
|Outside Fire                   |
|Traffic Collision              |
|Assist Police                  |
|Gas Leak (Natural and LP Gases)|
|Water Rescue                   |
|Electrical Hazard              |
|Structure Fire                 |
|Medical Incident               |
|Fuel Spill                     |
|Smoke Investigation (Outside)  |
|Train / Rail Incident          |
+-------------------------------+



In [39]:
fire_df_dt.select('CallNumber','CallType').where(year("IncidentDate")==2018).orderBy("CallType",ascending=False).show(10, truncate=False)

+----------+------------+
|CallNumber|CallType    |
+----------+------------+
|181023000 |Water Rescue|
|182811099 |Water Rescue|
|181111931 |Water Rescue|
|180820882 |Water Rescue|
|181111931 |Water Rescue|
|180883627 |Water Rescue|
|181122436 |Water Rescue|
|180992355 |Water Rescue|
|181250542 |Water Rescue|
|180841192 |Water Rescue|
+----------+------------+
only showing top 10 rows



In [27]:
#What months within the year 2018 saw the highest number of fire calls?
(fire_ts_df
    .select(month('IncidentDate'),'CallNumber', 'IncidentDate')
     .where(year('IncidentDate')==2018)
    .groupBy(month('IncidentDate'))
    .agg(count("CallNumber").alias("Total"))
    .orderBy('Total', ascending=False)
    .show())

+-------------------+-----+
|month(IncidentDate)|Total|
+-------------------+-----+
|                 10| 1068|
|                  5| 1047|
|                  3| 1029|
|                  8| 1021|
|                  1| 1007|
|                  7|  974|
|                  6|  974|
|                  9|  951|
|                  4|  947|
|                  2|  919|
|                 11|  199|
+-------------------+-----+



In [29]:
#Which neighborhood in San Francisco generated the most fire calls in 2018?
(fire_ts_df
    .select('Neighborhood', 'IncidentDate')
    .filter(year('IncidentDate')==2018)
    .groupBy('Neighborhood')
    .count()
    .orderBy('count', ascending=False)
    .show(10,truncate =False))

+------------------------------+-----+
|Neighborhood                  |count|
+------------------------------+-----+
|Tenderloin                    |1393 |
|South of Market               |1053 |
|Mission                       |913  |
|Financial District/South Beach|772  |
|Bayview Hunters Point         |522  |
|Western Addition              |352  |
|Sunset/Parkside               |346  |
|Nob Hill                      |295  |
|Hayes Valley                  |291  |
|Outer Richmond                |262  |
+------------------------------+-----+
only showing top 10 rows



In [40]:
#Which neighborhoods had the worst response times to fire calls in 2018?
(fire_ts_df
    .select('Neighborhood', 'ResponseDelayedMins')
    .where(year('IncidentDate')==2018)
    .orderBy('ResponseDelayedMins', ascending=False)
    .show(10,truncate =False))

+------------------------------+-------------------+
|Neighborhood                  |ResponseDelayedMins|
+------------------------------+-------------------+
|Chinatown                     |491.26666          |
|Financial District/South Beach|406.63333          |
|Tenderloin                    |340.48334          |
|Haight Ashbury                |175.86667          |
|Bayview Hunters Point         |155.8              |
|Financial District/South Beach|135.51666          |
|Pacific Heights               |129.01666          |
|Potrero Hill                  |109.8              |
|Inner Sunset                  |106.13333          |
|South of Market               |94.71667           |
+------------------------------+-------------------+
only showing top 10 rows



In [46]:
#Which week in the year in 2018 had the most fire calls?
(fire_ts_df.select('IncidentDate', 'CallNumber')
    .where(year('IncidentDate')==2018)
    .groupBy(weekofyear('IncidentDate'))
     .count()
     .orderBy('count', ascending=False)
    .show())

+------------------------+-----+
|weekofyear(IncidentDate)|count|
+------------------------+-----+
|                      22|  259|
|                      40|  255|
|                      43|  250|
|                      25|  249|
|                       1|  246|
|                      44|  244|
|                      13|  243|
|                      32|  243|
|                      11|  240|
|                       5|  236|
|                      18|  236|
|                      23|  235|
|                      42|  234|
|                      31|  234|
|                       2|  234|
|                      19|  233|
|                      34|  232|
|                      10|  232|
|                       8|  232|
|                      28|  231|
+------------------------+-----+
only showing top 20 rows



In [60]:
number_of_firecall = (fire_ts_df.select('IncidentDate')
                         .groupBy(year('IncidentDate'))
                         .count()
                         .orderBy(year('IncidentDate'))
                        )