In [56]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sqlite3
import re

# Never truncate columns when rendering wide DataFrames.
pd.options.display.max_columns = None

In [57]:
# Folder holding the processed data (SQLite input, CSV output). The trailing
# slash matters: file paths below are built by plain string concatenation.
WORKING_DIRECTORY = "../data/processed/"

In [58]:
# Open the SQLite database that every query in this notebook reads from.
# sqlite3.connect() is the documented entry point for obtaining a Connection
# (rather than calling the Connection class directly).
conn = sqlite3.connect(WORKING_DIRECTORY + 'fire_data.db')

# Basic cleaning and validation

In this notebook, we'll get basic summary stats for our cities based on the NFIRS data.

We'll start with these columns:
* EXP_NO (exposure number): the number of other structures/vehicles to which the fire spreads. By calculating the average exposure number by city, we can see which cities are most likely to see fires spread.
* PROP_LOSS and CONT_LOSS: the dollar values of lost property and lost contents (anything inside a burning structure/vehicle) respectively. There is also PROP_VAL and CONT_VAL, but these measure the pre-fire value and so we'll ignore them.
* OTH_DEATH, OTH_INJ, FF_DEATH, FF_INJ: measure deaths and injuries for non-firefighters and firefighters respectively.

Note, there are a significant number of fires in basic_incident that have no corresponding incident address. We will exclude these fires from our analysis, as we can't group them by city.

In [59]:

# Cross-check key counts between the two tables. Each result has two rows that
# share the first SELECT's column name (a consequence of UNION ALL):
#   row 0 = number of distinct INCIDENT_KEY values in the first table
#   row 1 = total row count of the second table
basic_keys_vs_address_rows_sql = """
    SELECT COUNT(DISTINCT INCIDENT_KEY) 
    FROM basic_incident
    UNION ALL
    SELECT COUNT(*) 
    FROM incident_address;
"""

address_keys_vs_basic_rows_sql = """
    SELECT COUNT(DISTINCT INCIDENT_KEY) 
    FROM incident_address
    UNION ALL
    SELECT COUNT(*) 
    FROM basic_incident;
"""

oneway = pd.read_sql(sql=basic_keys_vs_address_rows_sql, con=conn)
theother = pd.read_sql(sql=address_keys_vs_basic_rows_sql, con=conn)

display(oneway, theother)

Unnamed: 0,COUNT(DISTINCT INCIDENT_KEY)
0,17418755
1,17418758


Unnamed: 0,COUNT(DISTINCT INCIDENT_KEY)
0,17418755
1,17418758


Let's pull the fires in basic_incident that have no matching row in incident_address to see what that data looks like.

In [60]:

# Takes about 30 seconds.
# Anti-join: keep the basic_incident rows whose INCIDENT_KEY has no match in
# incident_address (the LEFT JOIN leaves the right-hand key NULL for those),
# capped at 500,000 rows.
unmatched_incidents_sql = """
SELECT basic_incident.* 
FROM basic_incident
LEFT JOIN incident_address 
ON basic_incident.INCIDENT_KEY = incident_address.INCIDENT_KEY
WHERE incident_address.INCIDENT_KEY IS NULL
LIMIT 500000

"""
test2 = pd.read_sql(sql=unmatched_incidents_sql, con=conn)
test2

Unnamed: 0,ID,STATE,FDID,INC_DATE,INC_NO,EXP_NO,VERSION,DEPT_STA,INC_TYPE,ADD_WILD,AID,ALARM,ARRIVAL,INC_CONT,LU_CLEAR,SHIFT,ALARMS,DISTRICT,ACT_TAK1,ACT_TAK2,ACT_TAK3,APP_MOD,SUP_APP,EMS_APP,OTH_APP,SUP_PER,EMS_PER,OTH_PER,RESOU_AID,PROP_LOSS,CONT_LOSS,PROP_VAL,CONT_VAL,FF_DEATH,OTH_DEATH,FF_INJ,OTH_INJ,DET_ALERT,HAZ_REL,MIXED_USE,PROP_USE,CENSUS,INCIDENT_KEY


In [61]:
# group = test.groupby(by=['STATE', 'FDID', 'INC_DATE', 'INC_NO']).EXP_NO.max().reset_index()
# 

In [62]:
# group2 = test2.groupby(by=['STATE', 'FDID', 'INC_DATE', 'INC_NO']).EXP_NO.max().reset_index()


In [63]:
# group.EXP_NO.mean()

In [64]:
# test2.groupby(by=['STATE','FDID','INC_DATE','INC_NO']).EXP_NO.max().reset_index().EXP_NO.mean()

Before we query values from the table, let's look at percentage of null values per column.

In [65]:
# About 15 seconds to run.
# COUNT(col) skips NULLs, so (COUNT(*) - COUNT(col)) / COUNT(*) is the share of
# rows in basic_incident where that column is NULL.
null_fractions = pd.read_sql("""
    SELECT
        (CAST(COUNT(*) AS FLOAT) - COUNT(INCIDENT_KEY)) / COUNT(*) AS INCIDENT_KEY_NULL,
        (CAST(COUNT(*) AS FLOAT) - COUNT(INC_DATE)) / COUNT(*) AS INC_DATE_NULL,
        (CAST(COUNT(*) AS FLOAT) - COUNT(EXP_NO)) / COUNT(*) AS EXP_NO_NULL,
        (CAST(COUNT(*) AS FLOAT) - COUNT(ALARMS)) / COUNT(*) AS ALARMS_NULL,
        (CAST(COUNT(*) AS FLOAT) - COUNT(PROP_LOSS)) / COUNT(*) AS PROP_LOSS_NULL,
        (CAST(COUNT(*) AS FLOAT) - COUNT(CONT_LOSS)) / COUNT(*) AS CONT_LOSS_NULL,
        (CAST(COUNT(*) AS FLOAT) - COUNT(OTH_DEATH)) / COUNT(*) AS OTH_DEATH_NULL,
        (CAST(COUNT(*) AS FLOAT) - COUNT(FF_DEATH)) / COUNT(*) AS FF_DEATH_NULL,
        (CAST(COUNT(*) AS FLOAT) - COUNT(OTH_INJ)) / COUNT(*) AS OTH_INJ_NULL,
        (CAST(COUNT(*) AS FLOAT) - COUNT(FF_INJ)) / COUNT(*) AS FF_INJ_NULL
    FROM basic_incident;
""", conn)

# Apply the 6-decimal float format only while rendering this table.
# option_context restores the previous setting on exit, so the format does not
# leak into cells that run afterwards.
with pd.option_context('display.float_format', '{:.6f}'.format):
    display(null_fractions)

Unnamed: 0,INCIDENT_KEY_NULL,INC_DATE_NULL,EXP_NO_NULL,ALARMS_NULL,PROP_LOSS_NULL,CONT_LOSS_NULL,OTH_DEATH_NULL,FF_DEATH_NULL,OTH_INJ_NULL,FF_INJ_NULL
0,0.0,0.0,0.0,0.428669,0.407289,0.411452,0.996123,0.010418,0.996123,0.010418


We have significant numbers of null values for:
* ALARMS
* PROPERTY LOSS AND CONTENTS LOSS
* INJURY AND DEATH (FIREFIGHTER AND NON-FIREFIGHTER)

Do a basic sanity check: find the total number of deaths there would be, assuming that the average can be calculated by disregarding the null values. (In reality, about 3,000 Americans die in fires every year.)

In [66]:
# About 45 seconds to run.
# The year is taken from the last four characters of INC_DATE.
# Two estimates of yearly deaths are compared for each death column:
#   *_no_imputation: AVG(col) ignores NULLs, so AVG(col) * COUNT(*) spreads the
#                    non-null mean over every row, including the NULL ones.
#   *_impute_zero:   SUM(IFNULL(col, 0)) counts every NULL as zero.
deaths_by_year = pd.read_sql("""
    SELECT
        CAST(SUBSTR(INC_DATE, LENGTH(INC_DATE) - 3, 4) AS INTEGER) AS year,
        AVG(FF_DEATH) * COUNT(*) AS firefighter_no_imputation,
        SUM(IFNULL(FF_DEATH, 0)) AS firefighter_impute_zero,
        AVG(OTH_DEATH) * COUNT(*) AS other_no_imputation,
        SUM(IFNULL(OTH_DEATH, 0)) AS other_impute_zero
    FROM basic_incident
    GROUP BY CAST(SUBSTR(INC_DATE, LENGTH(INC_DATE) - 3, 4) AS INTEGER)
;""", conn)

# Apply the 1-decimal float format only while rendering this table.
# option_context restores the previous setting on exit, so the format does not
# leak into cells that run afterwards.
with pd.option_context('display.float_format', '{:.1f}'.format):
    display(deaths_by_year)

Unnamed: 0,year,firefighter_no_imputation,firefighter_impute_zero,other_no_imputation,other_impute_zero
0,2013,23.3,23.0,457713.0,1944.0
1,2014,21.3,21.0,520773.3,2151.0
2,2015,18.2,18.0,537176.9,2174.0
3,2016,11.1,11.0,613173.4,2390.0
4,2017,23.2,23.0,600813.1,2228.0
5,2018,25.3,25.0,612065.0,2355.0
6,2019,9.1,9.0,621286.0,2453.0
7,2020,25.2,25.0,663308.3,2146.0


~500K/year is much too high for fire deaths! So imputing zero for the null values is likely the correct approach for fatalities. There is less effect on firefighter deaths, as these values are mostly not null.

We'll do a similar validation with property (and contents of buildings) lost to fire, expressed in dollars.

In [67]:
# About 1.5 minutes to run.
# Same comparison as for deaths, applied to dollar losses per year:
#   *_no_imputation: AVG(col) * COUNT(*) spreads the non-null mean over all rows.
#   *_impute_zero:   SUM(IFNULL(col, 0)) counts every NULL as zero.
loss = pd.read_sql("""
    SELECT
        CAST(SUBSTR(INC_DATE, LENGTH(INC_DATE) - 3, 4) AS INTEGER) AS year,
        AVG(PROP_LOSS) * COUNT(*) AS property_loss_no_imputation,
        SUM(IFNULL(PROP_LOSS, 0)) AS property_loss_impute_zero,
        AVG(CONT_LOSS) * COUNT(*) AS contents_loss_no_imputation,
        SUM(IFNULL(CONT_LOSS, 0)) AS contents_loss_impute_zero
    FROM basic_incident
    GROUP BY CAST(SUBSTR(INC_DATE, LENGTH(INC_DATE) - 3, 4) AS INTEGER)
;""", conn)

# Switch to a dollar format only once the slow query has succeeded, and always
# put pandas' default float format back, even if rendering raises, so the
# dollar format cannot be left set globally.
pd.set_option('display.float_format', '${:,.2f}'.format)
try:
    display(loss)
finally:
    pd.reset_option('display.float_format')

Unnamed: 0,year,property_loss_no_imputation,property_loss_impute_zero,contents_loss_no_imputation,contents_loss_impute_zero
0,2013,"$11,107,744,594.51","$6,100,252,912.00","$4,574,669,116.84","$2,482,542,055.00"
1,2014,"$12,836,098,348.78","$7,073,011,307.00","$5,000,377,606.91","$2,724,793,410.00"
2,2015,"$14,209,249,129.14","$7,757,919,406.00","$6,691,443,744.50","$3,621,441,180.00"
3,2016,"$17,430,723,448.98","$10,031,807,333.00","$10,142,315,833.06","$5,803,008,506.00"
4,2017,"$15,428,082,051.38","$9,054,475,107.00","$7,279,604,959.78","$4,253,582,680.00"
5,2018,"$16,596,251,424.50","$10,260,643,777.00","$9,484,419,649.02","$5,837,203,521.00"
6,2019,"$14,459,406,243.80","$9,270,533,265.00","$6,751,338,994.80","$4,303,154,078.00"
7,2020,"$14,640,862,321.28","$9,771,304,548.00","$4,755,840,349.90","$3,156,023,758.00"


This validation is less clear-cut, with the property loss being about twice as high without imputation.

Later on, we will impute these as 0 to avoid inflating the average values. 

# Calculate average death, injury, property loss, and fire spread by city

Because we want to hold out the most recent years as a validation set, we'll only use fires from 2013–2018 (year < 2019 in the queries below) for the statistics that we calculate.

We'll impute 0 when death/injury, property/contents loss, or alarms is null.

We'll calculate the average exposure number per city. Each fire (including exposure fires resulting from the first fire) shares a STATE/FDID/INC_DATE/INC_NO. (This is the INCIDENT_KEY minus the EXP_NO, or exposure number, which is the final part of the 5-part key.)

By taking the maximum exposure number in the subquery, we'll get a count of the number of exposures per fire.

In [68]:
# Takes about 3 minutes.
# Inner CTE: one row per (CITY, STATE, FDID, INC_DATE, INC_NO) group holding the
# largest EXP_NO seen in that group, restricted to rows whose INC_DATE year
# (its last four characters) is before 2019.
# Outer query: average of those maxima per CITY/STATE, keyed by "CITY,STATE".
avg_spread_sql = """
    WITH sub AS (
        SELECT 
            ia.CITY, 
            ia.STATE, 
            MAX(ia.EXP_NO) as max
        FROM incident_address ia
            JOIN basic_incident bi
            USING (INCIDENT_KEY)
        WHERE CAST(SUBSTR(bi.INC_DATE, LENGTH(bi.INC_DATE) - 3, 4) AS INTEGER) < 2019 
        GROUP BY ia.CITY, ia.STATE, ia.FDID, ia.INC_DATE, ia.INC_NO
    )
    SELECT 
        sub.CITY || ',' || sub.STATE AS CITYSTATE,
        AVG(sub.max) as AVG_SPREAD
    FROM sub
    GROUP BY CITY, STATE 
"""
avg_exp_13_18 = pd.read_sql(sql=avg_spread_sql, con=conn)

It's important to consider that factors like property loss may be reported separately for different sub-fires. E.g., if an initial fire causes two other fires, each of the three fires might have its own associated property loss, which will be reported under a separate INCIDENT_KEY. 

We can aggregate the death, injury, and property/contents loss by grouping by STATE, FDID, INC_DATE, and INC_NO. 

(INCIDENT_KEY, which uniquely identifies a fire, is a composite string formatted like this => STATE + FDID + INC_DATE + INC_NO + EXP_NO.)

Selecting all rows in incident address where exposure number is zero will give us all reported primary fires (i.e. fires that are not the result of an exposure from another fire) where the address is known.

In [69]:
# About 3 minutes to run.
# Per-city averages of money lost, fatalities and injuries per primary fire.
#
# city_totals makes a single pass over the joined tables, restricted to rows
# whose INC_DATE year (its last four characters) is before 2019:
#   * SUM_* columns add up losses/deaths/injuries, treating NULL as 0;
#   * FIRE_COUNT counts primary fires (EXP_NO = 0) among those same rows.
# Computing the numerators and the denominator from the same filtered rows
# keeps them on the same time window; counting primary fires across all years
# would deflate every average and inflate SUPPORT.
#
# The final SELECT:
#   * casts each numerator to REAL so the division can never be truncated by
#     SQLite integer division, whatever the stored column types are;
#   * skips cities with no primary fire in the window (no denominator);
#   * skips groups with a NULL CITY or STATE, which cannot form a CITYSTATE key.
avg_loss_13_18 = pd.read_sql("""
    WITH city_totals AS (
        SELECT
            ia.CITY AS CITY,
            ia.STATE AS STATE,
            SUM(COALESCE(bi.PROP_LOSS, 0)) AS SUM_PROP_LOSS,
            SUM(COALESCE(bi.CONT_LOSS, 0)) AS SUM_CONT_LOSS,
            SUM(COALESCE(bi.OTH_DEATH, 0)) AS SUM_OTH_DEATH,
            SUM(COALESCE(bi.FF_DEATH, 0)) AS SUM_FF_DEATH,
            SUM(COALESCE(bi.OTH_INJ, 0)) AS SUM_OTH_INJ,
            SUM(COALESCE(bi.FF_INJ, 0)) AS SUM_FF_INJ,
            COUNT(CASE WHEN ia.EXP_NO = 0 THEN ia.id END) AS FIRE_COUNT
        FROM basic_incident bi JOIN incident_address ia
            USING (INCIDENT_KEY)
        WHERE CAST(SUBSTR(bi.INC_DATE, LENGTH(bi.INC_DATE) - 3, 4) AS INTEGER) < 2019
        GROUP BY ia.CITY, ia.STATE
    )
    SELECT
        CITY || ',' || STATE AS CITYSTATE,
        CAST(SUM_CONT_LOSS + SUM_PROP_LOSS AS REAL) / FIRE_COUNT AS AVG_MONEY_LOST,
        CAST(SUM_OTH_DEATH + SUM_FF_DEATH AS REAL) / FIRE_COUNT AS AVG_FATALITIES,
        CAST(SUM_OTH_INJ + SUM_FF_INJ AS REAL) / FIRE_COUNT AS AVG_INJURIES,
        FIRE_COUNT AS SUPPORT
    FROM city_totals
    WHERE FIRE_COUNT > 0
        AND CITY IS NOT NULL
        AND STATE IS NOT NULL
""", conn)

Now we'll merge the two DataFrames together. 

In [78]:
# Join the per-city spread and loss summaries on their shared CITYSTATE key.
# Uses the frames built by the two queries above: `avg_exp_13_18` (AVG_SPREAD)
# and `avg_loss_13_18` (AVG_MONEY_LOST, AVG_FATALITIES, AVG_INJURIES, SUPPORT).
# `loss` is the per-year validation table and has no CITYSTATE column, so it
# cannot be merged here.
combined = avg_exp_13_18.merge(right=avg_loss_13_18, how='inner', on='CITYSTATE')

We'll export the results for further analysis.

In [79]:
# Write the merged per-city table next to the other processed data files,
# comma-separated and without the DataFrame index.
output_path = WORKING_DIRECTORY + 'other_nfirs_13_18.csv'
combined.to_csv(output_path, sep=',', index=False)