In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sqlite3
import re

# Lift the cap on displayed columns so wide DataFrames are shown in full.
pd.options.display.max_columns = None

In [2]:
# Directory that holds the SQLite database read by this notebook and that
# receives the CSV exports written at the end. Kept with a trailing slash
# because file names are appended by plain string concatenation.
WORKING_DIRECTORY = '../data/processed/'

In [3]:
# Open the SQLite database that every query in this notebook reads from.
#
# The connection is created through the documented sqlite3.connect() factory
# using a `file:` URI with mode=rw. With a plain path, SQLite silently creates
# a new, empty database when the file is missing or the path is mistyped, and
# the mistake only shows up later as "no such table". With mode=rw the open
# itself raises sqlite3.OperationalError if the file does not already exist.
conn = sqlite3.connect(
    'file:' + WORKING_DIRECTORY + 'fire_data.db?mode=rw',
    uri=True
)

# Basic cleaning and validation

Author: Jack Vandeleuv

In this notebook, we'll get basic summary stats about fires for U.S. cities based on the NFIRS data.

We'll start with these columns:
* EXP_NO (exposure number): the number of other structures/vehicles to which the fire spreads. By calculating the average exposure number by city, we can see which cities are most likely to see fires spread.
* PROP_LOSS and CONT_LOSS: the dollar values of lost property and lost contents (anything inside a burning structure/vehicle) respectively. There is also PROP_VAL and CONT_VAL, but these measure the pre-fire value and so we'll ignore them.
* OTH_DEATH, OTH_INJ, FF_DEATH, FF_INJ: deaths and injuries for non-firefighters (OTH_*) and firefighters (FF_*) respectively.

Before we query values from the table, let's look at the percentage of null values per column.

In [4]:
# Disabled cell (kept for reference): computes, for each listed column of
# basic_incident, the fraction of rows where that column is NULL, as
# (COUNT(*) - COUNT(col)) / COUNT(*).
# NOTE(review): presumably commented out to keep a full re-run fast; the
# markdown after this cell appears to summarise its output -- confirm.
# NOTE(review): float_format is set globally here and not reset in this cell.
# # 15 seconds to run
# pd.options.display.float_format = '{:.6f}'.format
# pd.read_sql("""
#     SELECT
#         (CAST(COUNT(*) AS FLOAT) - COUNT(INCIDENT_KEY)) / COUNT(*) AS INCIDENT_KEY_NULL,
#         (CAST(COUNT(*) AS FLOAT) - COUNT(INC_DATE)) / COUNT(*) AS INC_DATE_NULL,
#         (CAST(COUNT(*) AS FLOAT) - COUNT(EXP_NO)) / COUNT(*) AS EXP_NO_NULL,
#         (CAST(COUNT(*) AS FLOAT) - COUNT(ALARMS)) / COUNT(*) AS ALARMS_NULL,
#         (CAST(COUNT(*) AS FLOAT) - COUNT(PROP_LOSS)) / COUNT(*) AS PROP_LOSS_NULL,
#         (CAST(COUNT(*) AS FLOAT) - COUNT(CONT_LOSS)) / COUNT(*) AS CONT_LOSS_NULL,
#         (CAST(COUNT(*) AS FLOAT) - COUNT(OTH_DEATH)) / COUNT(*) AS OTH_DEATH_NULL,
#         (CAST(COUNT(*) AS FLOAT) - COUNT(FF_DEATH)) / COUNT(*) AS FF_DEATH_NULL,
#         (CAST(COUNT(*) AS FLOAT) - COUNT(OTH_INJ)) / COUNT(*) AS OTH_INJ_NULL,
#         (CAST(COUNT(*) AS FLOAT) - COUNT(FF_INJ)) / COUNT(*) AS FF_INJ_NULL
#     FROM basic_incident;
# """, conn)

We have significant numbers of null values for:
* ALARMS
* PROPERTY LOSS AND CONTENTS LOSS
* INJURY AND DEATH (FIREFIGHTER AND NON-FIREFIGHTER)

Do a basic sanity check: find the total number of deaths there would be, assuming that the average can be calculated by disregarding the null values. (In reality, about 3,000 Americans die in fires every year.)

In [5]:
# Disabled cell (kept for reference): yearly death totals under two
# treatments of NULL. The year is the last four characters of INC_DATE.
#   *_no_imputation : AVG(col) * COUNT(*) -- AVG skips NULLs, so this scales
#                     the non-null mean up to every row in the year.
#   *_impute_zero   : SUM(IFNULL(col, 0)) -- NULL counted as zero.
# NOTE(review): float_format is set globally here and not reset in this cell.
# # 45 seconds to run
# pd.options.display.float_format = '{:.1f}'.format
# pd.read_sql("""
#     SELECT
#         CAST(SUBSTR(INC_DATE, LENGTH(INC_DATE) - 3, 4) AS INTEGER) AS year,
#         AVG(FF_DEATH) * COUNT(*) AS ff_death_no_imputation,
#         SUM(IFNULL(FF_DEATH, 0)) AS ff_death_impute_zero,
#         AVG(OTH_DEATH) * COUNT(*) AS other_death_no_imputation,
#         SUM(IFNULL(OTH_DEATH, 0)) AS other_death_impute_zero
#     FROM basic_incident
#     GROUP BY CAST(SUBSTR(INC_DATE, LENGTH(INC_DATE) - 3, 4) AS INTEGER)
# ;""", conn)

~500K/year is much too high for fire deaths! So imputing zero for null values is likely the correct approach for fatalities. There is less effect on firefighter deaths, as these values are mostly not null.

We'll do a similar validation with property (and contents of buildings) lost to fire, expressed in dollars.

In [6]:
# Disabled cell (kept for reference): yearly property and contents loss
# totals under two treatments of NULL. The year is the last four characters
# of INC_DATE.
#   *_no_imputation : AVG(col) * COUNT(*) -- AVG skips NULLs, so this scales
#                     the non-null mean up to every row in the year.
#   *_impute_zero   : SUM(IFNULL(col, 0)) -- NULL counted as zero.
# The dollar float_format set here is reset after the frame is displayed.
# # 1.5 minutes to run
# pd.options.display.float_format = '${:,.2f}'.format
# loss = pd.read_sql("""
#     SELECT
#         CAST(SUBSTR(INC_DATE, LENGTH(INC_DATE) - 3, 4) AS INTEGER) AS year,
#         AVG(PROP_LOSS) * COUNT(*) AS property_loss_no_imputation,
#         SUM(IFNULL(PROP_LOSS, 0)) AS property_loss_impute_zero,
#         AVG(CONT_LOSS) * COUNT(*) AS contents_loss_no_imputation,
#         SUM(IFNULL(CONT_LOSS, 0)) AS contents_loss_impute_zero
#     FROM basic_incident
#     GROUP BY CAST(SUBSTR(INC_DATE, LENGTH(INC_DATE) - 3, 4) AS INTEGER)
# ;""", conn)

# display(loss)
# pd.reset_option('display.float_format')

This validation is less clear-cut, with the property loss being about twice as high without imputation.

Later on, we will impute these as 0 to avoid inflating the average values. 

# Calculate average death, injury, property loss, and fire spread by city

We'll impute 0 when death/injury, property/contents loss, or alarms is null.

We'll calculate the average exposure number per city. Each fire (including exposure fires resulting from the first fire) shares a STATE/FDID/INC_DATE/INC_NO. (This is the INCIDENT_KEY minus the EXP_NO, or exposure number, which is the final part of the 5-part key.)

By taking the maximum exposure number in the subquery, we'll get a count of the number of exposures per fire.

In [7]:
# Fire spread per city and year (takes about 5 minutes).
#
# Inner query: one row per CITY/STATE/FDID/INC_DATE/INC_NO holding the highest
# EXP_NO recorded for it, restricted to rows whose INC_TYPE is below 200.
# Outer query: per city, state and year (last four characters of INC_DATE),
# SUM_SPREAD adds up those maxima and SPREAD_SUPPORT counts the inner rows.
spread_query = """
    WITH sub AS (
        SELECT 
            ia.CITY,
            ia.STATE, 
            ia.FDID, 
            ia.INC_DATE, 
            ia.INC_NO,
            MAX(ia.EXP_NO) as max
        FROM incident_address ia
            JOIN basic_incident bi
            USING (INCIDENT_KEY)
        WHERE bi.INC_TYPE < 200 
        GROUP BY ia.CITY, ia.STATE, ia.FDID, ia.INC_DATE, ia.INC_NO
    )
    SELECT 
        sub.CITY || ',' || sub.STATE AS CITYSTATE,
        SUM(sub.max) AS SUM_SPREAD,
        COUNT(*) AS SPREAD_SUPPORT,
        CAST(SUBSTR(sub.INC_DATE, LENGTH(sub.INC_DATE) - 3, 4) AS INTEGER) as YEAR 
    FROM sub
    GROUP BY 
        sub.CITY, 
        sub.STATE, 
        CAST(SUBSTR(sub.INC_DATE, LENGTH(sub.INC_DATE) - 3, 4) AS INTEGER)
"""
avg_exp = pd.read_sql(sql=spread_query, con=conn)

It's important to consider that factors like property loss may be reported separately for different sub-fires. E.g., if an initial fire causes two other fires, each of the three fires might have its own associated property loss, which will be reported under a separate INCIDENT_KEY. 

We can aggregate the death, injury, and property/contents loss by grouping by STATE, FDID, INC_DATE, and INC_NO. 

(INCIDENT_KEY, which uniquely identifies a fire, is a composite string formatted like this => STATE + FDID + INC_DATE + INC_NO + EXP_NO.)

Selecting all rows in incident address where the exposure number is zero will give us all reported primary fires (i.e. fires that are not the result of an exposure from another fire) where the address is known.

In [8]:
# Loss and casualty totals per city and year (takes about 5 minutes).
#
# For rows whose INC_TYPE is below 200, sums property loss, contents loss,
# deaths and injuries with NULL treated as 0 (COALESCE), grouped by city,
# state and year (last four characters of INC_DATE). LOSS_SUPPORT is the
# number of joined rows in each group.
# NOTE(review): no EXP_NO filter is applied, so LOSS_SUPPORT counts exposure
# rows as well as primary fires -- confirm that this is intended.
loss_query = """
    SELECT
        ia.CITY || ',' || ia.STATE AS CITYSTATE,
        SUM(COALESCE(bi.PROP_LOSS, 0)) as SUM_PROP_LOSS,
        SUM(COALESCE(bi.CONT_LOSS, 0)) as SUM_CONT_LOSS,
        SUM(COALESCE(bi.OTH_DEATH, 0)) as SUM_OTH_DEATH,
        SUM(COALESCE(bi.FF_DEATH, 0)) as SUM_FF_DEATH,
        SUM(COALESCE(bi.OTH_INJ, 0)) as SUM_OTH_INJ,
        SUM(COALESCE(bi.FF_INJ, 0)) as SUM_FF_INJ,
        COUNT(*) AS LOSS_SUPPORT,
        CAST(SUBSTR(ia.INC_DATE, LENGTH(ia.INC_DATE) - 3, 4) AS INTEGER) as YEAR
    FROM basic_incident bi 
        JOIN incident_address ia
        USING (INCIDENT_KEY)
    WHERE bi.INC_TYPE < 200 
    GROUP BY 
        ia.CITY, 
        ia.STATE, 
        CAST(SUBSTR(ia.INC_DATE, LENGTH(ia.INC_DATE) - 3, 4) AS INTEGER)
"""
avg_loss = pd.read_sql(sql=loss_query, con=conn)

We'll export the results for further analysis.

In [10]:
# Write both city/year summaries into WORKING_DIRECTORY as comma-separated
# files without the DataFrame index column.
exports = (
    (avg_loss, 'nfirs_loss.csv'),
    (avg_exp, 'nfirs_spread.csv'),
)
for frame, filename in exports:
    frame.to_csv(WORKING_DIRECTORY + filename, sep=',', index=False)