In [1]:
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
import time
import requests
from sklearn.preprocessing import LabelEncoder

# Show every column when a DataFrame is displayed instead of eliding wide frames.
pd.options.display.max_columns = None

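Start with Philadelphia, public housing, and all fire incidents 2021. (Note: the query below is broader than that — it loads incidents of types 111, 113–116 and 118 for every state in the database, for the years 2015 through 2021.)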

In [2]:
# Load fire incidents joined to their street addresses from the local NFIRS
# SQLite database. The cell was timed at about 14 minutes by its author.
# The query keeps rows whose year (the last four characters of INC_DATE, checked
# on both joined tables) is after 2014 and whose INC_TYPE is one of
# 111, 113, 114, 115, 116 or 118.
FIRE_INCIDENT_SQL = """
    select 
        INC_TYPE, 
        NUM_MILE,
        STREET_PRE,
        STREETNAME,
        STREETTYPE,
        STREETSUF,
        APT_NO,
        ia.STATE,
        ia.CITY,
        ia.ZIP5,
        bi.PROP_LOSS,
        bi.CONT_LOSS,
        bi.OTH_DEATH,
        bi.OTH_INJ,
        bi.FF_DEATH,
        bi.FF_INJ,
        cast(substr(ia.INC_DATE, length(ia.INC_DATE) - 3, 4) AS integer) as inc_year
    from basic_incident bi
        join incident_address ia
        using (INCIDENT_KEY)
    where cast(substr(bi.INC_DATE, length(bi.INC_DATE) - 3, 4) AS integer) > 2014
        and cast(substr(ia.INC_DATE, length(ia.INC_DATE) - 3, 4) AS integer) > 2014
        and (bi.INC_TYPE = 111 or bi.INC_TYPE = 113 or bi.INC_TYPE = 114 or 
                 bi.INC_TYPE = 115 or bi.INC_TYPE = 116 or bi.INC_TYPE = 118)
        
"""

# The connection is left open under the name `conn`, as before.
conn = sqlite3.connect('./data/nfirs/fire_data.db')
df = pd.read_sql(FIRE_INCIDENT_SQL, conn)

In [3]:
# Normalise the column labels returned by the query to lower case.
df.columns = list(map(str.lower, df.columns))

In [4]:
# Build one free-text street address per incident by joining the non-missing
# address components with single spaces.
#
# The previous version used DataFrame.apply(axis=1), which constructs a pandas
# Series for every row and was timed at about 21 minutes on this table.
# Iterating over plain tuples produces the same strings (missing components are
# skipped, everything else is converted with str) without that per-row overhead,
# and it also works when the frame is empty.
address_columns = ['num_mile', 'street_pre', 'streetname', 'streettype', 'streetsuf', 'apt_no']
df['address'] = [
    ' '.join(str(part) for part in parts if pd.notna(part))
    for parts in df[address_columns].itertuples(index=False, name=None)
]

In [5]:
# Number of distinct state codes present in the incident data.
df['state'].nunique()

52

In [6]:
# Incident counts per incident type, largest first.
incidents_by_type = df.groupby('inc_type').size()
incidents_by_type.sort_values(ascending=False)

inc_type
111    1905872
113     808354
118     269018
114     107681
116      48442
115       6154
dtype: int64

In [7]:
# Incident counts per incident year, largest first.
incidents_by_year = df.groupby('inc_year').size()
incidents_by_year.sort_values(ascending=False)

inc_year
2018    465206
2019    465096
2016    461118
2017    455779
2015    453880
2020    423229
2021    421213
dtype: int64

In [8]:
# Upper-case the incident addresses so they compare equal to the upper-cased
# public housing addresses used as merge keys later on.
df['address'] = df['address'].str.upper()

In [9]:
# Read the public housing buildings table and lower-case its column labels.
public = pd.read_csv('./data/Public_Housing_Buildings.csv').rename(columns=str.lower)

  public = pd.read_csv('./data/Public_Housing_Buildings.csv')


In [10]:
# Upper-case the textual address columns of the public housing table; these are
# the columns later matched against the upper-cased incident addresses.
for text_col in ('std_addr', 'std_st', 'std_city'):
    public[text_col] = public[text_col].str.upper()

In [11]:
# Count fires per street address and attach the counts to the public housing
# buildings: once for incidents before 2020 and once for 2020 onwards.
# Only incident types 111 and 113 are counted.
#
# NOTE: the first count column is called `building_fires_11_19` although the
# incident query only loads years after 2014; the name is kept because later
# cells and the exported CSV use it.

# Join keys on the incident side and the matching public housing columns.
INCIDENT_KEYS = ['address', 'city', 'state', 'zip5']
PUBLIC_KEYS = ['std_addr', 'std_city', 'std_st', 'std_zip5']


def _count_fires_by_address(incidents, year_mask, count_col):
    """Count type-111/113 incidents per (address, city, state, zip5).

    Parameters
    ----------
    incidents : DataFrame with `inc_type` and the INCIDENT_KEYS columns.
    year_mask : boolean Series aligned with `incidents` selecting the period.
    count_col : name given to the resulting count column.

    Returns
    -------
    DataFrame with the INCIDENT_KEYS columns plus `count_col`, one row per
    address. groupby skips rows with a missing key and size() never yields
    NaN, so no fill is needed on the result.
    """
    selected = incidents[incidents['inc_type'].isin([111, 113]) & year_mask]
    return selected.groupby(INCIDENT_KEYS).size().reset_index(name=count_col)


def _attach_fire_counts(buildings, counts, count_col):
    """Left-merge per-address counts onto the buildings table.

    Buildings without a matching address get a count of 0. An existing
    `count_col` is dropped first so that re-running the cell replaces the
    column instead of producing suffixed duplicates.
    """
    merged = (
        buildings
        .drop(columns=[count_col], errors='ignore')
        .merge(counts, left_on=PUBLIC_KEYS, right_on=INCIDENT_KEYS, how='left')
        .drop(columns=INCIDENT_KEYS)
    )
    merged[count_col] = merged[count_col].fillna(0).astype(int)
    return merged


incident_year = df['inc_year'].astype(int)

# For the 2015-2019 data
fire_specific15 = _count_fires_by_address(df, incident_year < 2020, 'building_fires_11_19')
public = _attach_fire_counts(public, fire_specific15, 'building_fires_11_19')

# For the 2020-2021 validation data
fire_specific20 = _count_fires_by_address(df, incident_year >= 2020, 'building_fires_20_21')
public = _attach_fire_counts(public, fire_specific20, 'building_fires_20_21')

In [12]:
# Number of building rows with at least one recorded fire in either period.
has_any_fire = (public['building_fires_11_19'] > 0) | (public['building_fires_20_21'] > 0)
int(has_any_fire.sum())

4387

In [13]:
# Distinct street addresses that have a recorded fire and are shared by more
# than one building row (same address, city, state and zip).
with_fire = public['building_fires_11_19'].gt(0) | public['building_fires_20_21'].gt(0)
shares_address = public.duplicated(
    subset=['std_addr', 'std_city', 'std_st', 'std_zip5'], keep=False
)
public.loc[with_fire & shares_address, 'std_addr'].nunique()

171

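171 of the identified street addresses are duplicates: more than one building row shares the address, so the merge above gave every one of those rows the same address-level fire count. In each case, we'll pick a random building to keep the counts and zero out the rest.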

In [14]:
# Shuffle the buildings so that the building kept for each duplicated address
# in the next cell is an arbitrary one. The shuffle is seeded: without a fixed
# random_state a different building would keep the fire counts on every run,
# and the exported CSV would not be reproducible.
SHUFFLE_SEED = 42
df_random_order = public.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [15]:
# Keep the fire counts on exactly one building per street address and set them
# to zero on every other building that shares that address.

# Buildings with no recorded fire in either period are passed through unchanged.
df_random_order_no_fires = df_random_order[
    (df_random_order['building_fires_11_19'] == 0) &
    (df_random_order['building_fires_20_21'] == 0)
]

# Buildings with at least one recorded fire. The explicit copy makes this an
# independent frame, so the .loc assignment below is unambiguous and does not
# raise SettingWithCopyWarning.
df_random_order_some_fires = df_random_order[
    (df_random_order['building_fires_11_19'] > 0) |
    (df_random_order['building_fires_20_21'] > 0)
].copy()

# True for every row after the first one seen for an address. Because the rows
# were shuffled beforehand, the "first" building is an arbitrary one.
is_duplicate = df_random_order_some_fires.duplicated(
    subset=['std_addr', 'std_city', 'std_st', 'std_zip5'],
    keep='first',
)

df_random_order_some_fires.loc[
    is_duplicate, ['building_fires_11_19', 'building_fires_20_21']
] = 0

# Recombine: rows that had fires first, then the rows that never had any.
public = pd.concat([df_random_order_some_fires, df_random_order_no_fires], axis=0)

In [16]:
# Number of building rows that still carry a fire count after de-duplication.
still_has_fire = (public['building_fires_11_19'] > 0) | (public['building_fires_20_21'] > 0)
int(still_has_fire.sum())

2896

In [17]:
# Distinct street addresses that have a recorded fire and are shared by more
# than one building row.
# Note: the one building per address that kept its counts still shares its
# address with the zeroed rows, so it still matches this filter; the number of
# distinct addresses is therefore expected to be the same as before.
with_fire = public['building_fires_11_19'].gt(0) | public['building_fires_20_21'].gt(0)
shares_address = public.duplicated(
    subset=['std_addr', 'std_city', 'std_st', 'std_zip5'], keep=False
)
public.loc[with_fire & shares_address, 'std_addr'].nunique()

171

In [18]:
# Replace every -4 in the incident table with NaN.
# NOTE(review): -4 looks like a missing-value sentinel in this data; confirm.
# This runs after the fire counts were merged into `public`, so those counts
# are not affected by it.
df = df.replace(to_replace=-4, value=np.nan)

In [19]:
# Write the public housing buildings with their fire counts to CSV, without
# the row index.
public.to_csv(path_or_buf='fires_in_ph.csv', index=False)