In [1]:
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
import time
import requests
from sklearn.preprocessing import LabelEncoder

# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

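Start with Philadelphia, public housing, and all fire incidents in 2021. Note: the query below is not restricted to Philadelphia or to 2021; it loads fire incidents for every city and state with an incident year after 2014.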

In [2]:
# 20 minutes to run
# Load fire incidents joined to their addresses from the local NFIRS SQLite database.
# The incident year is taken as the last four characters of INC_DATE; only rows whose
# year is after 2014 (in both tables) and whose inc_type is <= 200 are kept.
# NOTE(review): `<= 200` also admits code 200 itself -- confirm whether `< 200` was intended.
conn = sqlite3.connect('./data/nfirs/fire_data.db')
try:
    df = pd.read_sql("""
    select 
        INC_TYPE, 
        NUM_MILE,
        STREET_PRE,
        STREETNAME,
        STREETTYPE,
        STREETSUF,
        APT_NO,
        ia.STATE,
        ia.CITY,
        ia.ZIP5,
        bi.PROP_LOSS,
        bi.CONT_LOSS,
        bi.OTH_DEATH,
        bi.OTH_INJ,
        bi.FF_DEATH,
        bi.FF_INJ,
        cast(substr(ia.INC_DATE, length(ia.INC_DATE) - 3, 4) AS integer) as inc_year
    from basic_incident bi
        join incident_address ia
        using (INCIDENT_KEY)
    where cast(substr(bi.INC_DATE, length(bi.INC_DATE) - 3, 4) AS integer) > 2014
        and cast(substr(ia.INC_DATE, length(ia.INC_DATE) - 3, 4) AS integer) > 2014
        and bi.inc_type <= 200
""", conn)
finally:
    # Release the database handle even if the long-running query fails.
    conn.close()

In [3]:
# Normalise the incident column names to lower case.
df.columns = df.columns.str.lower()

In [5]:
# Build one free-text address per incident by joining its non-null components, in
# order, with single spaces. Plain tuple iteration replaces DataFrame.apply(axis=1),
# which constructed a Series for every row (previously noted as ~21 minutes to run);
# it also yields an empty column, rather than failing, when df has no rows.
address_parts = ['num_mile', 'street_pre', 'streetname', 'streettype', 'streetsuf', 'apt_no']
df['address'] = [
    ' '.join(str(part) for part in row if pd.notna(part))
    for row in df[address_parts].itertuples(index=False, name=None)
]

In [8]:
# Number of distinct state values present in the incident data.
df['state'].nunique()

52

In [25]:
# Incident counts per inc_type, most frequent first.
(
    df
    .groupby('inc_type')
    .size()
    .sort_values(ascending=False)
)

inc_type
111    1905872
113     808354
131     785355
142     649914
151     621113
143     561741
150     289788
100     284786
118     269018
154     221103
140     194341
141     178879
130     120342
114     107681
112     103117
132     102356
160      95225
162      66191
121      60288
138      58921
116      48442
161      29896
122      20147
137      19369
171      16475
170      16410
153      10071
134       9990
123       9290
117       9145
152       8729
173       7296
120       6399
115       6154
163       6026
155       4136
133       3697
136       3322
110       2058
164       2011
135       1667
172       1031
dtype: int64

In [9]:
# Upper-case the assembled address so it can be compared with the upper-cased
# public-housing addresses.
df['address'] = df['address'].str.upper()

In [10]:
# Load the public-housing buildings table and lower-case its column names.
# low_memory=False makes pandas infer each column's dtype from the whole file rather
# than chunk by chunk. The recorded output of this cell shows a warning pointing at
# this read_csv call -- presumably a mixed-type DtypeWarning; confirm on re-run.
public = pd.read_csv('./data/Public_Housing_Buildings.csv', low_memory=False)
public.columns = [name.lower() for name in public.columns]

  public = pd.read_csv('./data/Public_Housing_Buildings.csv')


In [11]:
# Upper-case the standardised address, state and city fields of the housing table.
for text_col in ('std_addr', 'std_st', 'std_city'):
    public[text_col] = public[text_col].str.upper()

In [27]:
# Column pairs used to match a public-housing building to fire incidents, in join order.
PUBLIC_KEYS = ['std_addr', 'std_city', 'std_st', 'std_zip5']
FIRE_KEYS = ['address', 'city', 'state', 'zip5']
# Incident types counted for the "house+cooking" columns.
FIRE_TYPES = [111, 113]


def add_fire_counts(buildings, incidents, count_col, first_year=None, last_year=None):
    """Return `buildings` with a per-address count of matching fire incidents.

    Parameters
    ----------
    buildings : DataFrame
        Must contain the PUBLIC_KEYS columns.
    incidents : DataFrame
        Must contain the FIRE_KEYS columns plus `inc_type` and `inc_year`.
    count_col : str
        Name of the integer count column to add.
    first_year, last_year : int or None
        Optional inclusive bounds on `inc_year`; None leaves that side unbounded.

    Returns
    -------
    DataFrame
        A new frame: `buildings` plus `count_col`, which is 0 where no incident
        matched the building's address, city, state and zip.
    """
    keep = incidents['inc_type'].isin(FIRE_TYPES)
    years = incidents['inc_year'].astype(int)
    if first_year is not None:
        keep = keep & (years >= first_year)
    if last_year is not None:
        keep = keep & (years <= last_year)

    selected = incidents.loc[keep, FIRE_KEYS].copy()
    # The housing-side address, city and state are upper-cased before this point, so
    # upper-case the incident side too; otherwise mixed-case values could never match.
    # Assumes these three columns hold strings -- TODO confirm.
    for text_col in ('address', 'city', 'state'):
        selected[text_col] = selected[text_col].str.upper()

    # groupby drops rows with a missing key, so the counts contain no NaN to fill.
    counts = (
        selected
        .groupby(FIRE_KEYS)
        .size()
        .reset_index(name=count_col)
        .rename(columns=dict(zip(FIRE_KEYS, PUBLIC_KEYS)))
    )

    # Drop a count column left over from a previous run of this cell so that re-running
    # it does not create suffixed duplicate columns.
    merged = (
        buildings
        .drop(columns=[count_col], errors='ignore')
        .merge(counts, on=PUBLIC_KEYS, how='left', validate='many_to_one')
    )
    # Buildings with no matching incident get a count of zero.
    merged[count_col] = merged[count_col].fillna(0).astype(int)
    return merged


# For the 2015-2019 data (every incident year before 2020 that is present in df)
public = add_fire_counts(public, df, 'house+cooking_fires_15-19', last_year=2019)

# For the 2020-2021 validation data (every incident year from 2020 onward)
public = add_fire_counts(public, df, 'house+cooking_fires_20-21', first_year=2020)

In [81]:
# from scipy.stats import pearsonr

# corrs = []
# public_pa_ny = public
# # Iterate over numerical columns in public
# for col in public.select_dtypes(include='number').columns:
#     if col not in ['all_fires', 'block2kx', 'bg2kx', 'place_inc2kx', 
#                    'hlc', 'dpvrc', 'std_zip9', 'dpbc', 'dpbc_cksum', 'std_zip11', 
#                    'c1pprb', 'blkgrp_level', 'exec_dir_fax', 
#                    'national_bldg_id', 'dpvact', 'dpvnost', 'place2kx',
#                    'dpv', 'zcta2kx', 'place_level', 'annl_expns_amnt_prev_yr', 'x', 'y',
#                    'state2kx', 'cnty2kx', 'tract2kx', 'curcnty', 'curcosub', 'msa',
#                    'cbsa', 'necta', 'metro', 'micro', 'lat', 'lon', 'county_level',
#                    'tract_level', 'ha_fax_num', 'exec_dir_phone', 'house+cooking_fires_15-19',
#                     'house+cooking_fires_20-21']:
#         # Get correlation and p-value
#         correlation, p_value = pearsonr(public[col], public['house+cooking_fires_15-19'])
#         corrs.append((col, correlation, p_value))

# corrs_df = pd.DataFrame(corrs, columns=['columns', 'corr', 'p_value'])
# corrs_df['significant'] = corrs_df['p_value'] < 0.05

In [86]:
# Write the housing table, including the fire-count columns, to the working directory
# without the index column.
output_path = 'fires_in_ph.csv'
public.to_csv(output_path, index=False)