In [1]:
import requests
from requests import Session
import os
import pandas as pd
from pandas import DataFrame
import sqlite3
from typing import Tuple
from geopy.distance import geodesic
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Folder that holds the raw input files read by this notebook.
WORKING_DIR = 'D:/Fire Project/data/'

# One NFIRS release folder per year, newest (2020) to oldest (2015).
# NOTE(review): presumably these are relative to WORKING_DIR - confirm where they are consumed.
NFIRS_PATHS = [
    'nfirs_fire_hazmat_pdr_2020/NFIRS_FIRES_2020_022322',
    'USFA NFIRS 2019 Hazmat/NFIRS_FIRES_2019_011921',
    'USFA NFIRS 2018 Hazmat/NFIRS_FIRES_2018_110119',
    'USFA NFIRS 2017 Hazmat/NFIRS_FIRES_2017_020719',
    'USFA NFIRS 2016 Hazmat/NFIRS_FIRES_2016_02-05-2018',
    'USFA NFIRS 2015 Hazmat/NFIRS_FIRES_2015_20170215',
]

## Find areas with high number of reported fire incidents and REAC inspections

To start our exploration, let's find the zipcodes that are most represented in both the REAC and NFIRS datasets. This can be a good jumping-off point for comparing the two datasets.

We can find the most common zips in NFIRS using a SQL query.

In [3]:
# Open the SQLite database file in the current working directory. The cursor is
# created alongside the connection but is not used by the query below.
conn = sqlite3.connect('fire_data.db')
cur = conn.cursor()

# Count the rows of incident_address per ZIP5 value, most frequent zipcode first.
zip_counts = pd.read_sql(
    sql="""
SELECT ZIP5 as zipcode, COUNT(*) as count
FROM incident_address
GROUP BY ZIP5
ORDER BY count DESC
""",
    con=conn,
)

Likewise, we can use pandas to count zipcodes in the REAC data.

In [4]:
# Locate the two REAC physical-inspection workbooks inside the data folder.
multifamily_xlsx = WORKING_DIR + 'multifamily_physical_inspection_scores_0321.xlsx'
public_housing_xlsx = WORKING_DIR + 'public_housing_physical_inspection_scores_0321.xlsx'

# Read each workbook into its own DataFrame (pandas default: first sheet).
multifamily_scores = pd.read_excel(multifamily_xlsx)
public_housing_scores = pd.read_excel(public_housing_xlsx)

In [5]:
def _count_rows_per_zip(scores: DataFrame) -> DataFrame:
    """Count how many rows of ``scores`` share each ``ZIPCODE`` value.

    Returns a frame with one row per zipcode, ordered from the largest count
    to the smallest. The first column is ``ZIPCODE``; the count column is
    the unnamed group size, which ``reset_index`` labels ``0``.
    """
    rows_per_zip = scores.groupby(by='ZIPCODE').size()
    return rows_per_zip.sort_values(ascending=False).reset_index()


# Apply the same zipcode tally to both REAC inspection tables.
multi_family_zips = _count_rows_per_zip(multifamily_scores)
public_housing_zips = _count_rows_per_zip(public_housing_scores)

Next, we'll go through the NFIRS zipcodes in order of fire-incident count, highest first, and check each one against the REAC dataset to see whether the same zipcode appears there. Because we are mostly interested in the top results, we'll start with the top 1000 matches.

In [6]:
# Maximum number of matching zipcodes to collect.
MAX_ZIP_MATCHES = 1000

# Index the REAC counts by zipcode once, so each NFIRS zipcode is matched with a
# single dictionary lookup instead of a scan over the whole REAC table.
# Dictionary lookup uses the same equality as `==`, so an integer zipcode still
# matches a float key of equal value (e.g. 60637 and 60637.0).
# Assumes multi_family_zips has one row per zipcode, as produced by the
# groupby that builds it.
reac_count_by_zip = {
    reac_zip: reac_count
    for reac_zip, reac_count in multi_family_zips.values.tolist()
}

# zip_counts is ordered by descending incident count, so walking it in order and
# stopping at the limit keeps the matches with the highest NFIRS counts.
# Each entry is (zipcode, nfirs_count, reac_count).
zip_matches = []
for zipcode, count in zip_counts.values.tolist():
    if zipcode not in reac_count_by_zip:
        continue
    zip_matches.append((zipcode, count, reac_count_by_zip[zipcode]))
    # Stop as soon as the limit is reached so exactly MAX_ZIP_MATCHES are kept.
    if len(zip_matches) >= MAX_ZIP_MATCHES:
        break

We'll add these values to a dataframe and multiply the two counts together, so that zipcodes with high counts in both datasets get the highest combined score.

In [7]:
# Tabulate the matched zipcodes, then add a score equal to the product of the
# NFIRS count and the REAC count so zipcodes high in both rank first.
match_columns = ['zipcode', 'nfirs_count', 'reac_count']
zip_match_df = pd.DataFrame(zip_matches, columns=match_columns).assign(
    combined=lambda matches: matches['nfirs_count'] * matches['reac_count']
)

In [9]:
# Rank the zipcodes by combined score, highest first, and display the top 20.
ranked_zip_matches = zip_match_df.sort_values(by='combined', ascending=False)
ranked_zip_matches.head(20)

Unnamed: 0,zipcode,nfirs_count,reac_count,combined
100,60637,3858,33.0,127314.0
210,2119,3187,35.0,111545.0
8,10029,6920,16.0,110720.0
4,10701,7905,14.0,110670.0
477,60653,2525,41.0,103525.0
11,11207,6328,16.0,101248.0
6,11212,7347,13.0,95511.0
2,27405,10597,9.0,95373.0
17,10002,5734,16.0,91744.0
3,3060,8565,10.0,85650.0
