In [1]:
import requests
from requests import Session
import os
import pandas as pd
from pandas import DataFrame
import sqlite3
from typing import Tuple
from geopy.distance import geodesic
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Geocode addresses from NY incidents

The goal of this notebook is to link HUD REAC inspection addresses with FEMA NFIRS fire incident addresses.

The data from NFIRS doesn't have longitude and latitude, so we'll add this data using the Census Bureau API so that we can compare the geographic locations of the two sets of addresses.

Because we're starting by exploring data in NY in 2015-2019, this notebook will geo-code addresses only for those incidents that occurred in NY during that time frame.

In [2]:
WORKING_DIR = 'D:/Fire Project/data/'
NFIRS_PATHS = ['nfirs_fire_hazmat_pdr_2020/NFIRS_FIRES_2020_022322',
                 'USFA NFIRS 2019 Hazmat/NFIRS_FIRES_2019_011921',
                 'USFA NFIRS 2018 Hazmat/NFIRS_FIRES_2018_110119',
                 'USFA NFIRS 2017 Hazmat/NFIRS_FIRES_2017_020719',
                 'USFA NFIRS 2016 Hazmat/NFIRS_FIRES_2016_02-05-2018',
                 'USFA NFIRS 2015 Hazmat/NFIRS_FIRES_2015_20170215']

This function gets standard, geocoded information about matching addresses from the U.S. Census Bureau API.

In [3]:
def code_address(address: Tuple[str], session: Session) -> dict:
    street = address[0].upper().strip()
    city = address[1].upper().strip()
    state = address[2].upper().strip()
    zipcode = str(address[3]).strip()
    
    BENCHMARK = "Public_AR_Current"
    VINTAGE = "Current_Current"
    OUTPUT_FORMAT = "json"

    # URL encode the address components and construct the API request URL
    URL = f"https://geocoding.geo.census.gov/geocoder/locations/onelineaddress?address={street.replace(' ', '+')},+{city.replace(' ', '+')}%2C+{state}+{zipcode}&benchmark={BENCHMARK}&vintage={VINTAGE}&format={OUTPUT_FORMAT}"

    response = session.get(URL)
    if response.status_code == 200:
        return response.json()
    else:
        print("Request failed")
        return None

We'll start by creating a table to store the latitude and longitude data and connecting it to our existing list of addresses.

In [4]:
conn = sqlite3.Connection('fire_data_copy.db')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS address_geocoded (
        id INTEGER PRIMARY KEY,
        latitude REAL,
        longitude REAL,
        INCIDENT_KEY TEXT,
        FOREIGN KEY (INCIDENT_KEY)
            REFERENCES incident_address(INCIDENT_KEY)
    )
""")
conn.commit()

We also need to define a function to insert the coordinate data into a SQL table once we have the information from the API.

In [5]:
def insert_coordinates(latitude: float, longitude: float, incident_key: str, conn) -> None:
    cur = conn.cursor()
    cur.execute("""INSERT INTO address_geocoded (
                            latitude, 
                            longitude, 
                            INCIDENT_KEY) 
                        VALUES (
                            ?, ?, ?
                        )
    """, (latitude, longitude, incident_key))
    conn.commit()

We'll get a list of address information from our database which we'll use to query the API and get the cooresponding sets of coordinates.

In case we need to start or stop this process, we'll only get addresses that don't already have an associated longitude/latitude in our local db file.

In [6]:
conn = sqlite3.Connection('fire_data_copy.db')
cur = conn.cursor()

cur.execute("""
WITH complete AS (
    SELECT INCIDENT_KEY as key
    FROM address_geocoded)
SELECT ia.INCIDENT_KEY,
    COALESCE(NUM_MILE, '') || ' ' ||
    COALESCE(STREET_PRE, '') || ' ' ||
    COALESCE(STREETNAME, '') || ' ' ||
    COALESCE(STREETTYPE, '') || ' ' ||
    COALESCE(STREETSUF, '') || ' ' ||
    COALESCE(APT_NO, '') as street,
    CITY as city,
    STATE as state,
    ZIP5 as zipcode
FROM incident_address as ia
WHERE ia.INCIDENT_KEY NOT IN (SELECT key FROM complete)
    AND STATE = 'NY'
ORDER BY RANDOM()
""")
ny_addresses = cur.fetchall()

Now we'll go address-by-address returned by the SELECT query and get the coordinates from the API.

In [7]:
session = requests.Session()
count = 0
try:
    for row in ny_addresses:
        incident_key = row[0]
        address = row[1:]
        census_result = code_address(address, session)

        if census_result:
            matches = census_result['result']['addressMatches']
            if matches:
                # Keep only the first match
                matches = matches[0]
                coordinates = matches['coordinates']
                insert_coordinates(coordinates['y'], coordinates['x'], incident_key, conn)
                count += 1
                if count % 1000 == 0:
                    print(count, time.time())
finally:
    session.close()

1000 1682285622.8362489
2000 1682286067.8882256


JSONDecodeError: Expecting value: line 1 column 1 (char 0)