In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import geopandas as gpd
import gzip 

# Lift the column cap so wide DataFrames are displayed with every column.
pd.options.display.max_columns = None

# Folder holding the processed CSV inputs. File names are appended to this
# string directly, so the trailing slash is required.
WORKING_DIRECTORY = '../data/processed/'

# Merge datasets

In [2]:
# CSV file names, relative to WORKING_DIRECTORY.
# Order matters: the merge step addresses the loaded frames by position
# (position 0 is the base table, position 4 is left-joined last).
PATHS_TO_MERGE = [
    'reac_13-19.csv',            # position 0
    'top_5_incident_count.csv',  # position 1
    'total_incident_count.csv',  # position 2
    'other_nfirs_stats.csv',     # position 3
    'populations_clean.csv',     # position 4
]

Load the DataFrames into a list.

In [3]:
# One DataFrame per entry of PATHS_TO_MERGE, in the same order, so the
# positions in `dfs` line up with the positions in PATHS_TO_MERGE.
dfs = [
    pd.read_csv(f'{WORKING_DIRECTORY}{csv_name}')
    for csv_name in PATHS_TO_MERGE
]

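Merge the DataFrames on `CITYSTATE`. The REAC and NFIRS-derived tables are inner-joined, so only cities present in all of them are kept; the population table is left-joined, so cities without a population match are retained with null population fields.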

In [4]:
# Join everything on CITYSTATE, starting from the first frame.
# Frames 1-3 are inner-joined, so a city has to appear in each of them to
# survive; frame 4 is left-joined, so unmatched cities stay with nulls in
# the columns that frame contributes.
merged_df = (
    dfs[0]
    .merge(dfs[1], on='CITYSTATE', how='inner')
    .merge(dfs[2], on='CITYSTATE', how='inner')
    .merge(dfs[3], on='CITYSTATE', how='inner')
    .merge(dfs[4], on='CITYSTATE', how='left')
)

# Clean and format merged dataset

We have a significant number of duplicate `CITYSTATE` rows, because the census reports data on locations at different levels of granularity. For each `CITYSTATE`, we'll keep the row with the highest population.

In [5]:
# Peek at the rows whose CITYSTATE contains "NEW YORK", leaving out the
# ones that also contain "MILLS" or "WEST".
merged_df.loc[
    merged_df['CITYSTATE'].str.contains('NEW YORK')
    & ~merged_df['CITYSTATE'].str.contains('MILLS')
    & ~merged_df['CITYSTATE'].str.contains('WEST')
]

Unnamed: 0,CITYSTATE,LATITUDE,LONGITUDE,AVG_SCORE_MULTIFAMILY,AVG_SCORE_PUBLIC,COUNT_111,COUNT_113,COUNT_131,COUNT_151,COUNT_142,TOTAL_INCIDENT_COUNT,AVG_SPREAD,AVG_MONEY_LOST,AVG_FATALITIES,AVG_INJURIES,AVG_ALARMS,SUPPORT,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,PRIMGEO_FLAG,FUNCSTAT,NAME,STNAME,POPULATION,POPESTIMATE2020,POPESTIMATE2021,TYPE
12720,"NEW YORK CITY,NY",40.779851,-73.960545,80.75,,14684,103653,10502,4940,32168,213532,0.0,0.0,0.000431,0.025901,0.0,452378,,,,,,,,,,,,,,
12727,"NEW YORK,NY",40.786904,-73.959607,81.421875,67.460784,0,1,0,0,0,1,0.0,0.0,0.0,0.0,0.0,1,162.0,36.0,0.0,51000.0,0.0,0.0,0.0,A,NEW YORK,NY,8804190.0,8772978.0,8467513.0,CITY


In [6]:
# Keep a single row per CITYSTATE: the one with the largest POPULATION.
# After the descending sort the first occurrence of each key is the largest;
# rows with a null POPULATION sort last (pandas' default), so such a row is
# kept only when the key has no populated row at all.
merged_df = (
    merged_df
    .sort_values(by='POPULATION', ascending=False)
    .drop_duplicates(subset='CITYSTATE', keep='first')
)

Calculate nulls per column.

In [7]:
# Share of missing values in each column (null count divided by row count).
merged_df.isna().sum().div(len(merged_df))

CITYSTATE                0.000000
LATITUDE                 0.000000
LONGITUDE                0.000000
AVG_SCORE_MULTIFAMILY    0.132364
AVG_SCORE_PUBLIC         0.587000
COUNT_111                0.000000
COUNT_113                0.000000
COUNT_131                0.000000
COUNT_151                0.000000
COUNT_142                0.000000
TOTAL_INCIDENT_COUNT     0.000000
AVG_SPREAD               0.000000
AVG_MONEY_LOST           0.000000
AVG_FATALITIES           0.000000
AVG_INJURIES             0.000000
AVG_ALARMS               0.000000
SUPPORT                  0.000000
SUMLEV                   0.101665
STATE                    0.101665
COUNTY                   0.101665
PLACE                    0.101665
COUSUB                   0.101665
CONCIT                   0.101665
PRIMGEO_FLAG             0.101665
FUNCSTAT                 0.101665
NAME                     0.101665
STNAME                   0.101665
POPULATION               0.101665
POPESTIMATE2020          0.101665
POPESTIMATE202

Drop cities where population is null.

In [8]:
# Only rows with a known POPULATION can be population-adjusted below.
merged_df = merged_df.dropna(subset=['POPULATION'])

In [9]:
def adjust_by_population(count: float, population: float):
    """Express an incident count relative to a population.

    Args:
        count: Number of incidents.
        population: Population the count is divided by.

    Returns:
        - ``0.0`` when ``count`` equals zero. This check runs first, so it
          applies even if ``population`` is NaN or zero.
        - The string ``'Not available.'`` when either argument is NaN.
        - ``count / population`` when both are non-zero numbers.
        - ``None`` when ``count`` is non-zero but ``population`` is zero,
          because the ratio is undefined.

    NOTE(review): the return type is mixed (float, str or None); callers that
    need a purely numeric column should confirm which cases can occur in
    their data.
    """
    if count == 0:
        return 0.0

    either_missing = math.isnan(count) or math.isnan(population)
    if either_missing:
        return 'Not available.'

    # A zero population would mean dividing by zero; report "no value" instead.
    if not count or not population:
        return None

    return count / population

# Raw incident-count columns that get a population-relative counterpart.
columns_to_adjust = [
    'COUNT_111', 'COUNT_113', 'COUNT_131', 'COUNT_151', 'COUNT_142',
    'TOTAL_INCIDENT_COUNT',
]

# For every raw count column add "<column>_ADJ", computed row by row from
# that row's count and its POPULATION.
for column in columns_to_adjust:
    adjusted = merged_df.apply(
        lambda row: adjust_by_population(row[column], row['POPULATION']),
        axis='columns',
    )
    merged_df[f'{column}_ADJ'] = adjusted

Drop unnecessary columns to reduce size and complexity for exporting.

In [10]:
# Columns left out of the export.
COLUMNS_TO_DROP = [
    # Raw incident counts (their *_ADJ versions are the ones kept).
    'COUNT_111', 'COUNT_113', 'COUNT_131', 'COUNT_151', 'COUNT_142',
    'TOTAL_INCIDENT_COUNT',
    # Code/flag columns that came in with the population data.
    'SUMLEV', 'STATE', 'COUNTY', 'PLACE', 'COUSUB', 'CONCIT',
    'PRIMGEO_FLAG', 'FUNCSTAT',
    # Yearly population estimates; POPULATION itself is kept.
    'POPESTIMATE2020', 'POPESTIMATE2021',
]

# `drop` returns a new frame, so `merged_df` itself keeps all of its columns.
df = merged_df.drop(columns=COLUMNS_TO_DROP)

Clean up names for export.

In [11]:
# Friendlier names for the export. 'STATE' can be reused as a name here
# because the original STATE column is among the columns dropped earlier.
df = df.rename(columns=dict(NAME='CITY', STNAME='STATE'))

Calculate percentile ranks for display on dashboard.

In [12]:
# Columns that receive a percentile rank.
selected_cols = [
    # housing inspection score averages
    'AVG_SCORE_MULTIFAMILY', 'AVG_SCORE_PUBLIC',
    # per-incident averages
    'AVG_SPREAD', 'AVG_MONEY_LOST', 'AVG_FATALITIES', 'AVG_INJURIES',
    'AVG_ALARMS',
    'POPULATION', 'SUPPORT',
    # population-adjusted incident counts
    'COUNT_111_ADJ', 'COUNT_113_ADJ', 'COUNT_131_ADJ', 'COUNT_151_ADJ',
    'COUNT_142_ADJ', 'TOTAL_INCIDENT_COUNT_ADJ',
]

# Rank every selected column independently and express the ranks in
# percentile form (pct=True); the result keeps df's index and column names.
percentile_df = df.loc[:, selected_cols].rank(pct=True)

Add the percentile ranks to our dataframe.

In [13]:
# Put the ranks next to the original values as "<column>_PERCENTILE".
# NOTE: running this cell a second time appends the same columns again.
df = pd.concat(
    [df, percentile_df.add_suffix('_PERCENTILE')],
    axis='columns',
)

Export to JSON.

In [157]:
# Write the dashboard input: orient='records' produces a JSON array with
# one object per row.
df.to_json(path_or_buf='../dashboard/dashboard.json', orient='records')