In [69]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import geopandas as gpd
import gzip 

# Show every column when a DataFrame is displayed instead of eliding wide frames.
pd.options.display.max_columns = None

# Directory holding the processed CSV inputs; file names are appended to it
# directly, so the trailing slash is required.
WORKING_DIRECTORY = '../data/processed/'

# Merge datasets

In [70]:
# CSV file names (relative to WORKING_DIRECTORY) to load and merge.
# Order matters: the merge cell addresses the loaded frames by position
# (dfs[0] .. dfs[4]), and the last entry is the one joined with how='left'.
PATHS_TO_MERGE = [
    'reac_13-18.csv',
    'top5_by_type_13_18.csv',
    'total_incident_count_13_18.csv',
    'other_nfirs_13_18.csv',
    'populations_clean.csv'
]

Load the DataFrames into a list.

In [71]:
# One DataFrame per entry of PATHS_TO_MERGE, in the same order as that list.
dfs = [
    pd.read_csv(f'{WORKING_DIRECTORY}{filename}')
    for filename in PATHS_TO_MERGE
]

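Merge the DataFrames on `CITYSTATE`: the first four tables are inner-joined, then the population table is left-joined so that rows without a population match are kept.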

In [72]:
# Join every loaded frame on CITYSTATE, left to right.
# The three inner joins keep only keys present in all of dfs[0]..dfs[3];
# the final left join keeps those rows even when dfs[4] has no matching key.
merged_df = (
    dfs[0]
    .merge(dfs[1], on='CITYSTATE', how='inner')
    .merge(dfs[2], on='CITYSTATE', how='inner')
    .merge(dfs[3], on='CITYSTATE', how='inner')
    .merge(dfs[4], on='CITYSTATE', how='left')
)

# Clean and format merged dataset

We have a significant number of duplicate rows, because the census reports data on locations at different levels of granularity. For each `CITYSTATE` value, we'll keep the row with the highest population.

In [73]:
# Show the rows whose CITYSTATE contains 'NEW YORK' but neither 'MILLS'
# nor 'WEST' (the two exclusions are combined into one regex alternation).
merged_df.loc[
    merged_df['CITYSTATE'].str.contains('NEW YORK')
    & ~merged_df['CITYSTATE'].str.contains('MILLS|WEST')
]

Unnamed: 0,CITYSTATE,LATITUDE,LONGITUDE,AVG_SCORE_MULTIFAMILY,AVG_SCORE_PUBLIC,COUNT_111,COUNT_113,COUNT_131,COUNT_151,COUNT_142,TOTAL_INCIDENT_COUNT,AVG_SPREAD,AVG_MONEY_LOST,AVG_FATALITIES,AVG_INJURIES,SUPPORT,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,PRIMGEO_FLAG,FUNCSTAT,NAME,STNAME,POPULATION,POPESTIMATE2020,POPESTIMATE2021,TYPE
12719,"NEW YORK CITY,NY",40.779851,-73.960545,80.75,,14684,103653,10502,4940,32168,213532,0.0,0.0,0.00033,0.019843,590475,,,,,,,,,,,,,,
12726,"NEW YORK,NY",40.786904,-73.959607,81.421875,67.460784,0,1,0,0,0,1,0.0,0.0,0.0,0.0,10,162.0,36.0,0.0,51000.0,0.0,0.0,0.0,A,NEW YORK,NY,8804190.0,8772978.0,8467513.0,CITY


In [74]:
# Order rows from largest to smallest POPULATION (missing values sort last by
# default), then keep only the first row seen for each CITYSTATE, i.e. the
# one with the highest population.
merged_df = (
    merged_df
    .sort_values(by='POPULATION', ascending=False)
    .drop_duplicates(subset='CITYSTATE', keep='first')
)

Calculate the fraction of null values in each column.

In [75]:
# Share of missing values per column: the mean of the boolean null mask
# equals (number of nulls) / (number of rows).
merged_df.isna().mean()

CITYSTATE                0.000000
LATITUDE                 0.000000
LONGITUDE                0.000000
AVG_SCORE_MULTIFAMILY    0.132380
AVG_SCORE_PUBLIC         0.586948
COUNT_111                0.000000
COUNT_113                0.000000
COUNT_131                0.000000
COUNT_151                0.000000
COUNT_142                0.000000
TOTAL_INCIDENT_COUNT     0.000000
AVG_SPREAD               0.000000
AVG_MONEY_LOST           0.000000
AVG_FATALITIES           0.000000
AVG_INJURIES             0.000000
SUPPORT                  0.000000
SUMLEV                   0.101554
STATE                    0.101554
COUNTY                   0.101554
PLACE                    0.101554
COUSUB                   0.101554
CONCIT                   0.101554
PRIMGEO_FLAG             0.101554
FUNCSTAT                 0.101554
NAME                     0.101554
STNAME                   0.101554
POPULATION               0.101554
POPESTIMATE2020          0.101554
POPESTIMATE2021          0.101554
TYPE          

Drop cities where population is null.

In [76]:
# Keep only rows with a known population. The explicit .copy() makes the
# result an independent frame, so the new *_ADJ columns assigned to it
# afterwards do not trigger pandas' SettingWithCopyWarning.
merged_df = merged_df.loc[merged_df['POPULATION'].notna()].copy()

In [77]:
def adjust_by_population(count: float, population: float):
    """Return ``count`` divided by ``population`` (a per-capita rate).

    Parameters
    ----------
    count : float
        Raw count for one place.
    population : float
        Population of the same place.

    Returns
    -------
    float or str
        * ``0.0`` when ``count`` is zero. This is checked first, so it holds
          even when ``population`` is NaN or zero.
        * The string ``'Not available.'`` when either argument is NaN, or
          when ``population`` is zero (no rate can be computed).
        * ``count / population`` otherwise.

    Notes
    -----
    A non-zero count with a zero population used to fall off the end of the
    function and return ``None`` implicitly; it now returns the same
    ``'Not available.'`` sentinel as the other "no usable value" case.
    Because the sentinel is a string, any column that receives it becomes
    object-typed.
    """
    # Zero events is a zero rate regardless of the population value.
    if count == 0:
        return 0.0

    # No meaningful rate without both numbers, or with an empty population.
    if math.isnan(count) or math.isnan(population) or population == 0:
        return 'Not available.'

    return count / population

# Count columns that get a population-adjusted companion column.
columns_to_adjust = [
    'COUNT_111', 'COUNT_113', 'COUNT_131', 'COUNT_151', 'COUNT_142',
    'TOTAL_INCIDENT_COUNT',
]

# For each count column, add '<column>_ADJ' holding the row-wise result of
# adjust_by_population(count, POPULATION).
for column in columns_to_adjust:
    merged_df[f'{column}_ADJ'] = merged_df.apply(
        lambda row: adjust_by_population(row[column], row['POPULATION']),
        axis=1,
    )

Drop unnecessary columns to reduce size and complexity for exporting.

In [78]:
# Columns left out of the exported frame.
COLUMNS_TO_DROP = [
    # Raw counts (their *_ADJ counterparts are kept).
    'COUNT_111', 'COUNT_113', 'COUNT_131', 'COUNT_151', 'COUNT_142',
    'TOTAL_INCIDENT_COUNT',
    # Columns that came in with the population table.
    'SUMLEV', 'STATE', 'COUNTY', 'PLACE', 'COUSUB', 'CONCIT',
    'PRIMGEO_FLAG', 'FUNCSTAT',
    'POPESTIMATE2020', 'POPESTIMATE2021',
]

# drop() returns a new frame; merged_df itself is left untouched.
df = merged_df.drop(columns=COLUMNS_TO_DROP)

Clean up names for export.

In [79]:
# Display the trimmed frame to check which columns remain after the drop.
df

Unnamed: 0,CITYSTATE,LATITUDE,LONGITUDE,AVG_SCORE_MULTIFAMILY,AVG_SCORE_PUBLIC,AVG_SPREAD,AVG_MONEY_LOST,AVG_FATALITIES,AVG_INJURIES,SUPPORT,NAME,STNAME,POPULATION,TYPE,COUNT_111_ADJ,COUNT_113_ADJ,COUNT_131_ADJ,COUNT_151_ADJ,COUNT_142_ADJ,TOTAL_INCIDENT_COUNT_ADJ
12726,"NEW YORK,NY",40.786904,-73.959607,81.421875,67.460784,0.000000,0.000000,0.000000,0.000000,10,NEW YORK,NY,8804190.0,CITY,0.000000,1.135823e-07,0.000000,0.000000,0.000000,1.135823e-07
10428,"LOS ANGELES,CA",34.035896,-118.280411,83.661710,90.703704,0.023631,8033.523961,0.000694,0.003123,74935,LOS ANGELES,CA,3893986.0,CITY,0.000859,1.127123e-03,0.001623,0.000126,0.001539,9.789198e-03
3313,"CHICAGO,IL",41.857026,-87.647237,81.193741,77.160622,0.000000,1823.956843,0.000797,0.007762,180620,CHICAGO,IL,2747231.0,CITY,0.002720,5.109144e-03,0.003352,0.000222,0.002838,2.420656e-02
8361,"HOUSTON,TX",29.769034,-95.407266,81.088083,90.900000,0.002370,5685.706329,0.000821,0.005741,103473,HOUSTON,TX,2302792.0,CITY,0.004494,1.385275e-03,0.003619,0.000480,0.002106,1.958318e-02
14229,"PHOENIX,AZ",33.493673,-112.070861,88.753333,82.485714,0.000000,4406.245392,0.000000,0.000072,55177,PHOENIX,AZ,1607739.0,CITY,0.003629,9.112176e-04,0.000167,0.000219,0.003774,1.949819e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10406,"LOOKEBA,OK",35.399200,-98.385700,,82.000000,0.000000,0.000000,0.000000,0.000000,6,LOOKEBA,OK,83.0,TOWN,0.012048,0.000000e+00,0.000000,0.012048,0.000000,6.024096e-02
6645,"GAYLORD,KS",39.661094,-98.816327,,85.000000,0.000000,4106.944444,0.000000,0.000000,36,GAYLORD,KS,82.0,CITY,0.048780,1.219512e-02,0.012195,0.048780,0.036585,3.780488e-01
3724,"COGSWELL,ND",46.106903,-97.782284,94.000000,,0.000000,0.000000,0.000000,0.000000,6,COGSWELL,ND,73.0,CITY,0.000000,0.000000e+00,0.013699,0.000000,0.000000,2.739726e-02
2299,"BRUCETON MILLS,WV",39.688533,-79.546500,63.000000,,0.000000,329.670330,0.000000,0.000000,273,BRUCETON MILLS,WV,64.0,TOWN,0.671875,3.125000e-02,0.468750,0.375000,0.015625,2.359375e+00


In [80]:
# Give the place-name and state-abbreviation columns their export names.
export_names = dict(NAME='CITY', STNAME='STATE')
df = df.rename(columns=export_names)

Calculate percentile ranks for display on dashboard.

In [81]:
# Numeric columns that receive a percentile rank.
selected_cols = [
    'AVG_SCORE_MULTIFAMILY', 'AVG_SCORE_PUBLIC',
    'AVG_SPREAD', 'AVG_MONEY_LOST', 'AVG_FATALITIES', 'AVG_INJURIES',
    'POPULATION', 'SUPPORT',
    'COUNT_111_ADJ', 'COUNT_113_ADJ', 'COUNT_131_ADJ', 'COUNT_151_ADJ',
    'COUNT_142_ADJ', 'TOTAL_INCIDENT_COUNT_ADJ',
]

# rank(pct=True) replaces each value with its rank within its own column,
# expressed as a fraction of that column's ranked values.
percentile_df = df.loc[:, selected_cols].rank(pct=True)

Add the percentile ranks to our dataframe.

In [82]:
# Append the rank columns side by side, each named '<column>_PERCENTILE'.
df = pd.concat(
    objs=[df, percentile_df.add_suffix('_PERCENTILE')],
    axis='columns',
)

Export to JSON.

In [83]:
# Write the frame as a JSON array with one object per row.
df.to_json(path_or_buf='../dashboard/dashboard.json', orient='records')