In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from rapidfuzz import process, fuzz
from tqdm import tqdm
from geopy.distance import geodesic

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

tqdm.pandas()

PROCESSED = '../data/processed/'
RAW = '../data/raw/'

# Import Census placenames

Because we're aggregating our data at the municipal level, we'll match everything together using the Census gazetteer file. (Census gazetteer data [page](https://www.census.gov/geographies/reference-files/time-series/geo/gazetteer-files.html). Direct download [link](https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2022_Gazetteer/2022_Gaz_place_national.zip).)

In [3]:
places = pd.read_csv(RAW + 'census/2022_Gaz_place_national/2022_Gaz_place_national.txt', sep='\t')
places.sample(n=5, random_state=538)

Unnamed: 0,USPS,GEOID,ANSICODE,NAME,LSAD,FUNCSTAT,ALAND,AWATER,ALAND_SQMI,AWATER_SQMI,INTPTLAT,INTPTLONG
12401,MA,2501370,582323,Amherst Town city,25,A,71546035,340031,27.624,0.131,42.363534,-72.507291
28185,TX,4862804,2410981,Rockport city,25,A,42852045,11777462,16.545,4.547,28.047649,-97.049971
6897,IL,1726012,2398874,Fieldon village,47,A,524356,0,0.202,0.0,39.108674,-90.499564
25017,PA,4276184,1215329,Taylor borough,21,A,13473208,0,5.202,0.0,41.395772,-75.714713
9811,IA,1977970,2397022,Thurman city,25,A,1462404,0,0.565,0.0,40.820118,-95.748813


Column definitions.

* USPS: United States Postal Service state abbreviation. This column represents the state in which the place is located.
* GEOID: Geographic Identifier. This is a unique identifier that the Census Bureau assigns to each geographic entity in the United States.
* ANSICODE: The ANSI (American National Standards Institute) code for the place. These codes were previously known as FIPS (Federal Information Processing Standards) codes.
* NAME: The name of the place.
* LSAD: Legal/Statistical Area Description code. This is a code that describes the type of geographic entity (e.g., city, town, village, borough).
* FUNCSTAT: Functional status of the entity. This indicates whether the entity is legally active (A), legally inactive but continues to function as a statistical entity (S), or has no separate functioning government and exists only as a statistical entity (F).
* ALAND: The land area of the place in square meters.
* AWATER: The water area of the place in square meters.
* ALAND_SQMI: The land area of the place in square miles. This is the same as the ALAND column but in different units.
* AWATER_SQMI: The water area of the place in square miles. This is the same as the AWATER column but in different units.
* INTPTLAT: The latitude of the internal point (centroid) of the entity.
* INTPTLONG: The longitude of the internal point (centroid) of the entity.

Standardize place names.

In [4]:
places['NAME'] = places.NAME.str.upper()

Place names have a suffix indicating CITY/TOWN/etc.

In [5]:
places['PLACE_TYPE'] = places.NAME.apply(lambda x: x.split(' ')[-1])
places['NAME'] = places.NAME.apply(lambda x: ' '.join(x.split(' ')[:-1]))

Once we isolate the place types, we see that a few types dominate in our dataset. 

CDP is a "Census Designated Place," which is defined by localities, and often represent unincorporated communities.

In [6]:
places \
    .groupby('PLACE_TYPE') \
    .size() \
    .sort_values(ascending=False)

PLACE_TYPE
CDP             12376
CITY            10197
TOWN             4345
VILLAGE          3731
BOROUGH          1218
COMUNIDAD         214
URBANA             78
(BALANCE)           8
GOVERNMENT          5
TOWNSHIP            5
COUNTY              4
MUNICIPALITY        4
CORPORATION         1
PRINCETON           1
dtype: int64

All states are represented, plus D.C. and Puerto Rico.

In [7]:
places.USPS.unique()

array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
       'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',
       'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
       'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'PR'], dtype=object)

We'll filter and standardize our DataFrame to get ready for matching with our other datasets.

In [8]:
places = places.rename(columns={'USPS': 'STATE'})

In [9]:
places['CITYSTATE'] = places.NAME.str.upper() + ',' + places.STATE.str.upper()

In [10]:
places.columns = [x.strip() for x in places.columns]

In [11]:
places = places.loc[:, ['CITYSTATE', 'NAME', 'STATE', 'PLACE_TYPE', 'INTPTLONG', 'INTPTLAT']]

Finally, check for duplicates.

In [12]:
places[places.duplicated(subset=['CITYSTATE', 'PLACE_TYPE'], keep=False)].sample()

Unnamed: 0,CITYSTATE,NAME,STATE,PLACE_TYPE,INTPTLONG,INTPTLAT
23671,"COALDALE,PA",COALDALE,PA,BOROUGH,-78.21661,40.167359


# Merge REAC data

In [13]:
reac_13_18 = pd.read_csv(PROCESSED + 'reac_13-18.csv', sep=',')

In [14]:
reac_placenames = reac_13_18.groupby(by='CITYSTATE').size().reset_index()

In [15]:
reac_placenames['NAME'] = reac_placenames.CITYSTATE.apply(lambda x: x.split(',')[0])
reac_placenames['STATE'] = reac_placenames.CITYSTATE.apply(lambda x: x.split(',')[1])

### Fuzzy string matching

For every place in places, find all matching name strings in the REAC dataset. Fuzzy string matching will allow us to find similar names that aren't exactly the same.

In [16]:
def match_names(row, choices_df, scorer=fuzz.token_set_ratio, limit=30, score_cutoff=85):
    # Filter choices_df to include only rows with the same 'STATE'
    same_state_choices = choices_df[choices_df['STATE'] == row['STATE']]['NAME'].tolist()

    # Return matching 'NAME' values from filtered dataframe
    return process.extract(row['NAME'], same_state_choices, scorer=scorer, limit=limit, score_cutoff=score_cutoff)

# Apply function to each row of 'places' DataFrame
places['matches'] = places.progress_apply(match_names, args=(reac_placenames,), axis=1)

100%|██████████| 32187/32187 [01:51<00:00, 288.55it/s]


There are about 600 places from the Gazeteer file that match more than one place in the REAC data. Manually inspect these to figure out where there are potential errors.

In [17]:
places[places['matches'].apply(len) > 1] \
    .drop(columns=['CITYSTATE', 'INTPTLONG', 'INTPTLAT'])

Unnamed: 0,NAME,STATE,PLACE_TYPE,matches
62,BREWTON,AL,CITY,"[(BREWTON, 100.0, 27), (EAST BREWTON, 100.0, 58)]"
98,CLANTON,AL,CITY,"[(CLANTON, 100.0, 41), (CLAYTON, 85.71428571428571, 42)]"
101,CLAYTON,AL,TOWN,"[(CLAYTON, 100.0, 42), (CLANTON, 85.71428571428571, 41)]"
111,COLUMBIA,AL,TOWN,"[(COLUMBIA, 100.0, 44), (COLUMBIANA, 88.88888888888889, 45)]"
112,COLUMBIANA,AL,CITY,"[(COLUMBIANA, 100.0, 45), (COLUMBIA, 88.88888888888889, 44)]"
...,...,...,...,...
31538,SHEBOYGAN,WI,CITY,"[(SHEBOYGAN FALLS, 100.0, 217), (SHEBOYGAN, 100.0, 218)]"
31539,SHEBOYGAN FALLS,WI,CITY,"[(SHEBOYGAN FALLS, 100.0, 217), (SHEBOYGAN, 100.0, 218)]"
31556,SOUTH MILWAUKEE,WI,CITY,"[(MILWAUKEE, 100.0, 144), (SOUTH MILWAUKEE, 100.0, 228)]"
31651,WEST MILWAUKEE,WI,VILLAGE,"[(MILWAUKEE, 100.0, 144), (WEST MILWAUKEE, 100.0, 268)]"


There are about 50 place names from the Gazeteer file that have potential problems (based on manual inspection of all rows.) 

Create a dict with the changes we'll make to the REAC dataset to make it compatible. All rows in the REAC data matching a key in this dict will be renamed to the corresponding value.

Procedure:
* If there is an exact match and a match that seems like a derivation (e.g. Tuskegee, AL and Tuskegee Institute, AL), rename the derivation.
* If none of the near matches from the REAC are exact matches, and all the near matches in the REAC are valid placenames, do nothing.
* If there is one exact match, and the other matches don't seem like derivations, do nothing (i.e. use the exact match only for the spatial join).

In [18]:
reac_renames = {
    'TUSKEGEE INSTITUTE,AL': 'TUSKEGEE,AL',
    'W HARTFORD,CT': 'WEST HARTFORD,CT',
    'E HARTFORD,CT': 'EAST HARTFORD,CT',
    'N FORT MYERS,FL': 'NORTH FORT MYERS,FL',
    'FT LAUDERDALE,FL': 'FORT LAUDERDALE,FL',
    'CLINTON TWP,MI': 'CLINTON TOWNSHIP,MI',
    'JEFFERSON CTY,MO': 'JEFFERSON CITY,MO',
    'NEW YORK CITY,NY': 'NEW YORK,NY',
    'RICHMOND HTS,OH': 'RICHMOND HEIGHTS,OH',
    'MT PLEASANT,TX': 'MOUNT PLEASANT,TX',
    
}

Rename the designated place names in the REAC dataset.

In [19]:
reac_13_18['CITYSTATE'] = reac_13_18.CITYSTATE.progress_apply(
    lambda x: reac_renames[x] 
        if x in reac_renames 
        else x
)

100%|██████████| 64195/64195 [00:00<00:00, 875176.64it/s]


### Remove duplicates

Aggregate the REAC data at the placename level.

In [20]:
# Calculate the average latitude and longitude for each CITYSTATE
avg_coordinates = reac_13_18 \
    .loc[:, ['CITYSTATE', 'LATITUDE', 'LONGITUDE']] \
    .groupby('CITYSTATE') \
    .mean() \
    .reset_index()

# Calculate the average inspection scores for each INSPECTION_TYPE
avg_scores = reac_13_18 \
    .loc[:, ['CITYSTATE', 'INSPECTION_SCORE', 'INSPECTION_TYPE']] \
    .groupby(['CITYSTATE', 'INSPECTION_TYPE']) \
    .mean() \
    .reset_index() \
    .pivot(index='CITYSTATE', columns='INSPECTION_TYPE', values='INSPECTION_SCORE') \
    .reset_index() \
    .rename(columns={
        'MULTIFAMILY': 'AVG_SCORE_MULTIFAMILY',
        'PUBLIC': 'AVG_SCORE_PUBLIC'
    }) \
    .rename_axis(None, axis=1)

# Join the two DataFrames together on the CITYSTATE column
reac_13_18 = avg_coordinates.merge(avg_scores, on='CITYSTATE')

In [21]:
print('REAC places:', len(reac_13_18))

REAC places: 8418


Create preliminary merge between REAC and Gazeteer places.

In [22]:
reac_merge = pd.merge(left=places, right=reac_13_18, how='left', on='CITYSTATE')

Calculate distance between the centroid coordinates in each dataset. This will help pick between duplicates.

In [23]:
reac_merge['distance'] = reac_merge.progress_apply(
    # Haversine distance between the two centroids.
    lambda x: geodesic((x['LATITUDE'], x['LONGITUDE']), (x['INTPTLAT'], x['INTPTLONG'])).miles
    if pd.notna(x['LATITUDE']) and pd.notna(x['LONGITUDE'])
    else None,
    axis=1
)

100%|██████████| 32187/32187 [00:01<00:00, 17872.27it/s]


Find the rows with duplicate values (meaning that the left join on the Gazeteer place name had more than one match).

In [24]:
dups = reac_merge[
    (reac_merge.LATITUDE.notna()) & 
    (reac_merge.LONGITUDE.notna()) &
    (reac_merge.duplicated(subset=['LATITUDE', 'LONGITUDE'], keep=False))
]
print('Duplicate rows:', len(dups))
display(dups.head(6))

Duplicate rows: 102


Unnamed: 0,CITYSTATE,NAME,STATE,PLACE_TYPE,INTPTLONG,INTPTLAT,matches,LATITUDE,LONGITUDE,AVG_SCORE_MULTIFAMILY,AVG_SCORE_PUBLIC,distance
1042,"COTTONWOOD,AZ",COTTONWOOD,AZ,CDP,-109.890109,36.072152,"[(COTTONWOOD, 100.0, 8)]",34.723079,-112.012107,95.333333,,151.642929
1043,"COTTONWOOD,AZ",COTTONWOOD,AZ,CITY,-111.984001,34.750844,"[(COTTONWOOD, 100.0, 8)]",34.723079,-112.012107,95.333333,,2.494184
1926,"SALEM,AR",SALEM,AR,CITY,-91.821706,36.370416,"[(SALEM, 100.0, 147)]",36.373771,-91.835589,84.5,72.0,0.807943
1927,"SALEM,AR",SALEM,AR,CDP,-92.561492,34.630899,"[(SALEM, 100.0, 147)]",36.373771,-91.835589,84.5,72.0,126.931724
2212,"BURBANK,CA",BURBANK,CA,CITY,-118.326405,34.190079,"[(BURBANK, 100.0, 40)]",34.182162,-118.308008,89.666667,,1.186717
2213,"BURBANK,CA",BURBANK,CA,CDP,-121.930608,37.321097,"[(BURBANK, 100.0, 40)]",34.182162,-118.308008,89.666667,,297.071744


For each duplicate, keep the row where the centroid coordinate is the closest between the REAC and Gazeteer datasets, and drop the rest. (To accomplish this, store the names of the rows to drop.)

In [25]:
dups_to_drop = dups \
    .sort_values(by='distance') \
    .drop_duplicates(subset='CITYSTATE', keep='last') \
    .loc[:, ['CITYSTATE', 'PLACE_TYPE', 'INTPTLONG', 'INTPTLAT']]

Filter out the rows in dups_to_drop.

In [26]:
reac_merge = reac_merge[
    (~reac_merge.CITYSTATE.isin(dups_to_drop.CITYSTATE)) | 
    (~reac_merge.PLACE_TYPE.isin(dups_to_drop.PLACE_TYPE)) | 
    (~reac_merge.INTPTLONG.isin(dups_to_drop.INTPTLONG)) | 
    (~reac_merge.INTPTLAT.isin(dups_to_drop.INTPTLAT))
]

# Merge NFIRS data (do this first)

In [27]:
nfirs_top5 = pd.read_csv(PROCESSED + 'top5_by_type_13_18.csv')
nfirs_counts = pd.read_csv(PROCESSED + 'total_incident_count_13_18.csv')
nfirs_other = pd.read_csv(PROCESSED + 'other_nfirs_13_18.csv')

In [33]:
nfirs_top5[nfirs_top5.CITYSTATE.str.contains('NEW YORK')]

Unnamed: 0,CITYSTATE,COUNT_111,COUNT_113,COUNT_131,COUNT_151,COUNT_142
39921,"NEW YORK CITY,NY",14684,103653,10502,4940,32168
39922,"NEW YORK MILLS,MN",37,0,3,2,0
39923,"NEW YORK MILLS,NY",18,10,3,1,2
39924,"NEW YORK,NJ",1,0,0,0,0
39925,"NEW YORK,NY",0,1,0,0,0
39926,"NEW YORK,TX",1,0,1,1,0
59530,"WEAT NEW YORK,NJ",0,1,0,0,0
59837,"WEST NEW YORK,NJ",0,1,0,0,0
60174,"WEST NEW YORK,NJ",64,336,23,35,27
60513,"WESTNEW YORK,NJ",1,0,0,0,0


# Merge Census population data

In [179]:
census = pd.read_csv(PROCESSED + 'populations_clean.csv')

Filter census so that if there are multiple rows with the same NAME, STATE, PLACE_TYPE, and POPULATION, we treat these as a duplicate and remove all but one.

In [205]:
census = census.drop_duplicates(
    subset=['CITYSTATE', 'PLACE_TYPE', 'POPULATION'], 
    keep='first'
)

In [207]:
def match_census(row, choices_df, scorer=fuzz.token_set_ratio, limit=30, score_cutoff=85):
    # Filter choices_df to include only rows with the same 'STATE'
    same_state_choices = choices_df[
        (choices_df['STNAME'] == row['STATE']) &
        (choices_df['PLACE_TYPE'] == row['PLACE_TYPE'])
    ]['NAME'].tolist()

    # Return matching 'NAME' values from filtered dataframe
    return process.extract(
        row['NAME'], 
        same_state_choices, 
        scorer=scorer, 
        limit=limit, 
        score_cutoff=score_cutoff
    )

# Apply function to each row of 'places' DataFrame
reac_merge['census_matches'] = reac_merge.progress_apply(match_census, args=(census,), axis=1)

100%|██████████| 32136/32136 [02:33<00:00, 209.25it/s]


In [218]:
reac_merge[
    (reac_merge.census_matches.apply(len) > 1) &
    (reac_merge.matches.apply(len) > 0)
    
]
census[census.NAME.str.contains('WAUPUN')]

Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,PRIMGEO_FLAG,FUNCSTAT,NAME,STNAME,POPULATION,POPESTIMATE2020,POPESTIMATE2021,PLACE_TYPE,CITYSTATE
66821,162,55,0,84425,0,0,0,A,WAUPUN,WI,11515,11500,11515,CITY,"WAUPUN,WI"
67522,61,55,27,0,84425,0,0,F,WAUPUN,WI,7958,7947,7948,CITY,"WAUPUN,WI"
67720,61,55,39,0,84425,0,0,F,WAUPUN,WI,3557,3553,3567,CITY,"WAUPUN,WI"
67721,61,55,39,0,84450,0,1,A,WAUPUN,WI,1377,1374,1383,TOWN,"WAUPUN,WI"


In [221]:
census['POPULATION'] = census['POPULATION'].astype(int)

# Grouping by 'CITYSTATE' and 'PLACE_TYPE'
grouped = census.groupby(['CITYSTATE', 'PLACE_TYPE'])

# List to store rows that meet the condition
valid_rows = []

for name, group in grouped:
    # Skip if group has only one row
    if len(group) == 1:
        continue

    # Get maximum population in group
    max_pop = group['POPULATION'].max()

    # Get the sum of populations of all other rows
    sum_other_rows = group.loc[group['POPULATION'] != max_pop, 'POPULATION'].sum()

    # Check if max population equals sum of other rows' population
    if max_pop == sum_other_rows:
        # Append the row with max population to the valid_rows list
        valid_rows.append(group.loc[group['POPULATION'] == max_pop])

# Concatenate valid rows into a new DataFrame
filtered_df = pd.concat(valid_rows)

# Reset the index of the new DataFrame
filtered_df.reset_index(drop=True, inplace=True)

filtered_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  census['POPULATION'] = census['POPULATION'].astype(int)


Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,PRIMGEO_FLAG,FUNCSTAT,NAME,STNAME,POPULATION,POPESTIMATE2020,POPESTIMATE2021,PLACE_TYPE,CITYSTATE
0,162,55,0,100,0,0,0,A,ABBOTSFORD,WI,2266,2263,2325,CITY,"ABBOTSFORD,WI"
1,162,42,0,364,0,0,0,A,ADAMSTOWN,PA,1943,1948,1981,BOROUGH,"ADAMSTOWN,PA"
2,162,55,0,2375,0,0,0,A,APPLETON,WI,75315,75300,74854,CITY,"APPLETON,WI"
3,162,46,0,2180,0,0,0,A,ARLINGTON,SD,915,920,921,CITY,"ARLINGTON,SD"
4,162,55,0,3225,0,0,0,A,ASHLAND,WI,7887,7861,7918,CITY,"ASHLAND,WI"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,162,55,0,86925,0,0,0,A,WHITEWATER,WI,14384,14375,14351,CITY,"WHITEWATER,WI"
128,162,38,0,86580,0,0,0,A,WILTON,ND,720,718,716,CITY,"WILTON,ND"
129,162,55,0,88150,0,0,0,A,WISCONSIN DELLS,WI,2943,2964,3198,CITY,"WISCONSIN DELLS,WI"
130,162,55,0,89150,0,0,0,A,WRIGHTSTOWN,WI,3186,3215,3237,VILLAGE,"WRIGHTSTOWN,WI"


### Finish the above section ^^^

Note: For the Census, you should do some grouping, where you detect if the row in the group with the highest population is equal to the sum of the other rows (indicated sub-geographies). This could narrow down duplicates a lot.

# Merge datasets

In [70]:
PATHS_TO_MERGE = [
    'reac_13-18.csv',
    'top5_by_type_13_18.csv',
    'total_incident_count_13_18.csv',
    'other_nfirs_13_18.csv',
    'populations_clean.csv'
]

Load the DataFrames into a list.

In [71]:
dfs = [
    pd.read_csv(PROCESSED + path) \
    for path in PATHS_TO_MERGE
]

Merge the DataFrames based on NFIRS and REAC data.

In [72]:
merged_df = pd.merge(dfs[0], dfs[1], on='CITYSTATE', how='inner')
merged_df = pd.merge(merged_df, dfs[2], on='CITYSTATE', how='inner')
merged_df = pd.merge(merged_df, dfs[3], on='CITYSTATE', how='inner')
merged_df = pd.merge(merged_df, dfs[4], on='CITYSTATE', how='left')

# Clean and format merged dataset

We have significant numbers of duplicates, because the census reports data on locations at different levels of granularity. For each match, we'll keep the one with the highest population.

In [73]:
merged_df[
    merged_df.CITYSTATE.str.contains('NEW YORK') &
    ~merged_df.CITYSTATE.str.contains('MILLS') &
    ~merged_df.CITYSTATE.str.contains('WEST')

    ]

Unnamed: 0,CITYSTATE,LATITUDE,LONGITUDE,AVG_SCORE_MULTIFAMILY,AVG_SCORE_PUBLIC,COUNT_111,COUNT_113,COUNT_131,COUNT_151,COUNT_142,TOTAL_INCIDENT_COUNT,AVG_SPREAD,AVG_MONEY_LOST,AVG_FATALITIES,AVG_INJURIES,SUPPORT,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,PRIMGEO_FLAG,FUNCSTAT,NAME,STNAME,POPULATION,POPESTIMATE2020,POPESTIMATE2021,TYPE
12719,"NEW YORK CITY,NY",40.779851,-73.960545,80.75,,14684,103653,10502,4940,32168,213532,0.0,0.0,0.00033,0.019843,590475,,,,,,,,,,,,,,
12726,"NEW YORK,NY",40.786904,-73.959607,81.421875,67.460784,0,1,0,0,0,1,0.0,0.0,0.0,0.0,10,162.0,36.0,0.0,51000.0,0.0,0.0,0.0,A,NEW YORK,NY,8804190.0,8772978.0,8467513.0,CITY


In [74]:
merged_df = merged_df.sort_values('POPULATION', ascending=False)
merged_df = merged_df.drop_duplicates(subset='CITYSTATE', keep='first')

Calculate nulls per column.

In [75]:
merged_df.isnull().sum() / merged_df.shape[0]

CITYSTATE                0.000000
LATITUDE                 0.000000
LONGITUDE                0.000000
AVG_SCORE_MULTIFAMILY    0.132380
AVG_SCORE_PUBLIC         0.586948
COUNT_111                0.000000
COUNT_113                0.000000
COUNT_131                0.000000
COUNT_151                0.000000
COUNT_142                0.000000
TOTAL_INCIDENT_COUNT     0.000000
AVG_SPREAD               0.000000
AVG_MONEY_LOST           0.000000
AVG_FATALITIES           0.000000
AVG_INJURIES             0.000000
SUPPORT                  0.000000
SUMLEV                   0.101554
STATE                    0.101554
COUNTY                   0.101554
PLACE                    0.101554
COUSUB                   0.101554
CONCIT                   0.101554
PRIMGEO_FLAG             0.101554
FUNCSTAT                 0.101554
NAME                     0.101554
STNAME                   0.101554
POPULATION               0.101554
POPESTIMATE2020          0.101554
POPESTIMATE2021          0.101554
TYPE          

Drop cities where population is null.

In [76]:
merged_df = merged_df[merged_df.POPULATION.notna()]

In [77]:
def adjust_by_population(count: float, population: float):
    if count == 0:
        return 0.0

    if math.isnan(count) or math.isnan(population):
        return 'Not available.'
    
    if count and population:
        return count / population

columns_to_adjust = [
    'COUNT_111',
    'COUNT_113',
    'COUNT_131',
    'COUNT_151',
    'COUNT_142',
    'TOTAL_INCIDENT_COUNT'
]

for column in columns_to_adjust:
    merged_df[column + '_ADJ'] = merged_df \
        .apply(lambda x: adjust_by_population(x[column], x['POPULATION']), axis=1)

Drop unnecessary columns to reduce size an complexity for exporting.

In [78]:
COLUMNS_TO_DROP = [
    'COUNT_111', 
    'COUNT_113', 
    'COUNT_131', 
    'COUNT_151', 
    'COUNT_142',
    'TOTAL_INCIDENT_COUNT',
    'SUMLEV',
    'STATE',
    'COUNTY',
    'PLACE',
    'COUSUB',
    'CONCIT',
    'PRIMGEO_FLAG',
    'FUNCSTAT',
    'POPESTIMATE2020',
    'POPESTIMATE2021',
    
]
df = merged_df.drop(labels=COLUMNS_TO_DROP, axis=1)

Clean up names for export.

In [79]:
df

Unnamed: 0,CITYSTATE,LATITUDE,LONGITUDE,AVG_SCORE_MULTIFAMILY,AVG_SCORE_PUBLIC,AVG_SPREAD,AVG_MONEY_LOST,AVG_FATALITIES,AVG_INJURIES,SUPPORT,NAME,STNAME,POPULATION,TYPE,COUNT_111_ADJ,COUNT_113_ADJ,COUNT_131_ADJ,COUNT_151_ADJ,COUNT_142_ADJ,TOTAL_INCIDENT_COUNT_ADJ
12726,"NEW YORK,NY",40.786904,-73.959607,81.421875,67.460784,0.000000,0.000000,0.000000,0.000000,10,NEW YORK,NY,8804190.0,CITY,0.000000,1.135823e-07,0.000000,0.000000,0.000000,1.135823e-07
10428,"LOS ANGELES,CA",34.035896,-118.280411,83.661710,90.703704,0.023631,8033.523961,0.000694,0.003123,74935,LOS ANGELES,CA,3893986.0,CITY,0.000859,1.127123e-03,0.001623,0.000126,0.001539,9.789198e-03
3313,"CHICAGO,IL",41.857026,-87.647237,81.193741,77.160622,0.000000,1823.956843,0.000797,0.007762,180620,CHICAGO,IL,2747231.0,CITY,0.002720,5.109144e-03,0.003352,0.000222,0.002838,2.420656e-02
8361,"HOUSTON,TX",29.769034,-95.407266,81.088083,90.900000,0.002370,5685.706329,0.000821,0.005741,103473,HOUSTON,TX,2302792.0,CITY,0.004494,1.385275e-03,0.003619,0.000480,0.002106,1.958318e-02
14229,"PHOENIX,AZ",33.493673,-112.070861,88.753333,82.485714,0.000000,4406.245392,0.000000,0.000072,55177,PHOENIX,AZ,1607739.0,CITY,0.003629,9.112176e-04,0.000167,0.000219,0.003774,1.949819e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10406,"LOOKEBA,OK",35.399200,-98.385700,,82.000000,0.000000,0.000000,0.000000,0.000000,6,LOOKEBA,OK,83.0,TOWN,0.012048,0.000000e+00,0.000000,0.012048,0.000000,6.024096e-02
6645,"GAYLORD,KS",39.661094,-98.816327,,85.000000,0.000000,4106.944444,0.000000,0.000000,36,GAYLORD,KS,82.0,CITY,0.048780,1.219512e-02,0.012195,0.048780,0.036585,3.780488e-01
3724,"COGSWELL,ND",46.106903,-97.782284,94.000000,,0.000000,0.000000,0.000000,0.000000,6,COGSWELL,ND,73.0,CITY,0.000000,0.000000e+00,0.013699,0.000000,0.000000,2.739726e-02
2299,"BRUCETON MILLS,WV",39.688533,-79.546500,63.000000,,0.000000,329.670330,0.000000,0.000000,273,BRUCETON MILLS,WV,64.0,TOWN,0.671875,3.125000e-02,0.468750,0.375000,0.015625,2.359375e+00


In [80]:
df = df.rename(columns={
    'NAME': 'CITY',
    'STNAME': 'STATE'
})

Calculate percentile ranks for display on dashboard.

In [81]:
selected_cols = [
    'AVG_SCORE_MULTIFAMILY', 
    'AVG_SCORE_PUBLIC', 
    'AVG_SPREAD', 
    'AVG_MONEY_LOST', 
    'AVG_FATALITIES', 
    'AVG_INJURIES', 
    'POPULATION', 
    'SUPPORT',
    'COUNT_111_ADJ', 
    'COUNT_113_ADJ', 
    'COUNT_131_ADJ', 
    'COUNT_151_ADJ', 
    'COUNT_142_ADJ', 
    'TOTAL_INCIDENT_COUNT_ADJ'
]

percentile_df = df[selected_cols].rank(pct=True)

Add the percentile ranks to our dataframe.

In [82]:
df = pd.concat([df, percentile_df.add_suffix('_PERCENTILE')], axis=1)

Export to JSON.

In [83]:
df.to_json('../dashboard/dashboard.json', orient='records')