In [81]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

pd.set_option('display.max_columns', None)

# Standardize placenames

In [82]:
reac_fixed = pd.read_csv('../data/processed/reac_fixed.csv')
nfirs_fixed = pd.read_csv('../data/processed/nfirs_fixed.csv')
nspire_fixed = pd.read_csv('../data/processed/nspire_fixed.csv')

In [83]:
fix_dict = {}

reac_fixes = reac_fixed[reac_fixed.citystate != reac_fixed.corrected_address] \
    .loc[:, ['corrected_address', 'citystate']] \
    .values.tolist()
for fix in reac_fixes:
    fix_dict[fix[1]] = fix[0]

nfirs_fixes = nfirs_fixed[nfirs_fixed.citystate != nfirs_fixed.corrected_address] \
    .loc[:, ['corrected_address', 'citystate']] \
    .values.tolist()
for fix in nfirs_fixes:
    fix_dict[fix[1]] = fix[0]

nspire_fixes = nspire_fixed[nspire_fixed.citystate != nspire_fixed.corrected_address] \
    .loc[:, ['corrected_address', 'citystate']] \
    .values.tolist()
for fix in nspire_fixes:
    fix_dict[fix[1]] = fix[0]

### Standardize REAC

In [84]:
reac_13_18 = pd.read_csv('../data/processed/reac_13-18.csv')

In [85]:
reac_13_18['CITYSTATE'] = reac_13_18.CITYSTATE.apply(
    lambda x: fix_dict[x]
    if x in fix_dict
    else x
)

In [86]:
# Calculate the average latitude and longitude for each CITYSTATE
avg_coordinates = reac_13_18 \
    .loc[:, ['CITYSTATE', 'LATITUDE', 'LONGITUDE']] \
    .groupby('CITYSTATE') \
    .mean() \
    .reset_index()

# Calculate the average inspection scores for each INSPECTION_TYPE
avg_scores = reac_13_18 \
    .loc[:, ['CITYSTATE', 'INSPECTION_SCORE', 'INSPECTION_TYPE']] \
    .groupby(['CITYSTATE', 'INSPECTION_TYPE']) \
    .mean() \
    .reset_index() \
    .pivot(index='CITYSTATE', columns='INSPECTION_TYPE', values='INSPECTION_SCORE') \
    .reset_index() \
    .rename(columns={
        'MULTIFAMILY': 'AVG_SCORE_MULTIFAMILY',
        'PUBLIC': 'AVG_SCORE_PUBLIC'
    }) \
    .rename_axis(None, axis=1)

# Join the two DataFrames together on the CITYSTATE column
reac_13_18 = avg_coordinates.merge(avg_scores, on='CITYSTATE')

### Standardize NFIRS

For NFIRS data, the data is aggregated at the level of city/locality/town/etc, so if we standardize a name we need to combine metrics relying on an average with a weighted average.

In [87]:
nfirs_loss = pd.read_csv('../data/processed/nfirs_loss.csv')
nfirs_spread = pd.read_csv('../data/processed/nfirs_spread.csv')
nfirs_top5 = pd.read_csv('../data/processed/top5_by_type.csv')
nfirs_counts = pd.read_csv('../data/processed/total_incident_count.csv')

Remove rows without a locality name.

In [88]:
nfirs_loss = nfirs_loss[nfirs_loss.CITYSTATE.notna()]
nfirs_spread = nfirs_spread[nfirs_spread.CITYSTATE.notna()]
nfirs_top5 = nfirs_top5[nfirs_top5.CITYSTATE.notna()]
nfirs_counts = nfirs_counts[nfirs_counts.CITYSTATE.notna()]

Remove empty spaces.

In [89]:
def remove_empty_space(x):
    return str(x).strip().replace(', ', ',').replace(' ,', ',')

nfirs_loss['CITYSTATE'] = nfirs_loss.CITYSTATE.apply(remove_empty_space)
nfirs_spread['CITYSTATE'] = nfirs_spread.CITYSTATE.apply(remove_empty_space)
nfirs_top5['CITYSTATE'] = nfirs_top5.CITYSTATE.apply(remove_empty_space)
nfirs_counts['CITYSTATE'] = nfirs_counts.CITYSTATE.apply(remove_empty_space)

Standardize non-standard names.

In [90]:
def standardize_names(x):
    if x in fix_dict:
        return fix_dict[x]
    return x

nfirs_loss['CITYSTATE'] = nfirs_loss.CITYSTATE.apply(standardize_names)
nfirs_spread['CITYSTATE'] = nfirs_spread.CITYSTATE.apply(standardize_names)
nfirs_top5['CITYSTATE'] = nfirs_top5.CITYSTATE.apply(standardize_names)
nfirs_counts['CITYSTATE'] = nfirs_counts.CITYSTATE.apply(standardize_names)

Aggregate the summed values in the DataFrames by CITYSTATE and YEAR (standardizing names may have introduced duplicates).

In [91]:
nfirs_loss = nfirs_loss.groupby(by=['CITYSTATE', 'YEAR']).sum().reset_index()
nfirs_spread = nfirs_spread.groupby(by=['CITYSTATE', 'YEAR']).sum().reset_index()
nfirs_counts = nfirs_counts.groupby(by=['CITYSTATE', 'YEAR']).sum().reset_index()
nfirs_top5 = nfirs_top5.groupby(by=['CITYSTATE', 'YEAR']).sum().reset_index()

### Standardize Census Populations

In [92]:
pops = pd.read_csv('../data/processed/populations_clean.csv')

In [93]:
max_sumlev = pops.groupby('CITYSTATE')['SUMLEV'].transform(max)
pops = pops[pops['SUMLEV'] == max_sumlev]

In [94]:
pops = pops.drop_duplicates(subset=['CITYSTATE'], keep=False)

In [95]:
pops = pops.loc[:, ['CITYSTATE', 'POPULATION']]

### Standardize NSPIRE

In [96]:
nspire = pd.read_csv('../data/processed/nspire_partial_clean.csv',
                     low_memory=False)

In [97]:
nspire['CITYSTATE'] = nspire['Shipping City'].str.upper() \
                    + ',' \
                    + nspire['Shipping State/Province'].str.upper()

Remove rows with no CITYSTATE value, remove empty spaces around locality & state names, and standardize locality names.

In [98]:
nspire = nspire[nspire.CITYSTATE.notna()]
nspire['CITYSTATE'] = nspire.copy().CITYSTATE.apply(remove_empty_space)
nspire['CITYSTATE'] = nspire.copy().CITYSTATE.apply(standardize_names)

Get number of inspections per city.

In [99]:
inspections_per_city = nspire \
    .groupby(by=['Shipping City', 'Inspection ID']) \
    .size() \
    .reset_index()
per_city = inspections_per_city.groupby(by='Shipping City')['Inspection ID'].count()

On average, there are only 2.33 NSPIRE inspections per city.

In [100]:
per_city.describe()

count    1275.000000
mean        2.334118
std         4.624661
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max        95.000000
Name: Inspection ID, dtype: float64

In [101]:
fire_standards = [
    "Carbon Monoxide Alarm",
    "Leak",
    "Chimney",
    "Door - Fire Labeled",
    "Electrical - Outlet and Switch",
    "Electrical - Conductor",
    "Electrical - Service Panel",
    "Electrical - GFCI or AFCI Outlet or Breaker",
    "Fire Escape",
    "Fire Extinguisher",
    "Flammable and Combustible Item",
    "Smoke Alarm",
    "Heating, Ventilation, and Air Conditioning (HVAC)",
    "Kitchen Ventilation",
    "Structural System"
]

In [102]:
for standard in fire_standards:
    nspire = pd.merge(
        nspire, 
        nspire[nspire['NSPIRE Standards'] == standard] \
            .groupby(by='CITYSTATE') \
            .size() \
            .reset_index(),
        on='CITYSTATE',
        how='left'
    )
    nspire = nspire.rename(columns={nspire.columns[-1]: 'Count ' + standard})

In [103]:
new_col_names = ['Count ' + s for s in fire_standards]
nspire_filter = nspire.loc[:, ['CITYSTATE'] + new_col_names]
nspire = nspire_filter \
    .groupby(by='CITYSTATE') \
    .mean() \
    .reset_index() 

# Merge datasets

Our NFIRS data has year values, so we'll need to combine CITYSTATE and YEAR to get a column to merge on. For population, we'll create a column using duplicate values for each year (e.g. NEW YORK,NY,2013 and NEW YORK,NY,2020 will have the same population)

In [104]:
nfirs_loss['CITYSTATE_YEAR'] = nfirs_loss.CITYSTATE.str.upper() + ',' + nfirs_loss.YEAR.astype(str)
nfirs_spread['CITYSTATE_YEAR'] = nfirs_spread.CITYSTATE.str.upper() + ',' + nfirs_spread.YEAR.astype(str)
nfirs_counts['CITYSTATE_YEAR'] = nfirs_counts.CITYSTATE.str.upper() + ',' + nfirs_counts.YEAR.astype(str)
nfirs_top5['CITYSTATE_YEAR'] = nfirs_top5.CITYSTATE.str.upper() + ',' + nfirs_top5.YEAR.astype(str)
pop_df = pops.copy(deep=True)
for year in nfirs_loss.YEAR.unique():
    pop_df_sub = pops.copy(deep=True)
    pop_df_sub['CITYSTATE_YEAR'] = pop_df_sub.CITYSTATE.str.upper() + ',' + str(year)
    pop_df = pd.concat([pop_df, pop_df_sub], axis=0)

In [106]:
reac_cities = reac_13_18.CITYSTATE.unique()

In [107]:
# merged = pd.merge(left=nspire, right=nfirs_loss, on='CITYSTATE', how='outer')
# merged = pd.merge(left=merged, right=nfirs_spread, on='CITYSTATE', how='outer')
# merged = pd.merge(left=merged, right=nfirs_counts, on='CITYSTATE', how='outer')
# merged = pd.merge(left=merged, right=nfirs_top5, on='CITYSTATE', how='outer')
# merged = pd.merge(left=merged, right=pops, on='CITYSTATE', how='left')

merged = pd.merge(left=nfirs_loss, right=nfirs_spread, on='CITYSTATE_YEAR', how='inner')
merged = pd.merge(left=merged, right=nfirs_counts, on='CITYSTATE_YEAR', how='inner')
merged = pd.merge(left=merged, right=nfirs_top5, on='CITYSTATE_YEAR', how='inner')
merged = pd.merge(left=merged, right=pop_df, on='CITYSTATE_YEAR', how='inner')

# Filter by cities that have some public housing
merged = merged[merged.CITYSTATE.isin(reac_cities)]

  merged = pd.merge(left=merged, right=nfirs_top5, on='CITYSTATE_YEAR', how='inner')


In [109]:
merged.to_csv('../data/processed/nfirs_pop_merge_filter_reac.csv', index=False, sep=',')