In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

# Register tqdm's pandas integration (progress-bar variants of apply).
tqdm.pandas()

# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Standardize placenames

In [2]:
# Load the place-name correction table for each source. The following cell
# reads the `citystate` and `corrected_address` columns from every one of them.
reac_fixed = pd.read_csv('../data/processed/reac_fixed.csv')
nfirs_fixed = pd.read_csv('../data/processed/nfirs_fixed.csv')
nspire_fixed = pd.read_csv('../data/processed/nspire_fixed.csv')

In [3]:
# Lookup from a raw place name (`citystate`) to its corrected form
# (`corrected_address`), pooled over the three correction tables. The tables
# are processed in the order REAC, NFIRS, NSPIRE, so if the same raw name
# occurs in several of them the entry from the later table wins.
fix_dict = {}

for fixed in (reac_fixed, nfirs_fixed, nspire_fixed):
    # Keep only rows where a correction was actually made. Rows with a missing
    # value on either side are skipped explicitly: NaN != NaN evaluates to
    # True, so without the notna() guards a valid name could be mapped to NaN
    # (and later vanish in a group-by) or NaN itself could become a key.
    changed = fixed.loc[
        fixed.citystate.notna()
        & fixed.corrected_address.notna()
        & (fixed.citystate != fixed.corrected_address),
        ['citystate', 'corrected_address'],
    ]
    fix_dict.update(zip(changed.citystate, changed.corrected_address))

### Standardize REAC

In [4]:
# Load the REAC inspection records (presumably 2013-2018, going by the file
# name). Later cells use its CITYSTATE, LATITUDE, LONGITUDE, INSPECTION_SCORE
# and INSPECTION_TYPE columns.
reac_13_18 = pd.read_csv('../data/processed/reac_13-18.csv')

In [5]:
# Swap in the corrected place name wherever `fix_dict` records one; names
# without a recorded correction pass through unchanged.
reac_13_18['CITYSTATE'] = reac_13_18['CITYSTATE'].apply(
    lambda name: fix_dict.get(name, name)
)

In [6]:
# Mean latitude / longitude of the records sharing each CITYSTATE.
avg_coordinates = (
    reac_13_18
    .groupby('CITYSTATE')[['LATITUDE', 'LONGITUDE']]
    .mean()
    .reset_index()
)

# Mean inspection score per CITYSTATE and INSPECTION_TYPE, reshaped so that
# every inspection type becomes its own column. The MULTIFAMILY and PUBLIC
# columns get an AVG_SCORE_ prefix; any other type keeps its raw name.
avg_scores = (
    reac_13_18
    .groupby(['CITYSTATE', 'INSPECTION_TYPE'])[['INSPECTION_SCORE']]
    .mean()
    .reset_index()
    .pivot(index='CITYSTATE', columns='INSPECTION_TYPE', values='INSPECTION_SCORE')
    .reset_index()
    .rename(columns={
        'MULTIFAMILY': 'AVG_SCORE_MULTIFAMILY',
        'PUBLIC': 'AVG_SCORE_PUBLIC'
    })
    .rename_axis(None, axis=1)
)

# Combine both summaries into one row per CITYSTATE (default inner join).
# Note: this rebinds reac_13_18 from the record-level table to the summary.
reac_13_18 = avg_coordinates.merge(avg_scores, on='CITYSTATE')

### Standardize NFIRS

The NFIRS data is already aggregated at the level of city/locality/town/etc. When standardizing a name merges several of these rows into one, any metric that is an average must therefore be recombined as a weighted average.

In [7]:
# Load the four NFIRS tables. Each one carries a CITYSTATE column, which the
# cells below use for filtering, name standardization and grouping.
nfirs_loss = pd.read_csv('../data/processed/nfirs_loss_13_18.csv')
nfirs_spread = pd.read_csv('../data/processed/nfirs_spread_13_18.csv')
nfirs_top5 = pd.read_csv('../data/processed/top5_by_type_13_18.csv')
nfirs_counts = pd.read_csv('../data/processed/total_incident_count_13_18.csv')

Remove rows without a locality name.

In [8]:
# Keep only the rows that have a locality name. Each result is taken as an
# explicit copy: later cells assign to the CITYSTATE column of these frames,
# and assigning into a boolean-mask slice raises SettingWithCopyWarning.
nfirs_loss = nfirs_loss[nfirs_loss.CITYSTATE.notna()].copy()
nfirs_spread = nfirs_spread[nfirs_spread.CITYSTATE.notna()].copy()
nfirs_top5 = nfirs_top5[nfirs_top5.CITYSTATE.notna()].copy()
nfirs_counts = nfirs_counts[nfirs_counts.CITYSTATE.notna()].copy()

Remove stray whitespace around the locality and state names.

In [9]:
def remove_empty_space(x):
    """Return ``x`` as a string with whitespace around its edges and commas removed.

    The value is converted with ``str``, split on commas, and each piece is
    stripped of leading/trailing whitespace before the pieces are re-joined
    with a bare ``,``. Any amount and kind of whitespace next to a comma is
    removed (not just a single space), while whitespace inside a piece, such
    as the space in ``"NEW YORK"``, is preserved.

    Note that a missing value would come back as the literal string ``'nan'``;
    callers in this notebook filter missing values out beforehand.
    """
    return ','.join(part.strip() for part in str(x).split(','))

# Normalize the whitespace of the CITYSTATE column in each NFIRS table. The
# column is replaced in place on the existing frame objects.
for nfirs_frame in (nfirs_loss, nfirs_spread, nfirs_top5, nfirs_counts):
    nfirs_frame['CITYSTATE'] = nfirs_frame['CITYSTATE'].apply(remove_empty_space)

Standardize non-standard names.

In [10]:
def standardize_names(x):
    """Return the corrected place name for ``x``, or ``x`` itself if none is recorded.

    Corrections are looked up in the notebook-level ``fix_dict`` mapping.
    """
    return fix_dict.get(x, x)

# Replace non-standard locality names in each NFIRS table with their corrected
# form. The column is replaced in place on the existing frame objects.
for nfirs_frame in (nfirs_loss, nfirs_spread, nfirs_top5, nfirs_counts):
    nfirs_frame['CITYSTATE'] = nfirs_frame['CITYSTATE'].apply(standardize_names)

Aggregate the summed values in the DataFrames at the locality level.

In [11]:
# Collapse rows that now share a CITYSTATE by summing every other column.
# NOTE(review): the notebook text above says metrics that are averages need a
# weighted average, yet every column here is summed - confirm that none of
# these tables holds an averaged metric.
nfirs_loss, nfirs_spread, nfirs_counts, nfirs_top5 = (
    frame.groupby(by='CITYSTATE').sum().reset_index()
    for frame in (nfirs_loss, nfirs_spread, nfirs_counts, nfirs_top5)
)

### Standardize Census Populations

In [12]:
# Load the cleaned population table; the cells below use its CITYSTATE,
# SUMLEV and POPULATION columns.
pops = pd.read_csv('../data/processed/populations_clean.csv')

In [13]:
# For every CITYSTATE keep only the rows carrying that name's largest SUMLEV
# value. The string alias 'max' is used instead of the Python builtin `max`:
# passing the builtin to transform() is deprecated in recent pandas releases.
# NOTE(review): SUMLEV is presumably the Census summary-level code - confirm
# that the largest code is the row that should be kept.
max_sumlev = pops.groupby('CITYSTATE')['SUMLEV'].transform('max')
pops = pops[pops['SUMLEV'] == max_sumlev]

In [14]:
# keep=False discards *every* row of a CITYSTATE that still occurs more than
# once, so only unambiguous place names remain.
pops = pops.drop_duplicates(subset='CITYSTATE', keep=False)

In [15]:
# Only the join key and the population figure are needed for the merge.
pops = pops[['CITYSTATE', 'POPULATION']]

### Standardize NSPIRE

In [16]:
# Load the partially cleaned NSPIRE extract. low_memory=False has pandas infer
# each column's dtype from the whole file instead of chunk by chunk.
nspire = pd.read_csv('../data/processed/nspire_partial_clean.csv',
                     low_memory=False)

In [17]:
# Build an upper-cased "CITY,STATE" key from the two shipping columns. The key
# is missing wherever either of the two parts is missing.
nspire['CITYSTATE'] = (
    nspire['Shipping City'].str.upper()
    + ','
    + nspire['Shipping State/Province'].str.upper()
)

Remove rows with no CITYSTATE value, remove empty spaces around locality & state names, and standardize locality names.

In [18]:
# Drop rows without a CITYSTATE. The filtered frame is copied once so the
# column assignment below writes to an independent frame; copying the whole
# frame merely to read one column (as before) was wasteful and did not prevent
# SettingWithCopyWarning.
nspire = nspire[nspire.CITYSTATE.notna()].copy()

# Normalize whitespace first, then map non-standard names to corrected ones.
nspire['CITYSTATE'] = (
    nspire['CITYSTATE']
    .apply(remove_empty_space)
    .apply(standardize_names)
)

Get number of inspections per city.

In [19]:
# One row per distinct (Shipping City, Inspection ID) pair ...
inspections_per_city = (
    nspire
    .groupby(['Shipping City', 'Inspection ID'])
    .size()
    .reset_index()
)
# ... then the number of distinct inspections recorded for each city.
# NOTE(review): grouping uses the raw 'Shipping City' column rather than
# CITYSTATE, so same-named cities in different states are counted together -
# confirm this is intended.
per_city = inspections_per_city.groupby('Shipping City')['Inspection ID'].count()

On average, there are only 2.33 NSPIRE inspections per city.

In [20]:
# Summary statistics of the per-city inspection counts.
per_city.describe()

count    1275.000000
mean        2.334118
std         4.624661
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max        95.000000
Name: Inspection ID, dtype: float64

In [21]:
# NSPIRE standards to tally per locality (the fire-related subset, going by the
# variable name). Each entry is compared for exact equality against the
# 'NSPIRE Standards' column and yields one 'Count <standard>' column.
fire_standards = [
    "Carbon Monoxide Alarm",
    "Leak",
    "Chimney",
    "Door - Fire Labeled",
    "Electrical - Outlet and Switch",
    "Electrical - Conductor",
    "Electrical - Service Panel",
    "Electrical - GFCI or AFCI Outlet or Breaker",
    "Fire Escape",
    "Fire Extinguisher",
    "Flammable and Combustible Item",
    "Smoke Alarm",
    "Heating, Ventilation, and Air Conditioning (HVAC)",
    "Kitchen Ventilation",
    "Structural System"
]

In [22]:
# For every standard, attach to each NSPIRE row the number of rows its
# CITYSTATE has for that standard, in a column named 'Count <standard>'.
for standard in fire_standards:
    count_col = 'Count ' + standard

    # Name the count column explicitly instead of renaming whichever column
    # happens to be last after the merge (previously the integer label 0).
    standard_counts = (
        nspire[nspire['NSPIRE Standards'] == standard]
        .groupby(by='CITYSTATE')
        .size()
        .reset_index(name=count_col)
    )
    nspire = pd.merge(nspire, standard_counts, on='CITYSTATE', how='left')

    # A locality that appears in NSPIRE but has no row for this standard has a
    # count of zero, not a missing count. Missing values are thereby reserved
    # for localities that are absent from NSPIRE altogether.
    nspire[count_col] = nspire[count_col].fillna(0)

In [23]:
# Reduce NSPIRE to one row per CITYSTATE holding only the count columns. Every
# row of a locality carries the same count, so the mean returns that count.
new_col_names = ['Count ' + standard_name for standard_name in fire_standards]
nspire_filter = nspire[['CITYSTATE'] + new_col_names]
nspire = (
    nspire_filter
    .groupby('CITYSTATE')
    .mean()
    .reset_index()
)

# Merge datasets

In [24]:
# Outer-join the NSPIRE counts with each NFIRS table in turn, so a locality
# present in any of the sources is kept.
merged = nspire
for nfirs_frame in (nfirs_loss, nfirs_spread, nfirs_counts, nfirs_top5):
    merged = merged.merge(nfirs_frame, on='CITYSTATE', how='outer')

# Populations are attached with a left join: they add a column, never rows.
merged = merged.merge(pops, on='CITYSTATE', how='left')

In [25]:
# Write the merged table to disk as comma-separated text, without the index.
merged.to_csv('../data/processed/nspire_nfirs_merge.csv', index=False, sep=',')