In [31]:
from collections import defaultdict

import numpy as np
import pandas as pd

In [32]:
# Geography crosswalk. Later cells read CITYNAME, COUNTYNAME, STATEABV, STATE,
# COUNTY, TRACT, ZIP and TZ_RATIO from this frame.
geog = pd.read_csv('derived_data/geography.csv')

In [33]:
# State lookup table: each row unpacks to (state name, FIPS code, abbreviation).
df_state = pd.read_csv('raw_data/states_fips.csv', sep=", ", engine='python')

# Positional spreadsheet columns -> standardized names shared by every state table.
crime_column_names = {
    0: "AGENCY",
    1: "LOCATION",
    2: "POPULATION",
    3: "Total Offenses",
    4: "Crimes Against Persons",
    5: "Crimes Against Property",
    6: "Crimes Against Society",
}


def load_state_table(state, fips, abv):
    """Read one state's 2023 offense-by-agency workbook and standardize it.

    The first 5 rows and last 2 rows of the sheet are skipped, only the first
    seven columns are kept, AGENCY is forward-filled (presumably the agency
    type is only written on the first row of each group -- confirm against the
    workbooks), LOCATION is lower-cased, and the state identifiers are attached.
    """
    table = pd.read_excel(
        f'raw_data/fbi_cde/stateTables/{state.replace(" ", "_")}_Offense_Type_by_Agency_2023.xlsx',
        skiprows=5,
        skipfooter=2,
        header=None,
    )
    table = table.loc[:, :6].rename(columns=crime_column_names)
    return table.assign(
        AGENCY=table["AGENCY"].ffill(),
        STATENAME=state,
        STATE=fips,
        STATEABV=abv,
        LOCATION=table["LOCATION"].str.lower(),
    )


# Stack every state's table into one frame.
df_crime = pd.concat([load_state_table(*state_row) for state_row in df_state.values])

# Limit to city + county data
city_and_county_agencies = ["Cities", "Metropolitan Counties", "Nonmetropolitan Counties"]
df_crime = df_crime[df_crime["AGENCY"].isin(city_and_county_agencies)]

In [34]:
# The FBI tables report cities and counties separately, and the county rows do
# not contain the cities inside them. FBI CDE provides city-level population
# estimates but none for counties, so total up the population of the reporting
# cities in each county here; that total ("diff") is later subtracted from the
# census county population.

# (state abbreviation, lower-cased city name) -> FBI city population
crime_cities = {(abv, city): pop for abv, city, pop in df_crime.loc[df_crime["AGENCY"] == "Cities", ['STATEABV', "LOCATION", "POPULATION"]].values}
# (state abbreviation, lower-cased county name) pairs that report as counties
crime_counties = {(abv, city) for abv, city 
                  in df_crime.loc[df_crime["AGENCY"].isin(["Metropolitan Counties", "Nonmetropolitan Counties"]), ['STATEABV', "LOCATION"]].values}

city2county = geog[["CITYNAME", "COUNTYNAME", "STATEABV", "COUNTY"]].drop_duplicates()

diffs = defaultdict(int)

# Rows with a missing name cannot be matched (and `.lower()` would fail on NaN),
# so they are dropped before iterating.
for city, county, state in city2county[["CITYNAME", "COUNTYNAME", "STATEABV"]].drop_duplicates().dropna().values:
    pop = crime_cities.get((state, city.lower()))
    # Skip cities the FBI data does not list or lists without a population:
    # adding NaN would turn the whole county total into NaN, which the fillna
    # below would then reset to 0, discarding every other city's contribution.
    if pd.notna(pop):
        diffs[(state, county)] += pop

diffs = pd.DataFrame([[st, county.lower(), x] for (st, county), x in diffs.items()],
                     columns=["STATEABV", "LOCATION", "diff"])

df_crime = df_crime.merge(diffs, on=["STATEABV", "LOCATION"], how="left")
# Assign the result back instead of `df_crime['diff'].fillna(0, inplace=True)`:
# the chained in-place form acts on a temporary and is a no-op under pandas
# Copy-on-Write (and emits a FutureWarning on pandas 2.x).
df_crime['diff'] = df_crime['diff'].fillna(0)

In [35]:
# US Census county population estimates. The numeric county code is whatever
# follows "US" in the `ucgid` identifier.
census_county_pop = pd.read_csv('raw_data/census/census_data_county_raw.csv')[["ucgid", "B01001_001E"]]
census_county_pop = census_county_pop.assign(
    COUNTY=lambda frame: frame["ucgid"].map(lambda ucgid: int(ucgid.split("US")[1]))
)

# Attach county name / state abbreviation and lower-case the name so it lines up
# with the lower-cased LOCATION column of the crime table.
df_pop = (
    city2county[["COUNTY", "COUNTYNAME", "STATEABV"]]
    .merge(census_county_pop, on="COUNTY")
    .rename(columns={"B01001_001E": "COUNTYPOP", "COUNTYNAME": "LOCATION"})
    .assign(LOCATION=lambda frame: frame["LOCATION"].str.lower())
    .drop_duplicates()
)

# merge county estimates
df_crime = df_crime.merge(df_pop, on=["LOCATION", "STATEABV"], how='left')

# Estimated population: census county population minus the city populations
# recorded in "diff". Non-positive estimates are treated as missing.
county_minus_cities = df_crime["COUNTYPOP"] - df_crime["diff"]
df_crime['POP_EST'] = county_minus_cities.where(county_minus_cities > 0)

In [36]:
# Keep the FBI-reported POPULATION where it is present; rows where it is missing
# fall back to the census-derived POP_EST column.
df_crime["POPULATION"] = df_crime["POPULATION"].combine_first(df_crime["POP_EST"])

In [37]:
# Per-capita rate for every offense category, using the completed populations.
for offense_category in ('Total Offenses', 'Crimes Against Persons',
                         'Crimes Against Property', 'Crimes Against Society'):
    df_crime[f'{offense_category} Rate'] = df_crime[offense_category] / df_crime["POPULATION"]

In [48]:
# Keep only the columns needed downstream.
df_crime = df_crime[['AGENCY', "LOCATION", 
                     'Total Offenses', 'Crimes Against Persons', 'Crimes Against Property', 'Crimes Against Society', "POPULATION",
                     'Total Offenses Rate', 'Crimes Against Persons Rate', 'Crimes Against Property Rate', 'Crimes Against Society Rate', 
                     'STATENAME', 'STATE', 'STATEABV']]

cols = ["STATE", "LOCATION", "POPULATION",
        'Total Offenses', 'Crimes Against Persons', 'Crimes Against Property', 'Crimes Against Society',
        'Total Offenses Rate', 'Crimes Against Persons Rate', 'Crimes Against Property Rate', 'Crimes Against Society Rate']
m = df_crime["AGENCY"] == "Cities"

# LOCATION was lower-cased when the FBI tables were loaded, so the geography
# names must be lower-cased too before they are used as join keys; joining on
# the raw CITYNAME / COUNTYNAME silently matches nothing whenever those names
# are not already lower case. The temporary key columns are dropped again so
# the resulting columns are unchanged.
geog_keyed = geog.assign(
    _CITY_KEY=geog["CITYNAME"].str.lower(),
    _COUNTY_KEY=geog["COUNTYNAME"].str.lower(),
)

# City agencies join on the city name, county agencies on the county name.
# City matches are concatenated first so they win the de-duplication below.
crime_data = pd.concat([
    geog_keyed.merge(df_crime.loc[m, cols], left_on=["STATE", "_CITY_KEY"], right_on=["STATE", "LOCATION"]),
    geog_keyed.merge(df_crime.loc[~m, cols], left_on=["STATE", "_COUNTY_KEY"], right_on=["STATE", "LOCATION"]),
]).drop(columns=["_CITY_KEY", "_COUNTY_KEY"]).dropna()

# Cities come first, so dropping duplicate (TRACT, ZIP, COUNTY) rows with
# keep='first' makes county-level data defer to city-level data.
crime_data = crime_data.drop_duplicates(["TRACT", "ZIP", "COUNTY"], keep='first').reset_index(drop=True)

In [49]:
# Now derive the ZCTA-level table (county and tract tables follow).

# ZCTA: one row per ZIP. City rows precede county rows in crime_data, so keeping
# the first occurrence uses the city rates when the ZIP is in a reporting city
# and the county rates otherwise; ZIPs with neither are simply absent.
zcta_columns = ["ZIP", 'Total Offenses Rate', 'Crimes Against Persons Rate',
                'Crimes Against Property Rate', 'Crimes Against Society Rate']
crime_data_zcta = crime_data.drop_duplicates("ZIP", keep='first').loc[:, zcta_columns]

In [51]:
tract_rate_columns = ['Total Offenses Rate', 'Crimes Against Persons Rate',
                      'Crimes Against Property Rate', 'Crimes Against Society Rate']


def tract_weighted_rates(tract_rows):
    """Return the TZ_RATIO-weighted mean of each rate column for one tract.

    Rows containing any missing value are ignored. Returns None when the
    remaining weights do not add up to a positive number.
    """
    complete_rows = tract_rows.dropna()
    weights = complete_rows["TZ_RATIO"]
    weight_total = sum(weights)
    if not weight_total > 0:
        return None
    return [sum(complete_rows[rate_column] * weights) / weight_total
            for rate_column in tract_rate_columns]


# One output row per tract that has a usable weighted average.
tract_records = []
for tract_id, tract_rows in crime_data.groupby("TRACT"):
    weighted_rates = tract_weighted_rates(tract_rows)
    if weighted_rates is not None:
        tract_records.append([tract_id, *weighted_rates])

crime_data_tract = pd.DataFrame(tract_records, columns=["TRACT", *tract_rate_columns])

In [56]:
# County-level table: add up the offense counts of every agency (city or county)
# matched to a county, then divide by the census county population.
county_key_columns = ["STATE", "COUNTY", "COUNTYNAME"]
offense_count_columns = ['Total Offenses', 'Crimes Against Persons',
                         'Crimes Against Property', 'Crimes Against Society']

crime_data_county = (
    # crime_data repeats each agency's counts on every tract/ZIP row it matched,
    # so collapse to one row per agency before summing. LOCATION (the agency
    # name) must be part of the de-duplication key: without it, two different
    # agencies in the same county that report identical counts would be merged
    # into a single row and the county total under-counted.
    crime_data[county_key_columns + ["LOCATION"] + offense_count_columns]
    .drop_duplicates()
    .groupby(county_key_columns)[offense_count_columns]
    .sum()
    .reset_index()
    # De-duplicate the population lookup so the merge cannot multiply rows.
    .merge(df_pop[["COUNTY", "COUNTYPOP"]].drop_duplicates(), on="COUNTY")
)

for offense_column in offense_count_columns:
    crime_data_county[f'{offense_column} Rate'] = (
        crime_data_county[offense_column] / crime_data_county["COUNTYPOP"]
    )

# Keep identifiers + rates only, and drop counties without a computable rate.
crime_data_county = crime_data_county[
    county_key_columns + [f'{offense_column} Rate' for offense_column in offense_count_columns]
].dropna()

In [58]:
# Write one CSV per geography level, without the DataFrame index.
outputs_by_level = {
    "county": crime_data_county,
    "zcta": crime_data_zcta,
    "tract": crime_data_tract,
}
for level_name, level_frame in outputs_by_level.items():
    level_frame.to_csv(f"derived_data/{level_name}/fbi_cde.csv", index=False)