# Hate Crime Data Extraction and Transformation <a name="top"/>

1. <a href=#extract>Data Extraction</a> - Import data from csv and clean for transformation.
2. <a href=#transform>Data Transformation</a> - Create dataframes for database tables and export to csv files.

In [1]:
# Dependencies
import pandas as pd
import numpy as np
import requests
from pathlib import Path

## Data Extraction <a name="extract"/> <a href=#top style="text-decoration: none;"><abbr title="Back to Start">▲</abbr></a>

In [2]:
# Only run this code if you've downloaded the csv file directly from FBI website at
#    https://cde.ucr.cjis.gov/LATEST/webapp/#/pages/downloads#datasets
#    The csv file is not in the github repo given it is greater than size limit of 50mg

# path = Path('source_data/fbi_hate_crime_data.csv')
# crime_data_df = pd.read_csv(path)

In [3]:
# Extract hate crime data from xlxs
path = Path('source_data/fbi_hate_crime_data.xlsx')
crime_data_df = pd.read_excel(path)

In [4]:
# Display sample data
print(len(crime_data_df))
crime_data_df.head()

226328


Unnamed: 0,incident_id,data_year,ori,pug_agency_name,pub_agency_unit,agency_type_name,state_abbr,state_name,division_name,region_name,...,offender_race,offender_ethnicity,victim_count,offense_name,total_individual_victims,location_name,bias_desc,victim_types,multiple_offense,multiple_bias
0,43,1991,AR0350100,Pine Bluff,,City,AR,Arkansas,West South Central,South,...,Black or African American,Not Specified,1,Aggravated Assault,1.0,Residence/Home,Anti-Black or African American,Individual,S,S
1,44,1991,AR0350100,Pine Bluff,,City,AR,Arkansas,West South Central,South,...,Black or African American,Not Specified,2,Aggravated Assault;Destruction/Damage/Vandalis...,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-White,Individual,M,S
2,45,1991,AR0600300,North Little Rock,,City,AR,Arkansas,West South Central,South,...,Black or African American,Not Specified,2,Aggravated Assault;Murder and Nonnegligent Man...,2.0,Residence/Home,Anti-White,Individual,M,S
3,46,1991,AR0600300,North Little Rock,,City,AR,Arkansas,West South Central,South,...,Black or African American,Not Specified,1,Intimidation,1.0,Residence/Home,Anti-White,Individual,S,S
4,47,1991,AR0670000,Sevier,,County,AR,Arkansas,West South Central,South,...,White,Not Specified,1,Intimidation,1.0,School/College,Anti-Black or African American,Individual,S,S


In [5]:
crime_data_df.dtypes

incident_id                              int64
data_year                                int64
ori                                     object
pug_agency_name                         object
pub_agency_unit                         object
agency_type_name                        object
state_abbr                              object
state_name                              object
division_name                           object
region_name                             object
population_group_code                   object
population_group_description            object
incident_date                   datetime64[ns]
adult_victim_count                     float64
juvenile_victim_count                  float64
total_offender_count                     int64
adult_offender_count                   float64
juvenile_offender_count                float64
offender_race                           object
offender_ethnicity                      object
victim_count                             int64
offense_name 

## Data Transformation<a name="transform"/> <a href=#top style="text-decoration: none;"><abbr title="Back to Start">▲</abbr></a>

<a href=#data_cleaning>Initial Data Cleaning</a>

<a href=#data_org>Data Organization and Normalization</a>
- <a href=#incident_table>Incident table</a>
- <a href=#jurisdiction_table>Jurisdiction table</a>
- <a href=#state_table>State table</a>
- <a href=#race_table>Race Table</a>
- <a href=#bias_tables>Bias Tables</a>
- <a href=#offense_tables>Offense Tables</a>
- <a href=#victim_tables>Victim Tables</a>
- <a href=#location_table>Location Table</a>

### Initial Data Cleaning<a name="data_cleaning"/> <a href=#transform style="text-decoration: none;"><abbr title="Back to Data Transformation">▲</abbr></a>

In [6]:
# Remove data before 2009
crime_data_df = crime_data_df[crime_data_df['data_year'] > 2008]

# Remove data for 2021 due to underreported incidents from transition to NIBRS-only crime data collection 
crime_data_df = crime_data_df[crime_data_df['data_year'] < 2021]

# Display sample data
print(len(crime_data_df))
crime_data_df.head()

82102


Unnamed: 0,incident_id,data_year,ori,pug_agency_name,pub_agency_unit,agency_type_name,state_abbr,state_name,division_name,region_name,...,offender_race,offender_ethnicity,victim_count,offense_name,total_individual_victims,location_name,bias_desc,victim_types,multiple_offense,multiple_bias
136923,141003,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,...,White,Not Specified,1,Simple Assault,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual,S,S
136924,141004,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,...,Unknown,Not Specified,1,Intimidation,1.0,Residence/Home,"Anti-Lesbian, Gay, Bisexual, or Transgender (M...",Individual,S,S
136925,141005,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,...,White,Not Specified,1,Robbery,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual,S,S
136926,141006,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,...,Multiple,Not Specified,4,Aggravated Assault,4.0,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual,S,S
136927,141007,2009,AK0010100,Anchorage,,City,AK,Alaska,Pacific,West,...,Asian,Not Specified,1,Robbery,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual,S,S


In [7]:
# Check for columns with missing data
crime_data_df.isnull().sum()

incident_id                         0
data_year                           0
ori                                 0
pug_agency_name                     0
pub_agency_unit                 79350
agency_type_name                    0
state_abbr                          0
state_name                          0
division_name                       0
region_name                         0
population_group_code             287
population_group_description      287
incident_date                       0
adult_victim_count              31203
juvenile_victim_count           33075
total_offender_count                0
adult_offender_count            33826
juvenile_offender_count         33833
offender_race                       0
offender_ethnicity                  0
victim_count                        0
offense_name                        0
total_individual_victims         3479
location_name                       0
bias_desc                           0
victim_types                        0
multiple_off

In [8]:
# Drop columns with missing data
crime_data_df.drop(columns=['pub_agency_unit','population_group_code','population_group_description',
                            'adult_victim_count', 'juvenile_victim_count', 'adult_offender_count',
                            'juvenile_offender_count', 'total_individual_victims'], inplace=True)

# Drop unnecessary columns
crime_data_df.drop(columns=['ori','pug_agency_name','offender_ethnicity', 'multiple_offense', 'multiple_bias'],
                   inplace=True)

# Display dataframe
crime_data_df.head()

Unnamed: 0,incident_id,data_year,agency_type_name,state_abbr,state_name,division_name,region_name,incident_date,total_offender_count,offender_race,victim_count,offense_name,location_name,bias_desc,victim_types
136923,141003,2009,City,AK,Alaska,Pacific,West,2009-02-17,1,White,1,Simple Assault,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual
136924,141004,2009,City,AK,Alaska,Pacific,West,2009-06-30,0,Unknown,1,Intimidation,Residence/Home,"Anti-Lesbian, Gay, Bisexual, or Transgender (M...",Individual
136925,141005,2009,City,AK,Alaska,Pacific,West,2009-08-05,2,White,1,Robbery,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual
136926,141006,2009,City,AK,Alaska,Pacific,West,2009-08-28,3,Multiple,4,Aggravated Assault,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual
136927,141007,2009,City,AK,Alaska,Pacific,West,2009-08-28,2,Asian,1,Robbery,Highway/Road/Alley/Street/Sidewalk,Anti-American Indian or Alaska Native,Individual


In [9]:
# Check for duplicate incidents
crime_data_df[crime_data_df['incident_id'].duplicated(keep=False)]

Unnamed: 0,incident_id,data_year,agency_type_name,state_abbr,state_name,division_name,region_name,incident_date,total_offender_count,offender_race,victim_count,offense_name,location_name,bias_desc,victim_types


In [10]:
# Remove row with unknown bias (discovered when creating bias tables)
drop_df = crime_data_df[crime_data_df['bias_desc'] == "Unknown (offender's motivation not known)"]
crime_data_df.drop(drop_df.index, inplace = True)

# Display rows removed
drop_df

Unnamed: 0,incident_id,data_year,agency_type_name,state_abbr,state_name,division_name,region_name,incident_date,total_offender_count,offender_race,victim_count,offense_name,location_name,bias_desc,victim_types
199756,373423,2018,City,PA,Pennsylvania,Middle Atlantic,Northeast,2018-11-23,1,White,1,Destruction/Damage/Vandalism of Property,Highway/Road/Alley/Street/Sidewalk,Unknown (offender's motivation not known),Government


In [11]:
# Remove rows where state is federal government or Guam
drop_df = crime_data_df[(crime_data_df['state_abbr'] == 'FS') | (crime_data_df['state_abbr'] == 'GM')]
crime_data_df.drop(drop_df.index, inplace = True)

# Display rows removed
drop_df

Unnamed: 0,incident_id,data_year,agency_type_name,state_abbr,state_name,division_name,region_name,incident_date,total_offender_count,offender_race,victim_count,offense_name,location_name,bias_desc,victim_types
176223,180587,2015,Other,GM,Guam,U.S. Territories,U.S. Territories,2015-06-14,1,Native Hawaiian or Other Pacific Islander,3,Aggravated Assault,Arena/Stadium/Fairgrounds/Coliseum,Anti-Native Hawaiian or Other Pacific Islander,Individual
188763,1436756,2017,Federal,FS,Federal,Other,Other,2017-09-18,1,White,1,Intimidation,Other/Unknown,Anti-Other Race/Ethnicity/Ancestry,Individual
188764,1436759,2017,Federal,FS,Federal,Other,Other,2017-06-01,1,White,1,Simple Assault,Highway/Road/Alley/Street/Sidewalk,Anti-Black or African American,Individual
188765,1436760,2017,Federal,FS,Federal,Other,Other,2017-11-29,0,Not Specified,1,Destruction/Damage/Vandalism of Property,Park/Playground,"Anti-Multiple Races, Group",Government
195963,410170,2018,Federal,FS,Federal,Other,Other,2018-03-21,0,Unknown,1,Intimidation,Commercial/Office Building,Anti-Black or African American,Individual
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211685,1448539,2020,Federal,FS,Federal,Other,Other,2020-11-07,1,White,2,Intimidation,Residence/Home,Anti-Jewish,Individual
211686,1449206,2020,Federal,FS,Federal,Other,Other,2020-03-29,1,White,1,Aggravated Assault,Residence/Home,Anti-Black or African American;Anti-Transgender,Individual
211687,1449242,2020,Federal,FS,Federal,Other,Other,2020-11-28,1,Black or African American,1,Intimidation,Residence/Home,Anti-Black or African American,Individual
211688,1449316,2020,Federal,FS,Federal,Other,Other,2020-12-28,1,White,1,Arson,Church/Synagogue/Temple/Mosque,Anti-Black or African American,Religious Organization


### Data Organization and Normalization<a name="data_org"/> <a href=#transform style="text-decoration: none;"><abbr title="Back to Data Transformation">▲</abbr></a>
The data was normalized to create a dataframe for each of the following tables to load into the database.  

### Incident Table<a name="incident_table"/> <a href=#transform style="text-decoration: none;"><abbr title="Back to Data Transformation">▲</abbr></a>

In [12]:
# Create copy of crime dataframe for incidents
incidents_df = crime_data_df[['incident_id', 'data_year', 'incident_date', 'agency_type_name',
                              'state_abbr', 'state_name', 'division_name','region_name',
                              'total_offender_count', 'offender_race', 'victim_count']].copy()

# Display dataframe
incidents_df

Unnamed: 0,incident_id,data_year,incident_date,agency_type_name,state_abbr,state_name,division_name,region_name,total_offender_count,offender_race,victim_count
136923,141003,2009,2009-02-17,City,AK,Alaska,Pacific,West,1,White,1
136924,141004,2009,2009-06-30,City,AK,Alaska,Pacific,West,0,Unknown,1
136925,141005,2009,2009-08-05,City,AK,Alaska,Pacific,West,2,White,1
136926,141006,2009,2009-08-28,City,AK,Alaska,Pacific,West,3,Multiple,4
136927,141007,2009,2009-08-28,City,AK,Alaska,Pacific,West,2,Asian,1
...,...,...,...,...,...,...,...,...,...,...,...
219020,1440762,2020,2020-12-05,County,WY,Wyoming,Mountain,West,1,White,1
219021,1441769,2020,2020-11-07,City,WY,Wyoming,Mountain,West,0,Not Specified,1
219022,1442136,2020,2020-05-10,City,WY,Wyoming,Mountain,West,1,White,1
219023,1444656,2020,2020-10-16,City,WY,Wyoming,Mountain,West,1,Black or African American,1


### Jurisdication Table<a name="jurisdiction_table"/> <a href=#transform style="text-decoration: none;"><abbr title="Back to Data Transformation">▲</abbr></a>

In [13]:
jurisdiction_df = pd.DataFrame(crime_data_df['agency_type_name'].unique(), columns=['jurisdiction'])
jurisdiction_df['jurisdiction_id'] = jurisdiction_df.index
jurisdiction_df = jurisdiction_df[['jurisdiction_id', 'jurisdiction']]

# Display dataframe
jurisdiction_df

Unnamed: 0,jurisdiction_id,jurisdiction
0,0,City
1,1,County
2,2,University or College
3,3,Other
4,4,Other State Agency
5,5,State Police
6,6,Tribal
7,7,Federal


In [14]:
# Reorder jurisdictions
jurisdiction_df['jurisdiction_id'].replace({1:1, 2:5, 3:7, 4:3, 5:2, 6:6, 7:4}, inplace=True)
jurisdiction_df = jurisdiction_df.sort_values('jurisdiction_id')

# Determine maximum column size for database schema
# Code Reference: https://stackoverflow.com/questions/21295334/find-length-of-longest-string-in-pandas-dataframe-column
print(f"Max column size - jurisdiction: {jurisdiction_df['jurisdiction'].map(lambda x: len(x)).max()}")

# Display dataframe
jurisdiction_df

Max column size - jurisdiction: 21


Unnamed: 0,jurisdiction_id,jurisdiction
0,0,City
1,1,County
5,2,State Police
4,3,Other State Agency
7,4,Federal
2,5,University or College
6,6,Tribal
3,7,Other


In [15]:
# Export dataframe to csv 
jurisdiction_df.to_csv('transformed_data/jurisdiction.csv', index=False)

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv


In [16]:
# Add jurisdiction_id to incidents dataframe 
incidents_v1_df = incidents_df.merge(jurisdiction_df, left_on='agency_type_name', right_on='jurisdiction', how='left')
incidents_v1_df.drop(columns=['agency_type_name', 'jurisdiction'], inplace=True)

# Display dataframe
incidents_v1_df

Unnamed: 0,incident_id,data_year,incident_date,state_abbr,state_name,division_name,region_name,total_offender_count,offender_race,victim_count,jurisdiction_id
0,141003,2009,2009-02-17,AK,Alaska,Pacific,West,1,White,1,0
1,141004,2009,2009-06-30,AK,Alaska,Pacific,West,0,Unknown,1,0
2,141005,2009,2009-08-05,AK,Alaska,Pacific,West,2,White,1,0
3,141006,2009,2009-08-28,AK,Alaska,Pacific,West,3,Multiple,4,0
4,141007,2009,2009-08-28,AK,Alaska,Pacific,West,2,Asian,1,0
...,...,...,...,...,...,...,...,...,...,...,...
81661,1440762,2020,2020-12-05,WY,Wyoming,Mountain,West,1,White,1,1
81662,1441769,2020,2020-11-07,WY,Wyoming,Mountain,West,0,Not Specified,1,0
81663,1442136,2020,2020-05-10,WY,Wyoming,Mountain,West,1,White,1,0
81664,1444656,2020,2020-10-16,WY,Wyoming,Mountain,West,1,Black or African American,1,0


### State Table <a name="state_table"/> <a href=#transform style="text-decoration: none;"><abbr title="Back to Data Transformation">▲</abbr></a>

In [17]:
# Create dataframe for states
state_df = incidents_df.loc[:, ['state_abbr', 'state_name', 'division_name','region_name']
                             ].groupby('state_abbr').first()
state_df = state_df.rename(columns={'state_name': 'state', 'division_name': 'division',
                                              'region_name': 'region'})

# Determine maximum column size for database schema
print(f"Max column size - state_name: {state_df['state'].map(lambda x: len(x)).max()}")
print(f"Max column size - division_name: {state_df['division'].map(lambda x: len(x)).max()}")
print(f"Max column size - region_name: {state_df['region'].map(lambda x: len(x)).max()}")

# Display dataframe
print(len(state_df))
state_df

Max column size - state_name: 20
Max column size - division_name: 18
Max column size - region_name: 9
51


Unnamed: 0_level_0,state,division,region
state_abbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,Alaska,Pacific,West
AL,Alabama,East South Central,South
AR,Arkansas,West South Central,South
AZ,Arizona,Mountain,West
CA,California,Pacific,West
CO,Colorado,Mountain,West
CT,Connecticut,New England,Northeast
DC,District of Columbia,South Atlantic,South
DE,Delaware,South Atlantic,South
FL,Florida,South Atlantic,South


In [18]:
# Export states dataframe to csv 
state_df.to_csv('transformed_data/state.csv')

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv


In [19]:
# Drop state, division and region from incidents dataframe
incidents_v2_df = incidents_v1_df.drop(columns=['state_name', 'division_name','region_name'])

# Display dataframe
incidents_v2_df

Unnamed: 0,incident_id,data_year,incident_date,state_abbr,total_offender_count,offender_race,victim_count,jurisdiction_id
0,141003,2009,2009-02-17,AK,1,White,1,0
1,141004,2009,2009-06-30,AK,0,Unknown,1,0
2,141005,2009,2009-08-05,AK,2,White,1,0
3,141006,2009,2009-08-28,AK,3,Multiple,4,0
4,141007,2009,2009-08-28,AK,2,Asian,1,0
...,...,...,...,...,...,...,...,...
81661,1440762,2020,2020-12-05,WY,1,White,1,1
81662,1441769,2020,2020-11-07,WY,0,Not Specified,1,0
81663,1442136,2020,2020-05-10,WY,1,White,1,0
81664,1444656,2020,2020-10-16,WY,1,Black or African American,1,0


### Race Table <a name="race_table"/> <a href=#transform style="text-decoration: none;"><abbr title="Back to Data Transformation">▲</abbr></a>

In [20]:
# Create dataframe for race and rename index
race_df = pd.DataFrame(crime_data_df['offender_race'].unique(), columns=['race'])
race_df['race_id'] = race_df.index
race_df = race_df[['race_id', 'race']]

# Display dataframe
race_df

Unnamed: 0,race_id,race
0,0,White
1,1,Unknown
2,2,Multiple
3,3,Asian
4,4,American Indian or Alaska Native
5,5,Black or African American
6,6,Native Hawaiian or Other Pacific Islander
7,7,Not Specified


In [21]:
# Order rows to match US census ordering
race_df['race_id'].replace({5:1, 4:2, 6:4, 2:5, 1:6, 7:6}, inplace=True)
race_df = race_df.sort_values('race_id')

# Display dataframe
race_df

Unnamed: 0,race_id,race
0,0,White
5,1,Black or African American
4,2,American Indian or Alaska Native
3,3,Asian
6,4,Native Hawaiian or Other Pacific Islander
2,5,Multiple
1,6,Unknown
7,6,Not Specified


In [22]:
# Add race_id to dataframe 
incidents_v2_df = incidents_v1_df.merge(race_df, left_on='offender_race', right_on='race', how='left')
incidents_v2_df.drop(columns=['offender_race', 'race'], inplace=True)
incidents_v2_df.rename(columns={'race_id': 'offender_race_id'}, inplace=True)

# Display dataframe
incidents_v2_df

Unnamed: 0,incident_id,data_year,incident_date,state_abbr,state_name,division_name,region_name,total_offender_count,victim_count,jurisdiction_id,offender_race_id
0,141003,2009,2009-02-17,AK,Alaska,Pacific,West,1,1,0,0
1,141004,2009,2009-06-30,AK,Alaska,Pacific,West,0,1,0,6
2,141005,2009,2009-08-05,AK,Alaska,Pacific,West,2,1,0,0
3,141006,2009,2009-08-28,AK,Alaska,Pacific,West,3,4,0,5
4,141007,2009,2009-08-28,AK,Alaska,Pacific,West,2,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...
81661,1440762,2020,2020-12-05,WY,Wyoming,Mountain,West,1,1,1,0
81662,1441769,2020,2020-11-07,WY,Wyoming,Mountain,West,0,1,0,6
81663,1442136,2020,2020-05-10,WY,Wyoming,Mountain,West,1,1,0,0
81664,1444656,2020,2020-10-16,WY,Wyoming,Mountain,West,1,1,0,1


In [23]:
# Combine Unknown or Not Specified 
race_final_df = race_df.copy()
race_final_df.loc[1, 'race'] = 'Unknown or Not Specified'
race_final_df = race_final_df.drop([7])

# Determine maximum column size for database schema
print(f"Max column size - race_id: {race_df['race'].map(lambda x: len(x)).max()}")

# Display dataframe
race_final_df

Max column size - race_id: 41


Unnamed: 0,race_id,race
0,0,White
5,1,Black or African American
4,2,American Indian or Alaska Native
3,3,Asian
6,4,Native Hawaiian or Other Pacific Islander
2,5,Multiple
1,6,Unknown or Not Specified


In [24]:
# Export dataframe to csv 
race_final_df.to_csv('transformed_data/race.csv' , index=False)

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv


In [25]:
incidents_final_df = incidents_v2_df.rename(columns={'total_offender_count': 'offender_count', 'data_year': 'incident_year'})
incidents_final_df = incidents_final_df[['incident_id', 'incident_year', 'incident_date', 'jurisdiction_id', 'state_abbr',
                    'offender_race_id', 'offender_count', 'victim_count']]
incidents_final_df

Unnamed: 0,incident_id,incident_year,incident_date,jurisdiction_id,state_abbr,offender_race_id,offender_count,victim_count
0,141003,2009,2009-02-17,0,AK,0,1,1
1,141004,2009,2009-06-30,0,AK,6,0,1
2,141005,2009,2009-08-05,0,AK,0,2,1
3,141006,2009,2009-08-28,0,AK,5,3,4
4,141007,2009,2009-08-28,0,AK,3,2,1
...,...,...,...,...,...,...,...,...
81661,1440762,2020,2020-12-05,1,WY,0,1,1
81662,1441769,2020,2020-11-07,0,WY,6,0,1
81663,1442136,2020,2020-05-10,0,WY,0,1,1
81664,1444656,2020,2020-10-16,0,WY,1,1,1


In [26]:
# Export dataframe to csv 
incidents_final_df.to_csv('transformed_data/incident.csv', index=False)

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv


### Bias Tables <a name="bias_tables"/> <a href=#transform style="text-decoration: none;"><abbr title="Back to Data Transformation">▲</abbr></a>

In [27]:
# Create copy of crime dataframe for incident biases 
inc_bias_df = crime_data_df[['incident_id', 'bias_desc']].copy()
print(f'Number of records before explode: {len(inc_bias_df)}')

# Explode rows with multiple biases 
inc_bias_df.loc[:, 'bias_desc'] = inc_bias_df['bias_desc'].str.split(';')
inc_bias_df = inc_bias_df.explode('bias_desc', ignore_index=True)
inc_bias_df.rename(columns={'bias_desc' : 'bias'}, inplace=True)

# Display dataframe
print(f'Number of records after explode: {len(inc_bias_df)}')
inc_bias_df.head(10)

Number of records before explode: 81666
Number of records after explode: 82716


Unnamed: 0,incident_id,bias
0,141003,Anti-American Indian or Alaska Native
1,141004,"Anti-Lesbian, Gay, Bisexual, or Transgender (M..."
2,141005,Anti-American Indian or Alaska Native
3,141006,Anti-American Indian or Alaska Native
4,141007,Anti-American Indian or Alaska Native
5,141008,Anti-American Indian or Alaska Native
6,141009,Anti-American Indian or Alaska Native
7,141010,Anti-White
8,141011,Anti-Lesbian (Female)
9,136965,Anti-Black or African American


In [28]:
# Display biases
print(len(inc_bias_df['bias'].unique()))
inc_bias_df['bias'].unique()

34


array(['Anti-American Indian or Alaska Native',
       'Anti-Lesbian, Gay, Bisexual, or Transgender (Mixed Group)',
       'Anti-White', 'Anti-Lesbian (Female)',
       'Anti-Black or African American', 'Anti-Multiple Races, Group',
       'Anti-Gay (Male)', 'Anti-Jewish', 'Anti-Protestant',
       'Anti-Bisexual', 'Anti-Hispanic or Latino', 'Anti-Catholic',
       'Anti-Heterosexual', 'Anti-Atheism/Agnosticism',
       'Anti-Islamic (Muslim)', 'Anti-Mental Disability', 'Anti-Asian',
       'Anti-Other Race/Ethnicity/Ancestry', 'Anti-Other Religion',
       'Anti-Multiple Religions, Group', 'Anti-Physical Disability',
       'Anti-Gender Non-Conforming', 'Anti-Female', 'Anti-Transgender',
       'Anti-Native Hawaiian or Other Pacific Islander', 'Anti-Male',
       'Anti-Arab', "Anti-Jehovah's Witness",
       'Anti-Church of Jesus Christ', 'Anti-Buddhist', 'Anti-Sikh',
       'Anti-Other Christian', 'Anti-Hindu',
       'Anti-Eastern Orthodox (Russian, Greek, Other)'], dtype=object)

In [29]:
# Create dictionary with bias categories and biases
bias_dict = {'Race, Ethnicity or Ancestry': ['Anti-Black or African American', 'Anti-Hispanic or Latino', 'Anti-Arab',
                                             'Anti-Asian','Anti-American Indian or Alaska Native',
                                             'Anti-Native Hawaiian or Other Pacific Islander', 'Anti-White',
                                             'Anti-Other Race/Ethnicity/Ancestry', 'Anti-Multiple Races, Group'],
             'Religion': ['Anti-Jewish', 'Anti-Protestant', 'Anti-Other Religion','Anti-Islamic (Muslim)',
                          'Anti-Catholic', 'Anti-Multiple Religions, Group','Anti-Atheism/Agnosticism','Anti-Buddhist',
                          'Anti-Sikh', 'Anti-Other Christian', 'Anti-Hindu','Anti-Eastern Orthodox (Russian, Greek, Other)',
                          "Anti-Jehovah's Witness", 'Anti-Church of Jesus Christ'],
             'Disability': ['Anti-Physical Disability', 'Anti-Mental Disability'],
             'Gender': ['Anti-Female','Anti-Male'],
             'Gender Identity': ['Anti-Gender Non-Conforming','Anti-Transgender'],
             'Sexual Orientation': ['Anti-Gay (Male)','Anti-Lesbian (Female)', 'Anti-Bisexual','Anti-Heterosexual',
                                    'Anti-Lesbian, Gay, Bisexual, or Transgender (Mixed Group)']
            }

# Convert dictionary to an expanded list with 2 columns
bias_list = []
for key in bias_dict:
    for value in bias_dict[key]:
        bias_list.append([value, key])

# Create dataframe for biases
bias_df = pd.DataFrame(bias_list, columns=['bias', 'bias_category'])
bias_df['bias_id'] = bias_df.index
bias_df = bias_df[['bias_id', 'bias', 'bias_category']]

# Determine maximum column size for database schema
print(f'Number of records: {len(bias_df)}')
print(f"Max column size - bias: {bias_df['bias'].map(lambda x: len(x)).max()}")
print(f"Max column size - bias_category: {bias_df['bias_category'].map(lambda x: len(x)).max()}")

# Display dataframe
bias_df.head(15)

Number of records: 34
Max column size - bias: 57
Max column size - bias_category: 27


Unnamed: 0,bias_id,bias,bias_category
0,0,Anti-Black or African American,"Race, Ethnicity or Ancestry"
1,1,Anti-Hispanic or Latino,"Race, Ethnicity or Ancestry"
2,2,Anti-Arab,"Race, Ethnicity or Ancestry"
3,3,Anti-Asian,"Race, Ethnicity or Ancestry"
4,4,Anti-American Indian or Alaska Native,"Race, Ethnicity or Ancestry"
5,5,Anti-Native Hawaiian or Other Pacific Islander,"Race, Ethnicity or Ancestry"
6,6,Anti-White,"Race, Ethnicity or Ancestry"
7,7,Anti-Other Race/Ethnicity/Ancestry,"Race, Ethnicity or Ancestry"
8,8,"Anti-Multiple Races, Group","Race, Ethnicity or Ancestry"
9,9,Anti-Jewish,Religion


In [30]:
# Export dataframe to csv 
bias_df.to_csv('transformed_data/bias.csv', index=False)

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv


In [31]:
# Add bias_id to dataframe and prep for export
inc_bias_final_df = inc_bias_df.merge(bias_df, on='bias', how='left')
inc_bias_final_df.drop(columns=['bias', 'bias_category'], inplace=True)

# Issue with composite primary keys in sqlalchemy
inc_bias_final_df.index.name = 'id'

# Check if nulls
print(inc_bias_final_df[inc_bias_final_df['bias_id'].isna()].sum())

# Display dataframe
inc_bias_final_df

incident_id    0
bias_id        0
dtype: int64


Unnamed: 0_level_0,incident_id,bias_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,141003,4
1,141004,33
2,141005,4
3,141006,4
4,141007,4
...,...,...
82711,1440762,5
82712,1441769,4
82713,1442136,25
82714,1444656,6


In [32]:
# Export dataframe to csv 
inc_bias_final_df.to_csv('transformed_data/incident_bias.csv', index=False)

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv


### Offense Tables (offense, offense_category, incident_offense) <a name="offense_tables"/> <a href=#transform style="text-decoration: none;"><abbr title="Back to Data Transformation">▲</abbr></a>

In [33]:
# Create copy of crime dataframe for incident offenses 
inc_offense_df = crime_data_df[['incident_id', 'offense_name']].copy()
print(f'Number of records before explode: {len(inc_offense_df)}')

# Explode rows with multiple offenses 
inc_offense_df.loc[:, 'offense_name'] = inc_offense_df['offense_name'].str.split(';')
inc_offense_df = inc_offense_df.explode('offense_name', ignore_index=True)
inc_offense_df.rename(columns={'offense_name' : 'offense'}, inplace=True)

# Display dataframe
print(f'Number of records after explode: {len(inc_offense_df)}')
inc_offense_df.head(10)

Number of records before explode: 81666
Number of records after explode: 84555


Unnamed: 0,incident_id,offense
0,141003,Simple Assault
1,141004,Intimidation
2,141005,Robbery
3,141006,Aggravated Assault
4,141007,Robbery
5,141008,Simple Assault
6,141009,Simple Assault
7,141010,Simple Assault
8,141011,Aggravated Assault
9,136965,Intimidation


In [34]:
# Check counts for each offense
inc_offense_df.groupby(['offense']).count().sort_values('offense')

Unnamed: 0_level_0,incident_id
offense,Unnamed: 1_level_1
Aggravated Assault,8908
All Other Larceny,1379
Animal Cruelty,5
Arson,494
Assisting or Promoting Prostitution,6
Betting/Wagering,1
Bribery,3
Burglary/Breaking & Entering,1707
Counterfeiting/Forgery,119
Credit Card/Automated Teller Machine Fraud,110


In [35]:
# Create dataframe for offenses
offense_df = pd.DataFrame(inc_offense_df['offense'].unique(), columns=['offense'])
offense_df = offense_df.sort_values('offense').reset_index(drop=True)
offense_df['offense_id'] = offense_df.index
offense_df = offense_df[['offense_id', 'offense']]

# Display dataframe
offense_df.head(15)

Unnamed: 0,offense_id,offense
0,0,Aggravated Assault
1,1,All Other Larceny
2,2,Animal Cruelty
3,3,Arson
4,4,Assisting or Promoting Prostitution
5,5,Betting/Wagering
6,6,Bribery
7,7,Burglary/Breaking & Entering
8,8,Counterfeiting/Forgery
9,9,Credit Card/Automated Teller Machine Fraud


In [36]:
# Add offense_id to dataframe and prep for export
inc_offense_final_df = inc_offense_df.merge(offense_df, how='left', on='offense')
inc_offense_final_df.drop(columns=['offense'], inplace=True)

# Issue with composite primary keys in sqlalchemy
inc_offense_final_df.index.name = 'id'

# Display dataframe
inc_offense_final_df

Unnamed: 0_level_0,incident_id,offense_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,141003,38
1,141004,23
2,141005,35
3,141006,0
4,141007,35
...,...,...
84550,1440762,38
84551,1441769,38
84552,1442136,38
84553,1444656,0


In [37]:
# Export dataframe to csv 
inc_offense_final_df.to_csv('transformed_data/incident_offense.csv', index=False)

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv


In [38]:
# Import offense categories
path = Path('source_data/nibrs_group_a_offenses.csv')
offense_cat_df = pd.read_csv(path, header=0, names=['offense_category', 'code', 'offense', 'crimes_against'])

# Display dataframe
offense_cat_df.head(10)

Unnamed: 0,offense_category,code,offense,crimes_against
0,Animal Cruelty Offenses,720,Animal Cruelty,Society
1,Arson,200,Arson,Property
2,Assault Offenses,13A,Aggravated Assault,Person
3,Assault Offenses,13B,Simple Assault,Person
4,Assault Offenses,13C,Intimidation,Person
5,Bribery,510,Bribery,Property
6,Burglary/Breaking & Entering,220,Burglary/Breaking & Entering,Property
7,Commerce Violations,58A,Import Violations*,Society
8,Commerce Violations,58B,Export Violations*,Society
9,Commerce Violations,61A,Federal Liquor Offenses*,Society


In [39]:
# Clean up dataframe for merging

# Remove * in offense column and remove leading/trailing spaces
offense_cat_df['offense'] = offense_cat_df['offense'].str.replace('*', '')

# Add row for unspecified offenses
df = pd.DataFrame({'offense_category': ['Not Specified'], 'code': ['NA'], 'offense': ['Not Specified'],
                'crimes_against': ['Not Specified']})
offense_cat_df = pd.concat([offense_cat_df, df], ignore_index=True)

# Display dataframe
offense_cat_df.tail(5)

Unnamed: 0,offense_category,code,offense,crimes_against
67,Weapon Law Violations,520.0,Weapon Law Violations,Society
68,Weapon Law Violations,521.0,Violation of National Firearm Act of 1934,Society
69,Weapon Law Violations,522.0,Weapons of Mass Destruction,Society
70,Weapon Law Violations,526.0,Explosives,Society
71,Not Specified,,Not Specified,Not Specified


In [40]:
# Correct error in offenses dataframe
offense_df['offense'] = offense_df['offense'].str.replace('Gambling Equipment Violation',
                                                          'Gambling Equipment Violations')

In [41]:
# Add offense_id to dataframe and prep for export
offense_final_df = offense_df.merge(offense_cat_df, how='left', on='offense')
offense_final_df = offense_final_df[['offense_id', 'offense', 'code','offense_category', 'crimes_against']]

# Determine maximum column size for database schema
print(f"Max column size - offense: {offense_final_df['offense'].map(lambda x: len(x)).max()}")
print(f"Max column size - offense_category: {offense_final_df['offense_category'].map(lambda x: len(x)).max()}")
print(f"Max column size - crimes_against: {offense_final_df['crimes_against'].map(lambda x: len(x)).max()}")

# Display dataframe
offense_final_df

Max column size - offense: 43
Max column size - offense_category: 40
Max column size - crimes_against: 13


Unnamed: 0,offense_id,offense,code,offense_category,crimes_against
0,0,Aggravated Assault,13A,Assault Offenses,Person
1,1,All Other Larceny,23H,Larceny/Theft Offenses,Property
2,2,Animal Cruelty,720,Animal Cruelty Offenses,Society
3,3,Arson,200,Arson,Property
4,4,Assisting or Promoting Prostitution,40B,Prostitution Offenses,Society
5,5,Betting/Wagering,39A,Gambling Offenses,Society
6,6,Bribery,510,Bribery,Property
7,7,Burglary/Breaking & Entering,220,Burglary/Breaking & Entering,Property
8,8,Counterfeiting/Forgery,250,Counterfeiting/Forgery,Property
9,9,Credit Card/Automated Teller Machine Fraud,26B,Fraud Offenses,Property


In [42]:
# Export dataframe to csv 
offense_final_df.to_csv('transformed_data/offense.csv', index=False)

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv


### Victim Tables (victim_type, incidents_victim_type) <a name="victim_tables"/> <a href=#transform style="text-decoration: none;"><abbr title="Back to Data Transformation">▲</abbr></a>

In [43]:
# Create copy of crime dataframe for incident victim types 
inc_victim_df = crime_data_df[['incident_id', 'victim_types']].copy()
print(f'Number of records before explode: {len(inc_victim_df)}')

# Explode rows with multiple victim types 
inc_victim_df.loc[:, 'victim_types'] = inc_victim_df['victim_types'].str.split(';')
inc_victim_df = inc_victim_df.explode('victim_types', ignore_index=True)
inc_victim_df.rename(columns={'victim_types' : 'victim_type'}, inplace=True)

# Display dataframe
print(f'Number of records after explode: {len(inc_victim_df)}')
inc_victim_df.head(10)

Number of records before explode: 81666
Number of records after explode: 83043


Unnamed: 0,incident_id,victim_type
0,141003,Individual
1,141004,Individual
2,141005,Individual
3,141006,Individual
4,141007,Individual
5,141008,Individual
6,141009,Individual
7,141010,Individual
8,141011,Individual
9,136965,Individual


In [44]:
# Check counts for each offense
inc_victim_df.groupby(['victim_type']).count().sort_values(['incident_id'], ascending=False)

Unnamed: 0_level_0,incident_id
victim_type,Unnamed: 1_level_1
Individual,65040
Other,5593
Business,4996
Government,2972
Religious Organization,2280
Society/Public,1440
Unknown,430
Law Enforcement Officer,233
Financial Institution,59


In [45]:
# Create dataframe for victim types
victim_type_df = pd.DataFrame(inc_victim_df['victim_type'].unique(), columns=['victim_type'])
victim_type_df['victim_type_id'] = victim_type_df.index
victim_type_df = victim_type_df[['victim_type_id', 'victim_type']]

# Determine maximum column size for database schema
print(f"Max column size - victim_type: {victim_type_df['victim_type'].map(lambda x: len(x)).max()}")

victim_type_df.head(10)

Max column size - victim_type: 23


Unnamed: 0,victim_type_id,victim_type
0,0,Individual
1,1,Religious Organization
2,2,Government
3,3,Society/Public
4,4,Business
5,5,Other
6,6,Unknown
7,7,Financial Institution
8,8,Law Enforcement Officer


In [46]:
# Reorder victim_types
victim_type_df['victim_type_id'].replace({1:3, 2:2, 3:4, 4:1, 5:7, 6:8, 7:5, 8:6}, inplace=True)
victim_type_df = victim_type_df.sort_values('victim_type_id')
victim_type_df

Unnamed: 0,victim_type_id,victim_type
0,0,Individual
4,1,Business
2,2,Government
1,3,Religious Organization
3,4,Society/Public
7,5,Financial Institution
8,6,Law Enforcement Officer
5,7,Other
6,8,Unknown


In [47]:
# Export dataframe to csv 
victim_type_df.to_csv('transformed_data/victim_type.csv', index=False)

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv


In [48]:
# Add offense_id to dataframe and prep for export
inc_victim_final_df = inc_victim_df.merge(victim_type_df, on='victim_type', how='left')
inc_victim_final_df.drop(columns=['victim_type'], inplace=True)

# Issue with composite primary keys in sqlalchemy
inc_victim_final_df.index.name = 'id'

# Display dataframe
inc_victim_final_df

Unnamed: 0_level_0,incident_id,victim_type_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,141003,0
1,141004,0
2,141005,0
3,141006,0
4,141007,0
...,...,...
83038,1440762,0
83039,1441769,0
83040,1442136,0
83041,1444656,0


In [49]:
# Export dataframe to csv 
inc_victim_final_df.to_csv('transformed_data/incident_victim_type.csv', index=False)

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv


### Location Table <a name="location_table"/> <a href=#transform style="text-decoration: none;"><abbr title="Back to Data Transformation">▲</abbr></a>

In [50]:
# Create copy of crime dataframe for incident offenses 
inc_location_df = crime_data_df[['incident_id', 'location_name']].copy()
print(f'Number of records before explode: {len(inc_location_df)}')

# Explode rows with multiple offenses 
inc_location_df.loc[:, 'location_name'] = inc_location_df['location_name'].str.split(';')
inc_location_df = inc_location_df.explode('location_name', ignore_index=True)
inc_location_df.rename(columns={'location_name' : 'location'}, inplace=True)

# Determine maximum column size for database schema
print(f'Number of records after explode: {len(inc_location_df)}')

# Display dataframe
inc_location_df.head(10)

Number of records before explode: 81666
Number of records after explode: 81790


Unnamed: 0,incident_id,location
0,141003,Highway/Road/Alley/Street/Sidewalk
1,141004,Residence/Home
2,141005,Highway/Road/Alley/Street/Sidewalk
3,141006,Highway/Road/Alley/Street/Sidewalk
4,141007,Highway/Road/Alley/Street/Sidewalk
5,141008,Highway/Road/Alley/Street/Sidewalk
6,141009,Highway/Road/Alley/Street/Sidewalk
7,141010,Restaurant
8,141011,Highway/Road/Alley/Street/Sidewalk
9,136965,Residence/Home


In [51]:
# Check counts for each offense
inc_location_df.groupby(['location']).count().sort_values(['incident_id'], ascending=False)

Unnamed: 0_level_0,incident_id
location,Unnamed: 1_level_1
Residence/Home,24298
Highway/Road/Alley/Street/Sidewalk,14653
Other/Unknown,9724
Parking/Drop Lot/Garage,4723
School/College,3264
Church/Synagogue/Temple/Mosque,3119
School-Elementary/Secondary,2702
Restaurant,1806
Commercial/Office Building,1563
School-College/University,1518


In [52]:
# Create dataframe for locations
location_df = pd.DataFrame(inc_location_df['location'].unique(), columns=['location'])
location_df = location_df.sort_values('location').reset_index()
location_df['location_id'] = location_df.index
location_df = location_df[['location_id', 'location']]

# Determine maximum column size for database schema
print(f"Max column size - location: {location_df['location'].map(lambda x: len(x)).max()}")

# Display dataframe
location_df.head(10)

Max column size - location: 45


Unnamed: 0,location_id,location
0,0,ATM Separate from Bank
1,1,Abandoned/Condemned Structure
2,2,Air/Bus/Train Terminal
3,3,Amusement Park
4,4,Arena/Stadium/Fairgrounds/Coliseum
5,5,Auto Dealership New/Used
6,6,Bank/Savings and Loan
7,7,Bar/Nightclub
8,8,Camp/Campground
9,9,Church/Synagogue/Temple/Mosque


In [53]:
# Export dataframe to csv 
location_df.to_csv('transformed_data/location.csv', index=False)

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv


In [54]:
# Add location_id to dataframe and prep for export
inc_location_final_df = inc_location_df.merge(location_df, on='location', how='left')
inc_location_final_df.drop(columns=['location'], inplace=True)

# Issue with composite primary keys in sqlalchemy
inc_location_final_df.index.name = 'id'

# Display dataframe
inc_location_final_df

Unnamed: 0_level_0,incident_id,location_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,141003,24
1,141004,35
2,141005,24
3,141006,24
4,141007,24
...,...,...
81785,1440762,27
81786,1441769,7
81787,1442136,35
81788,1444656,35


In [55]:
# Export dataframe to csv 
inc_location_final_df.to_csv('transformed_data/incident_location.csv', index=False)

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv
