# Fourth Notebook: Calculate Deaths, Polish Datasets, Redact, & Apply Filter.

This notebook calculates the deaths in the 365 days before and after each officer was slain (DMFS Officers) or civilian was fatally shot (DMFS Civilians), with respect to both officers and civilians. It also polishes the dataframes, reordering columns and making names more intuitive and consistent; redacts DM-FS; and applies a filtering system to make it more user friendly.

# 0 | Imports

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter # Unique value ID

from thefuzz import fuzz # Fuzzy Search
import math
import os

# Generate random ID
import uuid
import random

In [None]:
# Never truncate rows or columns when a dataframe is displayed
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [None]:
import warnings
# Silences every FutureWarning raised from this point on.
# NOTE(review): this also hides deprecation notices from pandas itself - re-enable when upgrading pandas.
warnings.simplefilter(action='ignore', category=FutureWarning)

# 1 | Calculate (i) Cops that Civs Fatally shot and (ii) Civs that Cops Fatally Shot in both databases

### 1.1 | Aux Functions

In [None]:
# The value dict is a dictionary of lists: one list per metric (dead cops / dead civs, before / after).
# Each loop iteration appends one value to every list, so the lists end up row-aligned with the dataframe.

# Returns a brand-new value dict whose lists are all empty (calling it again "resets" the accumulator)
def initialize_value_dict():
    metric_names = (
        "dead_cops_365_days_before_all",
        "dead_cops_365_days_before_fatal_shootings",
        "dead_cops_365_days_after_all",
        "dead_cops_365_days_after_shootings",
        "dead_civs_365_days_before",
        "dead_civs_365_days_after",
    )
    return {metric: [] for metric in metric_names}


# Appends one row's worth of results to the global values_dict (exactly one value per metric list)
def add_to_value_dict(deaths_before, deaths_before_shootings, deaths_after, deaths_after_shootings, dead_civs_before, dead_civs_after):
    row_results = {
        "dead_cops_365_days_before_all": deaths_before,
        "dead_cops_365_days_before_fatal_shootings": deaths_before_shootings,
        "dead_cops_365_days_after_all": deaths_after,
        "dead_cops_365_days_after_shootings": deaths_after_shootings,
        "dead_civs_365_days_before": dead_civs_before,
        "dead_civs_365_days_after": dead_civs_after,
    }
    for metric, value in row_results.items():
        values_dict[metric].append(value)

### 1.2 | Loads Datasets

In [None]:
# Reads both penultimate databases from disk
DIR_INPUT = "res/3.1 - Penultimate Databases/"
df_civs = pd.read_csv(DIR_INPUT + "DMFS Civilians.csv")
df_cops = pd.read_csv(DIR_INPUT + "DMFS Officers.csv")

# Parses the date columns into pandas datetimes so they can be compared and shifted by Timedeltas
df_civs = df_civs.assign(date=pd.to_datetime(df_civs["date"]))
df_cops = df_cops.assign(DEATH_DATE=pd.to_datetime(df_cops["DEATH_DATE"]))

# Orders rows by agency identifier, then by date (earliest first within each agency).
# Civilians sort agencies ascending; officers sort agencies descending.
df_civs = df_civs.sort_values(by=["agency_responsible_1_ORI9", "date"], ascending=True)
df_cops = df_cops.sort_values(by=["ORI9", "DEATH_DATE"], ascending=[False, True])

### 1.3 | DMFS OFFICERS: Calculate Dead Cops and Dead Civs before and after Each Cop's Death

In [None]:
# Fresh accumulator for the per-row results computed in the loop that follows
values_dict = initialize_value_dict()

# DEAD COPS columns go directly after DEAD_COP_RANKS; -1 is the placeholder until real values are written
index = df_cops.columns.get_loc("DEAD_COP_RANKS")
dead_cop_columns = [
    "COPS_AGENCY_LOST_365_DAYS_BEFORE_DEATH_ALL",
    "COPS_AGENCY_LOST_365_DAYS_BEFORE_DEATH_FATAL_SHOOTINGS_ONLY",
    "COPS_AGENCY_LOST_365_DAYS_AFTER_DEATH",
    "COPS_AGENCY_LOST_365_DAYS_AFTER_DEATH_FATAL_SHOOTINGS_ONLY",
]
for offset, new_column in enumerate(dead_cop_columns, start=1):
    df_cops.insert(index + offset, new_column, -1)

# DEAD CIVS columns go directly after KILLER_CRIM_HISTORY (position looked up after the inserts above)
index = df_cops.columns.get_loc("KILLER_CRIM_HISTORY")
dead_civ_columns = [
    "CIVILIANS_FATALLY_SHOT_BY_AGENCY_365_DAYS_BEFORE_DEATH",
    "CIVILIANS_FATALLY_SHOT_BY_AGENCY_365_DAYS_AFTER_DEATH",
]
for offset, new_column in enumerate(dead_civ_columns, start=1):
    df_cops.insert(index + offset, new_column, -1)

# Shows just the header so the new column order can be checked
df_cops.head(0)

In [None]:
# Loops through each row of DMFS - Officers and, for that officer's agency (matched on ORI9), counts the officers
# the agency lost and the civilians it fatally shot in the 365 days before & after the officer's death.
# One value per metric is appended to values_dict for every row, so the lists stay row-aligned with df_cops.

# The measurable date range is loop-invariant, so it is built once instead of on every iteration
min_date = pd.to_datetime('2015-12-31') # The earliest date needed to measure a 365 day period before a death. Note the earliest date could be 1/1/2015.
max_date = pd.to_datetime('2020-01-01') # The latest date needed to measure a 365 day period after a death. Note the latest date could be 12/31/2020.
# NOTE(review): the checks below are strict (`<` / `>`), so a death dated exactly 12/31/2015 or 1/1/2020 is still
# measured, while the exclusion wording elsewhere describes those dates as excluded - confirm the intended boundary.

for i, ind_death in tqdm(df_cops.iterrows(), total=df_cops.shape[0]):
    death_date = ind_death["DEATH_DATE"]

    # Ignores agencies that don't have a valid identifier (ORI9 code)
    if ind_death["ORI9"] == "-1":
        add_to_value_dict(-1, -1, -1, -1, -1, -1)
        continue

    # Date Range
    date_one_year_before_death = death_date - pd.Timedelta(days=365)
    date_one_year_after_death = death_date + pd.Timedelta(days=365)

    # Filters databases to only include officer/civ deaths applicable to relevant agency
    df_cops_filtered = df_cops[(ind_death["ORI9"] == df_cops["ORI9"])] # List of officer deaths that the LEA experienced
    df_civs_filtered = df_civs[(ind_death["ORI9"] == df_civs["agency_responsible_1_ORI9"]) | (ind_death["ORI9"] == df_civs["agency_responsible_2_ORI9"]) |  # List of civilians the LEA fatally shot
                               (ind_death["ORI9"] == df_civs["agency_responsible_3_ORI9"]) | (ind_death["ORI9"] == df_civs["agency_responsible_4_ORI9"]) |
                               (ind_death["ORI9"] == df_civs["agency_responsible_5_ORI9"])]

    # Shorthands reused by both windows
    cop_dates = df_cops_filtered["DEATH_DATE"]
    civ_dates = df_civs_filtered["date"]
    cops_shot = df_cops_filtered["KILLERS_WEAPON"] == "gun"

    # Death Count 365 days BEFORE death: window is [death_date - 365 days, death_date)
    # Deaths before min_date can't have a full 365 day measurement period BEFORE death. Set vals to -1
    if death_date < min_date:
        dead_cops_before, dead_cops_before_shootings, dead_civs_before = -1, -1, -1
    else:
        # Civs have no per-row count column: each row is 1 dead civ, so the rows in the window are counted
        dead_civs_before = int(((civ_dates >= date_one_year_before_death) & (civ_dates < death_date)).sum())

        cops_in_window = (cop_dates >= date_one_year_before_death) & (cop_dates < death_date)
        dead_cops_before = df_cops_filtered.loc[cops_in_window, "DEAD_COP_COUNT"].sum()
        dead_cops_before_shootings = df_cops_filtered.loc[cops_in_window & cops_shot, "DEAD_COP_COUNT"].sum()

    # Death Count 365 days AFTER death: window is (death_date, death_date + 365 days]
    # Deaths after max_date can't have a full 365 day measurement period AFTER death. Set vals to -1
    if death_date > max_date:
        # All three "after" values must be reset here; otherwise the shootings count would silently carry over
        # from a previous row (or be undefined on the first row)
        dead_cops_after, dead_cops_after_shootings, dead_civs_after = -1, -1, -1
    else:
        dead_civs_after = int(((civ_dates > death_date) & (civ_dates <= date_one_year_after_death)).sum())

        cops_in_window = (cop_dates > death_date) & (cop_dates <= date_one_year_after_death)
        dead_cops_after = df_cops_filtered.loc[cops_in_window, "DEAD_COP_COUNT"].sum()
        dead_cops_after_shootings = df_cops_filtered.loc[cops_in_window & cops_shot, "DEAD_COP_COUNT"].sum()

    # Add all values to dictionary, to be added to dataframe
    add_to_value_dict(dead_cops_before, dead_cops_before_shootings, dead_cops_after, dead_cops_after_shootings, dead_civs_before, dead_civs_after)

In [None]:
# Copies the accumulated lists into their dataframe columns (dataframe column -> value dict key)
column_sources = {
    # Dead Cops
    "COPS_AGENCY_LOST_365_DAYS_BEFORE_DEATH_ALL": "dead_cops_365_days_before_all",
    "COPS_AGENCY_LOST_365_DAYS_BEFORE_DEATH_FATAL_SHOOTINGS_ONLY": "dead_cops_365_days_before_fatal_shootings",
    "COPS_AGENCY_LOST_365_DAYS_AFTER_DEATH": "dead_cops_365_days_after_all",
    "COPS_AGENCY_LOST_365_DAYS_AFTER_DEATH_FATAL_SHOOTINGS_ONLY": "dead_cops_365_days_after_shootings",
    # Dead Civs
    "CIVILIANS_FATALLY_SHOT_BY_AGENCY_365_DAYS_BEFORE_DEATH": "dead_civs_365_days_before",
    "CIVILIANS_FATALLY_SHOT_BY_AGENCY_365_DAYS_AFTER_DEATH": "dead_civs_365_days_after",
}
for column_name, metric in column_sources.items():
    df_cops[column_name] = values_dict[metric]

### 1.4 | DMFS CIVILIANS: Calculate Dead Cops and Dead Civs before and after Each Civilian's Death

In [None]:
# Adds the dead civs / dead cops placeholder columns for ONE responsible LEA slot of DMFS civs

def add_cols_to_df_civ(agency_responsible_col):
    # New columns are placed directly after that slot's FIPS_COUNTY column, in the order listed below
    anchor = df_civs.columns.get_loc(f"{agency_responsible_col}_FIPS_COUNTY")

    new_columns = [
        # Civilians the agency fatally shot
        f"civilians_fatally_shot_by_{agency_responsible_col}_365_days_before_death",
        f"civilians_fatally_shot_by_{agency_responsible_col}_365_days_after_death",
        # Officers the agency lost
        f"cops_from_{agency_responsible_col}_that_civillians_killed_365_days_before_death_for_all_deaths",
        f"cops_from_{agency_responsible_col}_that_civillians_killed_365_days_before_death_for_fatal_shootings_only",
        f"cops_from_{agency_responsible_col}_that_civillians_killed_365_days_after_death_for_all_deaths",
        f"cops_from_{agency_responsible_col}_that_civillians_killed_365_days_after_death_for_fatal_shootings_only",
    ]

    # -1 is the placeholder value until the real counts are written
    for offset, column_name in enumerate(new_columns, start=1):
        df_civs.insert(anchor + offset, column_name, -1)


In [None]:
# For each of the five "agency responsible" slots, walks every row of DMFS civs and counts, for the agency in that
# slot, the officers it lost and the civilians it fatally shot in the 365 days before & after the civilian's death.

# The measurable date range is loop-invariant, so it is built once instead of on every iteration
min_date = pd.to_datetime('2015-12-31') # The earliest date needed to measure a 365 day period before a death. Note the earliest date could be 1/1/2015.
max_date = pd.to_datetime('2020-01-01') # The latest date needed to measure a 365 day period after a death. Note the latest date could be 12/31/2020.
# NOTE(review): the checks below are strict (`<` / `>`), so a death dated exactly 12/31/2015 or 1/1/2020 is still
# measured, while the exclusion wording elsewhere describes those dates as excluded - confirm the intended boundary.

for agency_responsible_num in range(1, 6):

    # Print status
    agency_responsible_col = f"agency_responsible_{agency_responsible_num}"
    print(f"Calculating dead civs & dead cops for [{agency_responsible_col}]")

    # Crucial Inits
    add_cols_to_df_civ(agency_responsible_col) # Adds dead cops / dead civs columns to DM-FS
    values_dict = initialize_value_dict()

    # **************************************************************************************************************************
    # Loops through each row, focusing on agency_responsible_1 during first loop, agency_responsible_2 during second loop, etc.
    # **************************************************************************************************************************

    for i, ind_row in tqdm(df_civs.iterrows(), total=len(df_civs)):

        # Init key variables
        current_ORI = ind_row[f"{agency_responsible_col}_ORI9"]
        death_date = ind_row["date"]

        # Ignores agencies that don't have a valid identifier (ORI9 code)
        if current_ORI == "-1":
            add_to_value_dict(-1, -1, -1, -1, -1, -1)
            continue

        # Calculate Date Range
        date_one_year_before_death = death_date - pd.Timedelta(days=365)
        date_one_year_after_death = death_date + pd.Timedelta(days=365)

        # Filters databases to only include officer/civ deaths applicable to relevant agency
        df_cops_filtered = df_cops[(current_ORI == df_cops["ORI9"])] # List of officer deaths that the LEA experienced
        df_civs_filtered = df_civs[(current_ORI == df_civs["agency_responsible_1_ORI9"]) | (current_ORI == df_civs["agency_responsible_2_ORI9"]) |  # List of civilians the LEA fatally shot
                                   (current_ORI == df_civs["agency_responsible_3_ORI9"]) | (current_ORI == df_civs["agency_responsible_4_ORI9"]) |
                                   (current_ORI == df_civs["agency_responsible_5_ORI9"])]

        # Shorthands reused by both windows
        cop_dates = df_cops_filtered["DEATH_DATE"]
        civ_dates = df_civs_filtered["date"]
        countable_cops = df_cops_filtered["EXCLUDE_DEATH"] != True  # Officer deaths flagged for exclusion are not counted
        cops_shot = df_cops_filtered["KILLERS_WEAPON"] == "gun"

        # Death Count 365 days BEFORE death: window is [death_date - 365 days, death_date)
        # Deaths before min_date can't have a full 365 day measurement period BEFORE death. Set vals to -1
        if death_date < min_date:
            dead_cops_before, dead_cops_before_shootings, dead_civs_before = -1, -1, -1
        else:
            # Civs have no per-row count column: each row is 1 dead civ, so the rows in the window are counted
            dead_civs_before = int(((civ_dates >= date_one_year_before_death) & (civ_dates < death_date)).sum())

            cops_in_window = (cop_dates >= date_one_year_before_death) & (cop_dates < death_date) & countable_cops
            dead_cops_before = df_cops_filtered.loc[cops_in_window, "DEAD_COP_COUNT"].sum()
            dead_cops_before_shootings = df_cops_filtered.loc[cops_in_window & cops_shot, "DEAD_COP_COUNT"].sum()

        # Death Count 365 days AFTER death: window is (death_date, death_date + 365 days]
        # Deaths after max_date can't have a full 365 day measurement period AFTER death. Set vals to -1
        if death_date > max_date:
            # All three "after" values must be reset here; otherwise the shootings count would silently carry over
            # from a previous row (or be undefined on the first row)
            dead_cops_after, dead_cops_after_shootings, dead_civs_after = -1, -1, -1
        else:
            dead_civs_after = int(((civ_dates > death_date) & (civ_dates <= date_one_year_after_death)).sum())

            cops_in_window = (cop_dates > death_date) & (cop_dates <= date_one_year_after_death) & countable_cops
            dead_cops_after = df_cops_filtered.loc[cops_in_window, "DEAD_COP_COUNT"].sum()
            dead_cops_after_shootings = df_cops_filtered.loc[cops_in_window & cops_shot, "DEAD_COP_COUNT"].sum()

        # Add all values to dictionary, to be added to dataframe
        add_to_value_dict(dead_cops_before, dead_cops_before_shootings, dead_cops_after, dead_cops_after_shootings, dead_civs_before, dead_civs_after)

    # **************************************************************************************************************************
    # Adds Results of Loop for agency_responsible_X to DMFS
    # **************************************************************************************************************************

    # Dead Cops
    df_civs[f"cops_from_{agency_responsible_col}_that_civillians_killed_365_days_before_death_for_all_deaths"] = values_dict["dead_cops_365_days_before_all"]
    df_civs[f"cops_from_{agency_responsible_col}_that_civillians_killed_365_days_before_death_for_fatal_shootings_only"] = values_dict["dead_cops_365_days_before_fatal_shootings"]
    df_civs[f"cops_from_{agency_responsible_col}_that_civillians_killed_365_days_after_death_for_all_deaths"] = values_dict["dead_cops_365_days_after_all"]
    df_civs[f"cops_from_{agency_responsible_col}_that_civillians_killed_365_days_after_death_for_fatal_shootings_only"] = values_dict["dead_cops_365_days_after_shootings"]

    # Dead Civs
    df_civs[f"civilians_fatally_shot_by_{agency_responsible_col}_365_days_before_death"] = values_dict["dead_civs_365_days_before"]
    df_civs[f"civilians_fatally_shot_by_{agency_responsible_col}_365_days_after_death"] = values_dict["dead_civs_365_days_after"]

# 2 | Polish DMFS

### 2.1 | Aux Functions

In [None]:
# Computes a new column order in which 'col_name_to_move' sits immediately after
# 'col_that_col_name_to_move_will_be_placed_after'. The dataframe itself is not modified:
# the re-ordered list of column names is returned so the caller can index the dataframe with it.

def move_column(df, col_name_to_move, col_that_col_name_to_move_will_be_placed_after):

    ordered = df.columns.tolist()

    # Take the column out of its current position (raises ValueError if it doesn't exist)
    ordered.pop(ordered.index(col_name_to_move))

    # Re-assemble the list with the column placed right after its anchor
    split_at = ordered.index(col_that_col_name_to_move_will_be_placed_after) + 1
    return ordered[:split_at] + [col_name_to_move] + ordered[split_at:]

### 2.2 | DM-FS: Civilians. Makes a few brief corrections (as not many are needed)

In [None]:
# Gives a few columns more consistent & intuitive names:
# the three "erroneous_inclusion_*" flags become "contested_inclusion_*", and "date" becomes "death_date"
civ_column_renames = {
    f"erroneous_inclusion_{database}": f"contested_inclusion_{database}"
    for database in ("fe", "mpv", "wp")
}
civ_column_renames["date"] = "death_date"

df_civs = df_civs.rename(columns=civ_column_renames)

In [None]:
# Excel messed up our dataset such that all "N/A" strings for unknown agency types, local governance names, and districts appear as null/NaN values. Let's fix this

# First, let's fill any absent URLs with filler text.
# The result is assigned back to the column: calling fillna(inplace=True) on a selected column is chained
# assignment, which pandas deprecates and which does not update the dataframe under Copy-on-Write.
for url_column in ("URL_FE", "URL_MPV", "URL_MANUAL_CORRECTION"):
    df_civs[url_column] = df_civs[url_column].fillna("No URL Avaliable")

# Second, any remaining NA values should be filled with "Not Applicable". In other words, these should only appear either if (i) the agency did not have an ORI, hence CSLLEA info couldn't be pulled
# or (ii) if there was no agency responsible #2, #3, #4, etc.
df_civs.fillna("Not Applicable", inplace=True)

In [None]:
# Ensures there are no more NAs anywhere in DMFS - Civs.
# The total null count is compared to zero: `.sum().all() == 0` only fails when EVERY column contains a null,
# so it would let a dataframe with nulls in just some columns through.
remaining_nulls = int(df_civs.isnull().sum().sum())
assert remaining_nulls == 0, f"DMFS - Civs still contains {remaining_nulls} null value(s)"
print("Assertin passed. There are no more unknown / NaN / null values in DMFS - Civs")

### 2.3 | DM-FS: Officers: Add Anomolies via Anomolies Table

In [None]:
# Reads the supplementary table that lists the known issues (anomolies) with officer datapoints
DIR_OFFICER_ANOMOLIES = "../Fatally Shot Officers/"

df_cop_anomolies = pd.read_csv(DIR_OFFICER_ANOMOLIES + "DM-FS Officers Supplement - Table of Anomolies.csv")

# Both date columns are parsed so they can be compared against DMFS dates and subtracted from one another
for date_column in ("Death_Date_Final", "Death_Date_Original"):
    df_cop_anomolies[date_column] = pd.to_datetime(df_cop_anomolies[date_column])

df_cop_anomolies.head(1)

In [None]:
# Removes the original anomoly flags; they are rebuilt below from the anomolies table
original_flag_columns = ["DATE_CORRECTION", "OTHER_ANOMOLY", "EXCLUDE_DEATH"]
df_cops.drop(columns=original_flag_columns, inplace=True)

# Fresh columns with their defaults: the anomoly text, whether the death is flagged for exclusion, and why
fresh_flag_defaults = {
    "ANOMOLY": "None",
    "EXCLUDE_DEATH": False,
    "EXCLUSION_REASON": "Not Applicable",
}
for flag_column, default_value in fresh_flag_defaults.items():
    df_cops[flag_column] = default_value

# Popping NOTES and re-adding it places it at the very end of the df
notes_column = df_cops.pop("NOTES")
df_cops["NOTES"] = notes_column

In [None]:
# Records each anomoly from the anomoly table in DMFS-Cops.
# Note that the only exclusion applied here is for extreme date discrepancies.

for i, anomolous_row in df_cop_anomolies.iterrows():

    # Finds the DMFS row in which there's an anomoly (matched on agency ORI9 + final death date)
    hit = df_cops.query(f"ORI9 == '{anomolous_row['ORI9']}' & DEATH_DATE == '{anomolous_row['Death_Date_Final']}'")

    # Ensures there's exactly ONE matching row. This is checked BEFORE the row is read,
    # so that zero hits produces this explanatory error rather than a bare IndexError.
    if hit.shape[0] != 1:
        raise ValueError(f"ERROR - There's either >1 hit or 0 hits for [{anomolous_row['LEA']}]. There should only be exactly one hit - check the data.")

    dmfs_index_to_flag = hit.index[0]

    # Records Anomoly in DMFS Cops
    df_cops.loc[dmfs_index_to_flag, "ANOMOLY"] = anomolous_row["Anomoly"]

    # After anomoly is recorded, flags for exclusion ONLY IF the anomoly is extreme - i.e., if it's an extreme date-related one
    if anomolous_row["Correction(s)"] == "Exclude & Change Date":
        df_cops.loc[dmfs_index_to_flag, "EXCLUDE_DEATH"] = True

        # Calculates & displays the extreme date difference (in days)
        date_delta = abs(anomolous_row["Death_Date_Final"] - anomolous_row["Death_Date_Original"]).days
        print(f"Flagging {anomolous_row['LEA']} for exclusion because of date delta: {date_delta}")
        df_cops.loc[dmfs_index_to_flag, "EXCLUSION_REASON"] = "Extreme date difference between date of injury and date of death: difference is greater than 6 months"
        

### 2.4 | DM-FS: Officers. Flag rows for exclusion

In [None]:
# Flags officer deaths for exclusion, recording the first applicable reason for each row.
# Rows already flagged (the date-related exclusions set from the anomolies table) are left untouched.
for row_label, officer_row in df_cops.iterrows():

    if officer_row["EXCLUDE_DEATH"]:
        continue

    # Reasons are checked in priority order: invalid ORI, then no "before" window, then no "after" window
    if officer_row["ORI9"] == "-1":
        exclusion_reason = "Invalid ORI9 code. Agency cannot be linked without valid identifier."
    elif officer_row["COPS_AGENCY_LOST_365_DAYS_BEFORE_DEATH_ALL"] == -1:
        exclusion_reason = "Failed to meet minimum date. Death failed to occur after 12/31/2015, preventing us from measuring a 365 day period before death."
    elif officer_row["COPS_AGENCY_LOST_365_DAYS_AFTER_DEATH"] == -1:
        exclusion_reason = "Failed to meet maximum date. Death failed to occur before 1/1/2020, preventing us from measuring a 365 day period after death."
    else:
        # Nothing wrong with this row
        continue

    df_cops.loc[row_label, "EXCLUDE_DEATH"] = True
    df_cops.loc[row_label, "EXCLUSION_REASON"] = exclusion_reason
    

In [None]:
# Tallies how many officer rows carry each exclusion reason (rows not excluded show as "Not Applicable")
df_cops["EXCLUSION_REASON"].value_counts()

### 2.5 | DM-FS: Officers. Polish Columns

In [None]:
# Lower-cases every column name for legibility, then restores the acronyms LEA and ORI to upper case.
# NOTE(review): the replacements are plain substring replacements, so any column name that merely
# contains "lea" or "ori" would also be altered - verify against the full column list.
lowercased_columns = df_cops.columns.str.lower()
df_cops.columns = lowercased_columns.str.replace("lea", "LEA").str.replace("ori", "ORI")

In [None]:
# Re-orders columns. Each entry is (column to move, column it should directly follow); moves are applied in order.
column_moves = [
    ("region", "state_abr"),                 # Region comes right after the state abbreviation
    ("was_ambush", "killer_crim_history"),   # "Was ambush" joins the killer section
    ("death_vignette", "fplace"),            # Vignette info follows fplace ...
    ("unique_vignette", "death_vignette"),   # ... with the unique vignette directly behind the death vignette
]

for column_to_move, placed_after in column_moves:
    df_cops = df_cops[move_column(df_cops, col_name_to_move=column_to_move, col_that_col_name_to_move_will_be_placed_after=placed_after)]


In [None]:
# Gives a few columns more consistent & intuitive names (old name -> new name)
cop_column_renames = {
    'was_ambush': 'killer_initiated_ambush',
    'state_abr': 'LEA_state_abbreviation',
    'region': 'LEA_region',
}
df_cops = df_cops.rename(columns=cop_column_renames)

In [None]:
# Replaces missing values.
# The result is assigned back to the column: calling fillna(inplace=True) on a selected column is chained
# assignment, which pandas deprecates and which does not update the dataframe under Copy-on-Write.
df_cops["notes"] = df_cops["notes"].fillna("No notes")

# The only remaining missing values are those derived from CSLLEA like LEA Type, subtype, county, district, etc. Thus, the only way they'd be NA is if the agency doesn't have an ORI code.
# Thus, let's replace them with Not Applicable, just like we did with DMFS-Civ
df_cops.fillna("Not Applicable", inplace=True)

In [None]:
# Ensures there are no more NAs anywhere in DMFS - Officers.
# The total null count is compared to zero: `.sum().all() == 0` only fails when EVERY column contains a null,
# so it would let a dataframe with nulls in just some columns through.
remaining_nulls = int(df_cops.isnull().sum().sum())
assert remaining_nulls == 0, f"DMFS - Officers still contains {remaining_nulls} null value(s)"
print("Assertin passed. There are no more unknown / NaN / null values in DMFS - Officers")

# 3 | Redactions, Add Source Material, DM-FS (Unabridged) Export

### 3.1 | CSLLEA Redactions

We cannot redistribute data from the CSLLEA - owned by the ICPSR - until a license is negotiated. Thus, this code removes all CSLLEA data, and it replaces crucial CSLLEA data with a derivative.

In [None]:
# Gathers every agency identifier that appears in either dataset:
# the five "agency responsible" slots of DMFS civs, followed by the officers' ORI9 column
identifier_columns = [df_civs[f'agency_responsible_{slot}_ORI9'] for slot in range(1, 6)]
identifier_columns.append(df_cops['ORI9'])

all_identifiers = []
for identifier_column in identifier_columns:
    all_identifiers.extend(identifier_column)

# np.unique de-duplicates and sorts the identifiers
unique_leas = np.unique(all_identifiers)

print(f"Unique LEAs = {len(unique_leas)}")

In [None]:
# Derives a reproducible 8-character hexadecimal ID from a seed value.
# Side effect: re-seeds Python's global random generator with 'seed_value'.
def generate_id(seed_value):
    random.seed(seed_value)
    reproducible_uuid = uuid.UUID(int=random.getrandbits(128))
    # The first 8 hex digits are the UUID's leading group (the part before its first dash)
    return reproducible_uuid.hex[:8]


# Builds the identifier -> anonymous ID lookup; each LEA's ID is seeded by its position in unique_leas
lea_id_mapping = {}
for position, lea in enumerate(unique_leas):
    lea_id_mapping[str(lea)] = generate_id(position)

# The invalid-identifier marker "-1" stays "-1"
lea_id_mapping["-1"] = "-1"

In [None]:
# Swaps every ORI9 identifier for its anonymous LEA ID in both dataframes, then renames the columns to match

# DMFS Cops
df_cops["ORI9"] = df_cops["ORI9"].map(lea_id_mapping)
df_cops = df_cops.rename(columns={'ORI9': 'LEA_ID'})

# DMFS Civilians: convert all five agency slots first, then rename them in a single call
civ_id_renames = {}
for slot in range(1, 6):
    ori_column = f'agency_responsible_{slot}_ORI9'
    df_civs[ori_column] = df_civs[ori_column].map(lea_id_mapping)
    civ_id_renames[ori_column] = f'agency_responsible_{slot}_LEA_ID'

df_civs.rename(columns=civ_id_renames, inplace=True)

In [None]:
# Lookup tables that replace the CSLLEA classification labels ("(code) label") with the bare label

# LEA types: (CSLLEA code, label)
lea_type_remap = {
    f"({code}) {label}": label
    for code, label in [
        ("000", "Local police department"),
        ("001", "Sheriff's office"),
        ("005", "State law enforcement agency"),
        ("006", "Special jurisdiction"),
        ("007", "Constable/Marshal"),
    ]
}

# LEA subtypes: the "not applicable" label is additionally re-capitalised, so it is listed explicitly
lea_subtype_remap = {'(888) Not applicable': 'Not Applicable'}
lea_subtype_remap.update({
    f"({code}) {label}": label
    for code, label in [
        ("001", "Public buildings/facilities"),
        ("003", "Transportation systems/facilities"),
        ("002", "Natural resources/parks and recreation"),
        ("004", "Criminal investigations"),
        ("005", "Special enforcement"),
    ]
})


In [None]:
# DMFS Cops: relabel type & subtype, keep the first subtype column (renamed to LEA_subtype), drop the second
df_cops["LEA_type"] = df_cops["LEA_type"].replace(lea_type_remap)
df_cops["LEA_subtype_1"] = df_cops["LEA_subtype_1"].replace(lea_subtype_remap)
df_cops = df_cops.rename(columns={'LEA_subtype_1': 'LEA_subtype'}).drop(columns=["LEA_subtype_2"])

# DMFS Civilians: the same treatment for each of the five responsible-agency slots
for slot in range(1, 6):
    slot_prefix = f"agency_responsible_{slot}"
    type_column = f"{slot_prefix}_LEA_TYPE"
    subtype_column = f"{slot_prefix}_LEA_SUBTYPE_1"

    df_civs[type_column] = df_civs[type_column].replace(lea_type_remap)
    df_civs[subtype_column] = df_civs[subtype_column].replace(lea_subtype_remap)
    df_civs.rename(columns={subtype_column: f"{slot_prefix}_LEA_SUBTYPE"}, inplace=True)
    df_civs.drop(columns=[f"{slot_prefix}_LEA_SUBTYPE_2"], inplace=True)

In [None]:
# Drops all other CSLLEA-ingested data

# DMFS Cops
csllea_cop_columns = ["ORI7", "local_government_name", "district", "fips_state", "fips_county", "fcounty", "fplace"]
df_cops.drop(columns=csllea_cop_columns, inplace=True)

# DMFS Civilians: the same five fields exist once per responsible-agency slot
csllea_civ_fields = ["ORI7", "LOCAL_GOVERNMENT_NAME", "DISTRICT", "FIPS_STATE", "FIPS_COUNTY"]
csllea_civ_columns = [
    f"agency_responsible_{slot}_{field}"
    for slot in range(1, 6)
    for field in csllea_civ_fields
]
df_civs.drop(columns=csllea_civ_columns, inplace=True)

In [None]:
# Writes a copy of the anomolies table (DMFS: Officers) that carries the anonymous LEA ID of each agency
anomolies_source_path = '../Fatally Shot Officers/DM-FS Officers Supplement - Table of Anomolies.csv'
anomolies_output_path = '../Fatally Shot Officers/DM-FS Officers Supplement - Table of Anomolies with LEA IDs.csv'

df_anomolies = pd.read_csv(anomolies_source_path)

# The LEA ID is looked up from the table's ORI9 column
df_anomolies["LEA_ID"] = df_anomolies["ORI9"].map(lea_id_mapping)

df_anomolies.to_csv(anomolies_output_path, index=False)

### 3.2 | Sci Data Redactions (ARCHIVED)

Through both our initial submission and the R&R, the journal asked us to make the following redactions. They reversed their decision prior to publication; however, the original redactions appear below.

In [None]:
# Civs - Redacts Date to just YYYY/MM
#df_civs['death_date'] = df_civs['death_date'].dt.strftime('%Y-%m')

In [None]:
# Civs - Drops columns that contain identifying information
#df_civs.drop(columns = ["victim_name", "URL_FE", "URL_MPV", "URL_MANUAL_CORRECTION"], inplace=True)

In [None]:
# Cops - Redacts Date to just YYYY/MM
#df_cops['death_date'] = df_cops['death_date'].dt.strftime('%Y-%m')

In [None]:
# Cops - Drops 
#df_cops.drop(columns=["death_vignette", "LEA_name",], inplace=True)

### 3.3 | Explicate Source Material

The journal requires a URL to the original source material that each datapoint came from. This section implements that requirement

In [None]:
# DM-FS Civs. Creates one source column per upstream database: it holds that database's URL
# when the datapoint was included in it, and "Not Applicable" otherwise.
# Each entry is (inclusion flag column, new source column, database URL)
civ_sources = [
    ('included_in_fe', 'source_FE', "https://fatalencounters.org/"),                     # Fatal Encounters
    ('included_in_mpv', 'source_MPV', "https://mappingpoliceviolence.org/"),             # Mapping Police Violence
    ('included_in_wp', 'source_WP', "https://github.com/washingtonpost/data-police-shootings/tree/master/v2"),  # Washington Post
]

for inclusion_flag, source_column, database_url in civ_sources:
    df_civs[source_column] = df_civs[inclusion_flag].replace({True: database_url, False: "Not Applicable"})

In [None]:
# Sanity Check: no row may have "Not Applicable" in all three source columns at once
source_columns = ['source_FE', 'source_MPV', 'source_WP']
lacks_every_source = (df_civs[source_columns] == "Not Applicable").all(axis=1)

assert not lacks_every_source.any(), \
    "At least one datapoint in DM-FS Civilians doesn't have any source material"

print("Assertion passed! All datapoints in DM-FS Civilians have their source listed.")

In [None]:
# DM-Officers. Explicates the LEOKA report from which the data was extracted.

# The death year is used as a heuristic for which LEOKA report the officer belonged to.
# For example, if they died in 2015, they almost certainly appeared in the 2015 LEOKA report.
df_cops['death_date'] = pd.to_datetime(df_cops['death_date'])
df_cops['LEOKA_year'] = df_cops['death_date'].dt.year

# However, there were some exceptions --- datapoints whose death date does not correspond to the LEOKA year in which the death appeared.
# An enumerated list of these exceptions appears in the Technical Validation Tables, under Table 1 - LEOKA Anomolies and Their Correction.csv
# The table is transcribed manually below as (LEA ID, death date, LEOKA report year)
leoka_year_corrections = [
    ('e6180c7a', "10/16/2014", 2015),  # Fatally wounded in 2014, appeared in LEOKA 2015
    ('6ba6362e', "2/8/2012", 2017),    # Fatally wounded in 2012, appeared in LEOKA 2017
    ('bdf88814', "7/14/1988", 2017),   # Fatally wounded in 1988, appeared in LEOKA 2017
    ('2792fd11', "12/10/1994", 2018),  # Fatally wounded in 1994, appeared in LEOKA 2018
    ('-1', "6/18/2007", 2018),         # Fatally wounded in 2007, appeared in LEOKA 2018
    ('855c3c27', "11/12/2003", 2020),  # Fatally wounded in 2003, appeared in LEOKA 2020
]

for lea_id, recorded_date, report_year in leoka_year_corrections:
    # Search condition: same agency AND same death date
    is_exception = (df_cops['LEA_ID'] == lea_id) & (df_cops['death_date'] == pd.to_datetime(recorded_date))
    df_cops.loc[is_exception, 'LEOKA_year'] = report_year

In [None]:
# Finally, adds a URL to the original LEOKA report, looked up by report year
leoka_report_urls = {
    2015: "https://ucr.fbi.gov/leoka/2015/officers-feloniously-killed/leoka-felonious-summaries-2015",
    2016: "https://ucr.fbi.gov/leoka/2016/officers-feloniously-killed/leoka-felonious-summaries-2016",
    2017: "https://ucr.fbi.gov/leoka/2017/resource-pages/felonious-summaries",
    2018: "https://ucr.fbi.gov/leoka/2018/resource-pages/summaries-officers-feloniously-killed",
    2019: "https://ucr.fbi.gov/leoka/2019/resource-pages/summaries-officers-feloniously-killed",
    2020: "https://cde.ucr.cjis.gov/LATEST/webapp/",
}
df_cops['LEOKA_URL'] = df_cops['LEOKA_year'].replace(leoka_report_urls)

In [None]:
# Ensures every row of DM-FS Cops has a source URL: the column may hold neither nulls nor blank strings
# NOTE(review): a year with no entry in the URL lookup would presumably remain a bare number here and
# still pass both checks - confirm every LEOKA_year is covered.
leoka_urls = df_cops['LEOKA_URL']
assert leoka_urls.notna().all(), "There are null values in the LEOKA_URL column."
assert leoka_urls.str.strip().ne("").all(), "There are empty strings in the LEOKA_URL column."

print("Assertions passed. Each datapoint in DM-FS Officers has a URL to its source material")

### 3.4 | Export DM-FS Officers & Civilians

In [None]:
# DM-FS CIVILIANS

# The dataframe index is exported as the 'victim_ID' column
df_civs.index = df_civs.index.rename('victim_ID')

# Exports (index included)
civs_export_path = "../DMFS - Civilians.csv"
df_civs.to_csv(civs_export_path, index=True)

In [None]:
# DM-FS OFFICERS

# The dataframe index is exported as the 'incident_ID' column
df_cops.index = df_cops.index.rename('incident_ID')

# Exports (index included)
cops_export_path = "../DMFS - Officers.csv"
df_cops.to_csv(cops_export_path, index=True)

# 4 | Creates DM-FS (Cleaned) by Applying Filtering & Subsetting

### 4.1 | Aux Functions

In [None]:
# Takes a series e.g., df['race'] and displays the values alongside counts

def display_count_alongside_percentage(display_series, title):
    """Print a frequency table for a series.

    Prints ``title``, then one ``value = count (share%)`` line per distinct
    value as returned by ``value_counts()``, a dashed rule, and the total.
    Shares are relative to the sum of the reported counts.

    Args:
        display_series: Series to tabulate, e.g. ``df['race']``.
        title: Heading printed above the table.
    """
    print(title)

    tallies = display_series.value_counts()
    grand_total = tallies.sum()
    row_template = "{} = {} ({:.2f}%)"

    for label, tally in tallies.items():
        share = round((tally / grand_total) * 100, 2)
        print(row_template.format(label, tally, share))

    print("-" * 20)
    print("Total = ", grand_total)

### 4.2 | DMFS Officers

In [None]:
# Zero-row preview: shows only the column headers of df_cops
df_cops.head(0)

In [None]:
# Summarise why rows are being excluded (counts and shares per reason)
display_count_alongside_percentage(df_cops["exclusion_reason"], "Exclusions in DMFS-Officers")

# Keep only the rows that are NOT flagged for exclusion
rows_to_keep = ~df_cops["exclude_death"]
df_cops = df_cops[rows_to_keep]
print("Final size = ", df_cops.shape[0])

In [None]:
# Remove the bookkeeping columns that are no longer needed once exclusions are applied
helper_columns = ["exclude_death", "exclusion_reason", "anomoly"]
df_cops.drop(columns=helper_columns, inplace=True)

In [None]:
# Export the filtered officers table; index=True writes the index as the first CSV column.
# NOTE(review): this file name uses "DM-FS" while the civilians export uses "DMFS" — confirm intended.
df_cops.to_csv("../DM-FS Officers (cleaned).csv", index=True)

### 4.3 | DMFS Civilians

In [None]:
# Excludes victims for whom none of the (up to five) responsible agencies has a usable LEA ID.
# A row survives when at least one agency_responsible_N_LEA_ID differs from '-1'.
drop_counter = 0  # running total of excluded rows across all filters
original_size = size_before_exclusions = df_civs.shape[0]

has_valid_agency = (
    "agency_responsible_1_LEA_ID != '-1' | "
    "agency_responsible_2_LEA_ID != '-1' |  "
    "agency_responsible_3_LEA_ID != '-1' |  "
    "agency_responsible_4_LEA_ID != '-1' | "
    "agency_responsible_5_LEA_ID != '-1'"
)
df_civs = df_civs.query(has_valid_agency)

rows_dropped = size_before_exclusions - df_civs.shape[0]
drop_counter += rows_dropped

share_dropped = round(rows_dropped / original_size * 100, 2)
print(f" Dropped {rows_dropped} rows due to non-valid ORI codes  ({share_dropped}% of all rows)")

In [None]:
# Excludes victims who fell BEFORE our date range.
# A row survives when at least one agency's "365_days_before_death" count differs from -1.
size_before_exclusions = df_civs.shape[0]

has_lookback_count = (
    "civilians_fatally_shot_by_agency_responsible_1_365_days_before_death != -1 | "
    "civilians_fatally_shot_by_agency_responsible_2_365_days_before_death != -1 | "
    "civilians_fatally_shot_by_agency_responsible_3_365_days_before_death != -1 | "
    "civilians_fatally_shot_by_agency_responsible_4_365_days_before_death != -1 | "
    "civilians_fatally_shot_by_agency_responsible_5_365_days_before_death != -1"
)
df_civs = df_civs.query(has_lookback_count)

rows_dropped = size_before_exclusions - df_civs.shape[0]
drop_counter += rows_dropped

share_dropped = round(rows_dropped / original_size * 100, 2)
print(
    f" Dropped {rows_dropped} rows due to the death occuring during/before 12/31/2015, as before that point, we cannot accurately measure fatal shootings 365 days before death"
    f"({share_dropped}% of all rows)"
)

In [None]:
# Drops victims who fell too late in our date range for a 365-day look-ahead.
# A row survives when at least one agency's "365_days_after_death" count differs from -1.
size_before_exclusions = df_civs.shape[0]
df_civs = df_civs.query("civilians_fatally_shot_by_agency_responsible_1_365_days_after_death != -1 | civilians_fatally_shot_by_agency_responsible_2_365_days_after_death != -1 | "
                        "civilians_fatally_shot_by_agency_responsible_3_365_days_after_death != -1 | civilians_fatally_shot_by_agency_responsible_4_365_days_after_death != -1 | "
                       "civilians_fatally_shot_by_agency_responsible_5_365_days_after_death != -1")
rows_dropped = (size_before_exclusions - df_civs.shape[0])
drop_counter += rows_dropped
# Message corrected: this filter removes deaths that are too LATE (on/after 1/1/2020),
# not too early, for the look-ahead window to be measured.
print(f" Dropped {rows_dropped} rows due to the death occuring during/after 1/1/2020, as after that point, we cannot accurately measure fatal shootings 365 days after death"
      f"({round(rows_dropped / original_size * 100, 2)}% of all rows)")

# Total exclusions: the percentage must use the cumulative drop_counter,
# not rows_dropped (which only holds the rows removed by this last filter).
print(f"{'*'*30}\nTOTAL EXCLUSIONS = {drop_counter}/{original_size} ({round(drop_counter / original_size * 100, 2)}%)")

In [None]:
# Drop columns

original_col_count = df_civs.shape[1]

# Removes the contiguous run of discrepancy columns,
# from "date_fe" through "date_discrepancy_days" (both included)
start_idx = df_civs.columns.get_loc("date_fe")
end_idx = df_civs.columns.get_loc("date_discrepancy_days")
discrepancy_columns = df_civs.columns[start_idx:end_idx + 1]
df_civs.drop(columns=discrepancy_columns, inplace=True)

columns_removed = original_col_count - df_civs.shape[1]
print(f"Dropped {columns_removed} columns (Original = {original_col_count})")

In [None]:
# Export the filtered civilians table; index=True writes the index as the first CSV column.
df_civs.to_csv("../DMFS Civilians (cleaned).csv", index=True)