# First Notebook: Create DMFS

The purpose of this notebook is to create DM-FS by merging the three crowdsourced databases, restricted to civilians who were fatally shot between 1 January 2015 and 31 December 2020. The code highlights any discrepancies found between the databases. The end result of this notebook is a DM-FS that still needs manual corrections, together with a list of the manual corrections that need to be made.

# 0 | Imports & Settings

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter # Unique value ID

In [2]:
from thefuzz import fuzz
import math

In [3]:
import warnings
# Silences every FutureWarning for the rest of the session.
# NOTE(review): this also hides library deprecation notices, so behaviour changes in newer library versions will not be announced.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Removes pandas' truncation limits so displayed DataFrames show every row and every column
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# 1 | Loads & Cleans Datasets

Loads & Subsets all 3 DFs | Handles NAs | Recodes so their names are consistent

### 1.1 | Loads DFs

Loads data, ensures column names are consistent, filters only relevant deaths (non-suicides, gunshots), and ensures deaths are within date range

In [5]:
# Loads the three source databases from the shared database directory
DIR_DATABASE = "res/1.0 - Original Fatal Shooting Databases/"

# FE is read with low_memory=False so pandas infers each column's dtype from the whole file at once
fe = pd.read_csv(DIR_DATABASE + "db_FE.csv", low_memory = False)
mpv, wp = (pd.read_csv(DIR_DATABASE + file_name) for file_name in ("db_MPV.csv", "db_WP.csv"))

In [6]:
# Keeps only the MPV rows whose cause of death is exactly "Gunshot".
# FE is subsetted later, after it undergoes preprocessing.
is_gunshot_death = mpv["cause_of_death"].eq("Gunshot")
mpv = mpv.loc[is_gunshot_death]

In [7]:
# COLUMNS

# Subset databases to only those columns we need.
# .copy() makes each subset an independent frame, so the column assignments below (and in later
# cells) are not treated as writes to a slice of the raw frame (avoids SettingWithCopyWarning).
fe = fe[ [' Date of injury resulting in death (month/day/year)', 'Name', 'Age', 'Gender', 'Race', 'Location of death (city)', 'State', 'Agency or agencies involved', 'Supporting document link'] ].copy()
mpv = mpv[ ['date', 'name', 'age', 'gender', 'race', 'city', 'state', 'agency_responsible', 'news_urls', ] ].copy()
wp = wp[ ['date', 'name', 'age', 'gender', 'race', 'city', 'state', 'agency_ids',] ].copy()

# The WP subset has no link column; a placeholder keeps the three schemas the same width
wp["URL"] = "None"

# Makes col names consistent between databases (the order matches the column order selected above)
COL_NAMES = ['date', 'victim_name', 'victim_age', 'victim_gender', 'victim_race', 'city_of_death', 'state_of_death', 'agency_responsible', "URL"]
fe.columns = COL_NAMES
mpv.columns = COL_NAMES
wp.columns = COL_NAMES

In [8]:
# DATES

# Converts dates to proper datatype
fe['date'] = pd.to_datetime(fe['date'])
mpv['date'] = pd.to_datetime(mpv['date'])
wp['date'] = pd.to_datetime(wp['date'])

# Filters dates to only include deaths from Jan 1, 2015 until December 31, 2020 (both ends inclusive)
fe = fe[ (fe['date'] >= '2015-01-01') & (fe['date'] <= '2020-12-31') ]
mpv = mpv[ (mpv['date'] >= '2015-01-01') & (mpv['date'] <= '2020-12-31') ]
wp = wp[ (wp['date'] >= '2015-01-01') & (wp['date'] <= '2020-12-31') ]

# Ensures the most recent date comes first in all the databases, followed by name in alpha order.
# The sorted MPV frame must be bound back to `mpv`; binding it to any other name leaves MPV unsorted.
# NOTE(review): row order determines the positions rows receive when the frames are re-indexed later,
# so any saved artefact keyed on MPV row positions should be regenerated after this cell changes.
fe = fe.sort_values(by=['date', 'victim_name'], ascending=[False, True])
mpv = mpv.sort_values(by=['date', 'victim_name'], ascending=[False, True] )
wp = wp.sort_values(by=['date', 'victim_name'], ascending=[False, True] )

### 1.2 | Handles missing values

In [9]:
# Prints the number of missing values per column for each database.
# Every label after the first is preceded by a blank line.
for position, (label, frame) in enumerate([("FE", fe), ("MPV", mpv), ("WP", wp)]):
    print(label if position == 0 else f"\n{label}")
    print(frame.isna().sum())

FE
date                    0
victim_name             0
victim_age            526
victim_gender          54
victim_race             0
city_of_death           3
state_of_death          0
agency_responsible      3
URL                     0
dtype: int64

MPV
date                    0
victim_name             0
victim_age            232
victim_gender           0
victim_race             1
city_of_death           4
state_of_death          0
agency_responsible      1
URL                     0
dtype: int64

WP
date                    0
victim_name           140
victim_age            170
victim_gender           1
victim_race           395
city_of_death           1
state_of_death          0
agency_responsible      0
URL                     0
dtype: int64


In [10]:
# Replaces missing LEAs with 'Unknown' (both empty strings and NaN)
fe['agency_responsible'] = fe['agency_responsible'].replace('', 'Unknown').fillna('Unknown')

mpv['agency_responsible'] = mpv['agency_responsible'].replace('', 'Unknown').fillna('Unknown')

# WP uses internal LEA codes, not text, to ID LEAs, so its "unknown" marker is a code rather than a word.
# The marker is the *string* '-1': this column is later split with the .str accessor and translated
# through a lookup keyed by the string '-1'. An integer -1 would be turned into a missing value by
# .str.split and would never be translated to "Unknown".
wp['agency_responsible'] = wp['agency_responsible'].replace('', '-1').fillna('-1')

In [11]:
# Replaces an unknown victim age with -1 and stores ages as integers
for frame in (fe, mpv, wp):
    numeric_age = pd.to_numeric(frame['victim_age'], errors='coerce') # anything non-numeric becomes NaN
    frame['victim_age'] = numeric_age.fillna(-1.0).astype(int)

In [12]:
# Replaces missing values with "Unknown" string for all remaining columns.
# Each column is paired with the dataframes that get filled for it.
frames_to_fill = {
    'victim_name'    : [wp],           # Victim Name
    'victim_gender'  : [fe, mpv, wp],  # Victim Gender
    'victim_race'    : [mpv, wp],      # Victim Race
    'city_of_death'  : [fe, mpv, wp],  # City of Death
    'state_of_death' : [fe, mpv, wp],  # State of Death
}

for column, frames in frames_to_fill.items():
    for frame in frames:
        frame[column] = frame[column].fillna("Unknown")

In [13]:
# Verifies that no missing values are left in any of the three dataframes
for frame in (fe, mpv, wp):
    assert not frame.isna().to_numpy().any()
print("All assertions passed. There are no NAs in any of the subsetted dataframes")

All assertions passed. There are no NAs in any of the subsetted dataframes


### 1.3 | Remaps Datasets so they all have Consistent Values

e.g., all datasets will list the same races, genders, etc. 

In [14]:
def print_vals(df_dict, col_to_print):
    """Print the value counts of one column for every dataframe in a dict.

    df_dict maps a display name to a DataFrame; col_to_print is the column whose
    value counts are printed. Each dataframe's section ends with a blank line.
    """
    for name in df_dict:
        counts = df_dict[name][col_to_print].value_counts()
        print(f"Print database [{name}]", counts, "", sep="\n")

In [15]:
# GENDER

# Capitalises WP's gender labels so they match the labels used by FE & MPV
wp['victim_gender'] = wp['victim_gender'].replace(["male", "female"], ["Male", "Female"])
print_vals(dict(fe=fe, mpv=mpv, wp=wp), 'victim_gender')

Print database [fe]
victim_gender
Male           9662
Female         1039
Unknown          54
Transgender      14
Name: count, dtype: int64

Print database [mpv]
victim_gender
Male           5757
Female          293
Transgender       9
Unknown           4
Name: count, dtype: int64

Print database [wp]
victim_gender
Male       5685
Female      260
Unknown       1
Name: count, dtype: int64



In [16]:
# RACE

# Pairs each database with the recoding that brings its race labels onto the shared vocabulary
race_recodes = [
    (fe, {
        "European-American/White" : "White",
        "African-American/Black" : "Black",
        "Hispanic/Latino" : "Hispanic",
        "Race unspecified" : "Unknown",
        "Asian/Pacific Islander" : "Asian & Pacific Islander",
        "Native American/Alaskan" : "Native American",
        "Middle Eastern" : "Other",
    }),
    (wp, {
        "W" : "White",
        "B" : "Black",
        "H" : "Hispanic",
        "A" : "Asian & Pacific Islander",
        "N" : "Native American",
        "O" : "Other",
        "W;B;N" : "Other",
    }),
    (mpv, {
        "Unknown race" : "Unknown",
        "Asian" : "Asian & Pacific Islander",
        "Native Hawaiian and Pacific Islander" : "Asian & Pacific Islander",
    }),
]

for frame, recode in race_recodes:
    frame['victim_race'] = frame['victim_race'].replace(recode)

# Prints current races
print_vals(dict(fe=fe, mpv=mpv, wp=wp), 'victim_race')

Print database [fe]
victim_race
White                       4377
Black                       2601
Unknown                     1928
Hispanic                    1509
Asian & Pacific Islander     178
Native American              151
Other                         25
Name: count, dtype: int64

Print database [mpv]
victim_race
White                       2740
Black                       1484
Hispanic                    1117
Unknown                      485
Asian & Pacific Islander     131
Native American              106
Name: count, dtype: int64

Print database [wp]
victim_race
White                       2811
Black                       1456
Hispanic                    1056
Unknown                      395
Asian & Pacific Islander     119
Native American               88
Other                         21
Name: count, dtype: int64



### 1.4 | Handles Multiple LEAs

Does this by putting them into unique rows

In [17]:
def _split_agency_column(frame, separator):
    """Return `frame` with 'agency_responsible' replaced by one column per listed agency.

    The column is split on `separator`; the resulting columns are named
    agency_responsible_1, agency_responsible_2, ... and appended to the remaining columns.
    """
    parts = frame['agency_responsible'].str.split(separator, expand=True)
    parts.columns = [f'agency_responsible_{position}' for position in range(1, parts.shape[1] + 1)]
    return pd.concat([frame.drop('agency_responsible', axis=1), parts], axis=1)

# FE and MPV list multiple agencies separated by ','; WP separates them with ';'
fe = _split_agency_column(fe, ',')
mpv = _split_agency_column(mpv, ',')
wp = _split_agency_column(wp, ';')

For WP: Replace the IDs with the names of the LEAs

In [18]:
# Loads the WP agency table (used to translate WP agency IDs into names) and previews its first 3 rows
wp_agencies = pd.read_csv("WP - Fatal Shooting Agencies.csv")
wp_agencies.iloc[:3]

Unnamed: 0,id,name,type,state,oricodes,total_shootings
0,3145,Abbeville County Sheriff's Office,sheriff,SC,SC00100,1
1,2576,Aberdeen Police Department,local_police,WA,WA01401,1
2,2114,Abilene Police Department,local_police,TX,TX22101,6


In [19]:
# Maps IDs to LEA Name. IDs are stored as strings so they can be matched against string-typed columns.
wp_agencies['id'] = wp_agencies['id'].astype(str)
id_to_name = {
    agency_id: agency_name
    for agency_id, agency_name in zip(wp_agencies['id'], wp_agencies['name'])
}
id_to_name.update({'-1': "Unknown"}) # '-1' marks an unknown agency

In [20]:
# Translates the WP agency IDs in each of the five agency columns into LEA names
for i in range(1, 6):
    agency_col = f'agency_responsible_{i}'
    # Cast to str first so the values have the same type as the keys of id_to_name
    wp[agency_col] = wp[agency_col].astype(str).replace(id_to_name)

### 1.5 | Check that databases are consistent 

In [21]:
# Previews the first 3 FE rows to check the cleaned columns
fe.head(3)

Unnamed: 0,date,victim_name,victim_age,victim_gender,victim_race,city_of_death,state_of_death,URL,agency_responsible_1,agency_responsible_2,agency_responsible_3,agency_responsible_4,agency_responsible_5,agency_responsible_6,agency_responsible_7,agency_responsible_8
1998,2020-12-31,David Randall Shephard,39,Male,White,Beaumont,TX,https://www.12newsnow.com/article/news/local/s...,Beaumont Police Department,,,,,,,
2000,2020-12-31,Name withheld by police,-1,Female,Unknown,Odessa,TX,https://www.oaoa.com/news/crime_justice/law_en...,Odessa Police Department,,,,,,,
1999,2020-12-31,Rodolfo Caraballo Moreno,-1,Male,Hispanic,Miami,FL,https://www.local10.com/news/local/2021/01/01/...,Miami Police Department,,,,,,,


In [22]:
# Previews the first 3 MPV rows to check the cleaned columns
mpv.head(3)

Unnamed: 0,date,victim_name,victim_age,victim_gender,victim_race,city_of_death,state_of_death,URL,agency_responsible_1,agency_responsible_2,agency_responsible_3,agency_responsible_4,agency_responsible_5,agency_responsible_6
4304,2020-12-31,David Randall Shephard,39,Male,White,Beaumont,TX,https://www.usnews.com/news/best-states/texas/...,Beaumont Police Department,,,,,
4305,2020-12-31,Jeffrey Marvin,63,Male,Unknown,Unknown,IN,https://wsbt.com/news/local/indiana-state-poli...,Marshall County Sheriff's Department,,,,,
4306,2020-12-31,Rodolfo Caraballo Moreno,-1,Male,Hispanic,Miami,FL,https://www.local10.com/news/local/2021/01/01/...,Miami Police Department,,,,,


In [23]:
# Previews the first 3 WP rows to check the cleaned columns
wp.head(3)

Unnamed: 0,date,victim_name,victim_age,victim_gender,victim_race,city_of_death,state_of_death,URL,agency_responsible_1,agency_responsible_2,agency_responsible_3,agency_responsible_4,agency_responsible_5
5944,2020-12-31,David Randall Shepherd,39,Male,White,Beaumont,TX,,Beaumont Police Department,,,,
5945,2020-12-31,Jeffrey Marvin,63,Male,Unknown,Starke County,IN,,Marshall County Sheriff's Department,,,,
5942,2020-12-30,Christian Joseph Hall,19,Male,Asian & Pacific Islander,Hamilton Township,PA,,Pennsylvania State Police,,,,


# 2 | NLP Processing

Cleans victim name, city, and agency names so they're easier to search for and match between databases

### 2.1 | Victim Name

Slight variations in the name's spelling will prevent an exact match. Thus, let's standardize  names between databases.

In [24]:
# Each database uses its own placeholder for an undisclosed name; all become "Not Disclosed"
for frame, placeholder in ((fe, "Name withheld by police"), (mpv, "Name Withheld"), (wp, "Unknown")):
    frame["victim_name"] = frame["victim_name"].replace(placeholder, "Not Disclosed")

In [25]:
def extract_first_last_name(full_name):
    """Reduce a full name to "<first>_<last>", dropping everything in between.

    The name is split on single spaces. A one-word name has an empty last name,
    so the result ends with the underscore (e.g. "madonna" -> "madonna_").
    """
    tokens = full_name.split(' ')
    first_name = tokens[0]
    last_name = tokens[-1] if len(tokens) > 1 else ''
    return first_name + "_" + last_name

In [26]:
# Creates a dummy name column ('name_abr'): an abbreviated version of the victim's name used for comparison
for df in [fe, mpv, wp]:
    # Trims surrounding whitespace and lower-cases the name
    cleaned_name = df["victim_name"].str.strip().str.lower()

    # Fixes weird punctuation: drops ' " and . and turns hyphens into spaces.
    # regex=False makes every pattern a literal string (a regex "." would match any character).
    for old, new in (("'", ""), ("\"", ""), (".", ""), ("-", " ")):
        cleaned_name = cleaned_name.str.replace(old, new, regex=False)

    # Only keeps first & last names (removes middle names). The function returns a plain string,
    # so it is mapped directly instead of wrapping each result in a one-element Series.
    df['name_abr'] = cleaned_name.map(extract_first_last_name)

### 2.2 | Agency 

In [27]:
def remove_parentheses(text):
    """Return `text` without parenthesised parts, trimmed of surrounding whitespace.

    Returns None for empty or None input. Characters are dropped from an opening
    parenthesis up to the next closing parenthesis; the parentheses themselves are
    always dropped. Nesting is not tracked: the first ')' ends the skipped region.
    """
    if not text:
        return None

    kept_chars = []
    skipping = False
    for char in text:
        if char in '()':
            # '(' starts a skipped region, ')' ends it; neither is kept
            skipping = (char == '(')
            continue
        if not skipping:
            kept_chars.append(char)
    return ''.join(kept_chars).strip()

In [28]:
# Creates abbreviated agency columns (agency_responsible_abr_N): normalised copies of the
# agency names that are easier to compare between databases

def _drop_parenthetical_notes(value):
    """Apply remove_parentheses to a string; anything else (a missing value) becomes NaN."""
    if not isinstance(value, str):
        return np.nan
    without_notes = remove_parentheses(value)
    return np.nan if without_notes is None else without_notes

for df in tqdm([fe, mpv, wp]):
    for i in range(1,7):

        # Not every database has 6 agency columns (WP only has up to 5), so stop at the first missing one
        source_col = f'agency_responsible_{i}'
        if source_col not in df.columns:
            break

        col = f'agency_responsible_abr_{i}'
        agency_abr = df[source_col].str.lower() # Makes lower

        # Fixes weird punctuation: drops ' " and . and turns hyphens into spaces.
        # regex=False makes every pattern a literal string (a regex "." would match any character).
        for old, new in (("'", ""), ("\"", ""), (".", ""), ("-", " ")):
            agency_abr = agency_abr.str.replace(old, new, regex=False)

        # Removes 'department' and 'office' b/c they're sometimes used interchangeably between databases
        for suffix in (" department", " office"):
            agency_abr = agency_abr.str.replace(suffix, "", regex=False)

        # Removes the (parentheses) that WP adds to some LEAs as notes
        agency_abr = agency_abr.apply(_drop_parenthetical_notes)

        # Misc: singularises "sheriffs" and drops filler words that vary between databases
        agency_abr = agency_abr.str.replace("sheriffs", "sheriff", regex=False)
        for filler in ("bureau of", "division of", "metropolitan", "department", "deparment", "county"):
            agency_abr = agency_abr.str.replace(filler, "", regex=False)

        # Collapses runs of whitespace and trims the ends BEFORE spaces become underscores;
        # trimming afterwards would leave stray leading/trailing/double underscores in the key
        agency_abr = agency_abr.str.replace(r"\s+", " ", regex=True).str.strip()
        df[col] = agency_abr.str.replace(" ", "_", regex=False)

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:09<00:00,  3.26s/it]


### 2.3 | City of Death

In [29]:
# Creates an abbreviated city column ('city_of_death_abr'): a normalised copy of the
# city of death that is easier to compare between databases
for df in tqdm([fe, mpv, wp]):

    col = 'city_of_death_abr'
    city_abr = df['city_of_death'].str.lower() # Makes lower

    # Fixes weird punctuation: drops ' " and . and turns hyphens into spaces.
    # regex=False makes every pattern a literal string (a regex "." would match any character).
    for old, new in (("'", ""), ("\"", ""), (".", ""), ("-", " ")):
        city_abr = city_abr.str.replace(old, new, regex=False)

    # Misc: replaces double spaces with one space, then removes surrounding whitespace
    df[col] = city_abr.str.replace("  ", " ", regex=False).str.strip()

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 89.47it/s]


# 3 | Subsets FE to just (i) fatal shootings that were (ii) non-suicides

FE was subsetted so late into the code so that the entire dataset can undergo preprocessing. We're going to split the dataset into (i) suicides (to be compared to WP and MPV later) and (ii) non-suicides, and the non-suicides will be our new FE dataframe

### 3.1 | Performs Subsetting

In [30]:
# Reloads the raw FE file and applies the same date parsing, date window and sort order as `fe`
FE_DATE_COL = ' Date of injury resulting in death (month/day/year)'
fe_original = pd.read_csv(f"{DIR_DATABASE}db_FE.csv", low_memory = False)

# Converts dates to proper datatype
fe_original[FE_DATE_COL] = pd.to_datetime(fe_original[FE_DATE_COL])

# Filters dates to only include deaths from Jan 1, 2015 until December 31, 2020 (both ends inclusive)
in_date_window = fe_original[FE_DATE_COL].between('2015-01-01', '2020-12-31')

# Ensures the most recent date comes first, followed by name in alpha order
fe_original = fe_original[in_date_window].sort_values(by=[FE_DATE_COL, 'Name'], ascending=[False, True])

In [31]:
# Makes sure the indices are equal (same labels in the same order), since the next step selects
# rows of `fe` using index labels taken from `fe_original`.
# Index.equals returns False on a length mismatch, whereas an element-wise == would raise instead.
assert fe.index.equals(fe_original.index), "fe and fe_original no longer have identical indices"
print("Assertion passed. All FE indices are the same as fe_original indices")

Assertion passed. All FE indices are the same as fe_original indices


In [32]:
# Gets indices to use - i.e., non-suicidal fatal shootings - from the raw FE columns
is_gunshot = fe_original["Highest level of force"].eq("Gunshot")
is_suicide = fe_original["Intended use of force (Developing)"].eq("Suicide")

fe_indices_shootings = fe_original.index[(is_gunshot & ~is_suicide).to_numpy()] #FE: Exclude Suicides
fe_indices_suicides = fe_original.index[is_suicide.to_numpy()]

# Both selections are taken from the cleaned `fe` before it is replaced by the shootings subset
fe_suicides, fe = fe.loc[fe_indices_suicides], fe.loc[fe_indices_shootings]

print(f"Suicides = {fe_suicides.shape[0]}")
print(f"Non-Suicide Fatal Shootings = {fe.shape[0]}")

Suicides = 1376
Non-Suicide Fatal Shootings = 6590


### 3.2 | Cleans up all datasets

In [33]:
# Drops columns 'agency_responsible_7' & 'agency_responsible_8' because only fe_suicides has 7-8 LEAs involved
extra_agency_cols = ["agency_responsible_7", "agency_responsible_8"]
fe = fe.drop(columns=extra_agency_cols)
fe_suicides = fe_suicides.drop(columns=extra_agency_cols)

In [34]:
# Finally, renumbers every dataframe's index from 0 (the old index is discarded)
fe, fe_suicides, mpv, wp = (
    frame.reset_index(drop=True) for frame in (fe, fe_suicides, mpv, wp)
)

In [35]:
# Previews the first 2 FE rows, now including the abbreviated comparison columns
fe.head(2)

Unnamed: 0,date,victim_name,victim_age,victim_gender,victim_race,city_of_death,state_of_death,URL,agency_responsible_1,agency_responsible_2,agency_responsible_3,agency_responsible_4,agency_responsible_5,agency_responsible_6,name_abr,agency_responsible_abr_1,agency_responsible_abr_2,agency_responsible_abr_3,agency_responsible_abr_4,agency_responsible_abr_5,agency_responsible_abr_6,city_of_death_abr
0,2020-12-31,David Randall Shephard,39,Male,White,Beaumont,TX,https://www.12newsnow.com/article/news/local/s...,Beaumont Police Department,,,,,,david_shephard,beaumont_police,,,,,,beaumont
1,2020-12-31,Rodolfo Caraballo Moreno,-1,Male,Hispanic,Miami,FL,https://www.local10.com/news/local/2021/01/01/...,Miami Police Department,,,,,,rodolfo_moreno,miami_police,,,,,,miami


In [36]:
# Records the row count of each database now (for the descriptive stats in Loop III statistics).
# This is done up front because, as the program progresses, matched entries in MPV and WP are
# deleted from their respective dataframes to avoid a double match.
fe_original_size, mpv_original_size, wp_original_size = (len(frame) for frame in (fe, mpv, wp))

print(f"Fatally shot civillians by database:\nFE = {fe_original_size}\nMPV = {mpv_original_size}\nWP = {wp_original_size}")

Fatally shot civillians by database:
FE = 6590
MPV = 6063
WP = 5946


# 4 | Loop Auxiliary Functions

Runs a sanity check and defines several auxiliary functions that are used throughout the loop. The loop is the code that merges the databases with each other to create DM-FS.

### 4.1 | Sanity Checks

In [37]:
# Check to make sure all databases contain the same number of distinct states
# NOTE(review): only the three counts are compared; the figure quoted in the message below is not itself checked
state_counts = [frame["state_of_death"].nunique(dropna=False) for frame in (fe, mpv, wp)]
assert state_counts[0] == state_counts[1] == state_counts[2]
print("Assertion passed. All databases contain 51 states (50 states + Puerto Rico)")

Assertion passed. All databases contain 51 states (50 states + Puerto Rico)


In [38]:
# Prints the current number of rows in each database
for label, frame in (("fe", fe), ("mpv", mpv), ("wp", wp)):
    print(f"Size of {label} = {frame.shape[0]}")

Size of fe = 6590
Size of mpv = 6063
Size of wp = 5946


FE is the most extensive of the 3 databases.

### 4.2 | Fuzzy Search Functions

In [39]:
def fuzzy_city_match(row_city, target_city = None, threshold = 90):
    """Return True when `row_city` is a fuzzy match for the target city.

    row_city    : city name to test.
    target_city : city to compare against. When omitted, the notebook-level variable
                  `city` is used, so existing single-argument calls behave as before.
    threshold   : minimum fuzz.ratio score (inclusive) that counts as a match.
    """
    if target_city is None:
        target_city = city # relies on `city` having been set by the calling code
    return fuzz.ratio(row_city, target_city) >= threshold

In [40]:
def get_candidate_rows_via_fuzzzy_name_search(db, db1_name, db2_name, threshold = 70):
    """Return the rows of `db` whose 'name_abr' fuzzily matches the current name and whose match was verified.

    db        : DataFrame of candidate rows; must have a 'name_abr' column in "<first>_<last>" form.
    db1_name  : label of the database the current name comes from (used in prompts and in the verification record).
    db2_name  : label of the database `db` comes from (used the same way).
    threshold : fuzz.ratio score used to decide whether first and last names match.

    Besides its parameters, the function reads these names from the enclosing notebook scope:
    `name` (the abbreviated name being searched for), `i` (recorded as db1_index),
    `USE_MANUAL_MATCHING_DATAFRAME` and `manual_fuzzy_match_verification_df`.

    Returns `db.loc[...]` restricted to the verified candidate indices.
    """

    # Ensures user only inputs a 'Y' or an 'N'
    def verify_fuzzy_match():
        while True:
            user_input = input("Confirm Fuzzy Match? Type 'Y' for Yes or 'N' for No: ").strip().upper()
            if user_input in ['Y', 'N']:
                return user_input
            else:
                print("Invalid input. Please enter 'Y' for Yes or 'N' for No.")

    candidate_indices = []
    
    for candidate_index, candidate_row in db.iterrows():
        
        # Split each row's name into first and last names
        # (the two-name unpacking requires exactly one "_" in each name, otherwise it raises ValueError)
        db1_first_name, db1_last_name = name.split("_")
        db2_first_name, db2_last_name = candidate_row["name_abr"].split("_")
    
        # Calculate fuzzy ratio for both first and last names
        first_name_score = fuzz.ratio(db1_first_name, db2_first_name)
        last_name_score = fuzz.ratio(db1_last_name, db2_last_name)
        # NOTE(review): the first name uses >= but the last name uses a strict > against the same threshold - confirm this asymmetry is intended
        fuzzy_match = (first_name_score >= threshold) and (last_name_score > threshold)

        # If fuzzy match, verify it
        if fuzzy_match:

            if USE_MANUAL_MATCHING_DATAFRAME:
                # Looks up the stored decision for this exact (db1, db2, db1_index, db2_index) combination.
                # .iloc[0] raises IndexError when no such record exists.
                manual_verification_record = manual_fuzzy_match_verification_df.query(f"db1 == '{db1_name}' & db2 == '{db2_name}' & db1_index == {i} & db2_index == {candidate_index}")
                match_verified = manual_verification_record["match_verified"].iloc[0]
                if match_verified:
                    candidate_indices.append(candidate_index)
            else:
                # Asks the user to confirm or reject the match interactively
                print(f"VERIFY NAME MATCH: {db1_name} = [{name}] & {db2_name} = [{candidate_row['name_abr']}]")
                user_input = verify_fuzzy_match()

                # Adds result of our verification to our DF
                next_index = len(manual_fuzzy_match_verification_df)
                manual_fuzzy_match_verification_df.loc[next_index] = {"db1" : db1_name, "db2" : db2_name, "db1_index" : i, "db2_index" : candidate_index,
                                                                      "db1_name" : name, "db2_name" : candidate_row['name_abr'] , "match_verified" : (user_input == "Y")}
                
                if user_input == 'Y':
                    candidate_indices.append(candidate_index)

    return db.loc[candidate_indices]

In [41]:
# Uses a fuzzy comparison to decide whether two collections of agency names really disagree

def verify_agency_discrepency(agencies_1, agencies_2):
    """Return True when the two agency collections disagree, False otherwise.

    A different number of agencies always counts as a discrepancy. When the counts are
    equal, every agency in `agencies_1` must have at least one agency in `agencies_2`
    with a fuzz.ratio score of 60 or more; otherwise a discrepancy is reported.
    """
    threshold = 60
    agency_list_1, agency_list_2 = list(agencies_1), list(agencies_2)

    # An additional (or one less) agency MUST be a discrepancy
    if len(agency_list_1) != len(agency_list_2):
        return True

    def has_fuzzy_counterpart(agency):
        # True when some agency in the second list is similar enough to this one
        return any(fuzz.ratio(agency, other) >= threshold for other in agency_list_2)

    # A discrepancy exists as soon as one agency in the first list has no counterpart
    return not all(has_fuzzy_counterpart(agency) for agency in agency_list_1)

### 4.3 | Discrepancy Check Functions

In [42]:
def check_for_date_discrepency(master_df, i, fe, mpv, wp):
    """Compare the date of death reported by up to three databases and record any disagreement.

    master_df : the merged DataFrame being built; it is modified in place at row `i`.
    i         : index label of the row in master_df that the three source rows belong to.
    fe, mpv, wp : the matching row from each database (anything indexable by "date"),
                  or None when that database has no matching row.

    Effects on master_df.loc[i, ...]:
      * "date_discrepancy_days" always receives the largest difference in days between any
        two available dates (None when fewer than two dates are available).
      * Fewer than two dates available: nothing else is written, since there is nothing to compare.
      * Exactly two dates that differ, or three dates that all differ: "date" is set to
        "ERROR - <n> Databases Disagree" and each available date is stored in
        "date_fe" / "date_mpv" / "date_wp".
      * Three dates where exactly one database is the odd one out: that database's
        "date_<db>" column is set to True. If FE is the odd one out, "date" is also replaced
        by the MPV date (master_df.loc[i, "date"] is expected to hold FE's date until then).
    """

    # Extracts date of victim's death (None when the database has no matching row)
    fe_date = None if fe is None else fe["date"]
    mpv_date = None if mpv is None else mpv["date"]
    wp_date = None if wp is None else wp["date"]
    dates_by_source = {"fe": fe_date, "mpv": mpv_date, "wp": wp_date}
    valid_dates = [date for date in dates_by_source.values() if date is not None]

    # Flags the row as an unresolved disagreement and lists every available date
    def flag_disagreement():
        master_df.loc[i, "date"] = f"ERROR - {len(valid_dates)} Databases Disagree"
        for source, source_date in dates_by_source.items():
            if source_date is not None:
                master_df.loc[i, f"date_{source}"] = source_date

    # Calculates and records the max date discrepancy over every pair of dates. Most common case will be '0'
    master_df.loc[i, "date_discrepancy_days"] = max(
        (abs((first - second).days)
         for position, first in enumerate(valid_dates)
         for second in valid_dates[position + 1:]),
        default=None,
    )

    # With zero or one date there is nothing to compare, so no disagreement can be reported
    if len(valid_dates) < 2:
        return

    # Handles only 2 databases: either they agree, or the row is flagged
    if len(valid_dates) == 2:
        if not (valid_dates[0] == valid_dates[1]):
            flag_disagreement()
        return

    # Handles all 3 databases
    # If no discrepancy is found, no need for further checks
    if fe_date == mpv_date == wp_date:
        return

    # If one database is the odd one out, identifies it
    if fe_date == mpv_date and fe_date != wp_date: # WP Wrong
        master_df.loc[i, "date_wp"] = True

    elif fe_date == wp_date and fe_date != mpv_date: # MPV Wrong
        master_df.loc[i, "date_mpv"] = True

    elif mpv_date == wp_date and fe_date != mpv_date: # FE Wrong
        master_df.loc[i, "date_fe"] = True
        master_df.loc[i, "date"] = mpv["date"] # The recorded date is FE's, so if FE is wrong it needs to be corrected

    # If all 3 databases can't agree, flag that
    else:
        flag_disagreement()

In [43]:
# Aux function of check_for_agency_discrepencies

def get_agencies_responsible(fe, mpv, wp, use_abbreviated = True):
    """Collect the responsible agencies of each database row into a cleaned set.

    fe, mpv, wp     : one row per database (indexable by column name), or None.
    use_abbreviated : True reads the 'agency_responsible_abr_N' columns, False reads
                      'agency_responsible_N' (N = 1..6).

    Returns three sets (fe, mpv, wp). None, NaN and the string 'none' (any casing) are
    left out; a row that is None, or that lacks a column, contributes nothing for it.
    """

    # Decides whether a cell holds a real agency rather than a missing-value marker
    def is_real_agency(item):
        if item is None:
            return False
        if isinstance(item, float) and math.isnan(item):
            return False
        if isinstance(item, str) and item.lower() == 'none':
            return False
        return True

    # Names of the (up to) six agency columns to read
    column_prefix = "agency_responsible_abr_" if use_abbreviated else "agency_responsible_"
    agency_columns = [f"{column_prefix}{x}" for x in range(1, 7)]

    # Gathers the real agencies found in one row
    def collect(row):
        if row is None:
            return set()
        return {row[column] for column in agency_columns if column in row and is_real_agency(row[column])}

    return collect(fe), collect(mpv), collect(wp)

In [44]:
def check_for_agency_discrepency(master_df, i, fe, mpv, wp):
    """Compare the responsible agencies across databases and record disagreements in master_df.

    Parameters
    ----------
    master_df : DataFrame
        Modified in place at row label ``i``.
    i : row label in ``master_df`` for the victim being checked.
    fe, mpv, wp : row or None
        The matching row from each database; ``None`` when that database has no match.

    Behaviour
    ---------
    Agencies are compared via their abbreviated names. Every disagreement is
    first passed through ``verify_agency_discrepency`` (defined elsewhere in the
    notebook; presumably it returns True only for a genuine disagreement -
    confirm against its definition) before anything is written.

    * Two databases available: a verified disagreement is flagged for manual correction.
    * Three databases available: the odd one out gets ``agency_responsible_<db> = True``;
      if FE is the odd one out, the agency columns are overwritten with MPV's values.
      If no two databases agree, the row is flagged for manual correction.
    """

    # Renders a set of agencies as plain comma-separated text (no braces or quotes)
    def stringify(agency_set):
        text = f"{agency_set}"
        for unwanted in ("{", "}", "\"", "'"):
            text = text.replace(unwanted, "")
        return text

    # Blanks the agency columns, writes the error marker, and lists what each database claimed
    def flag_for_manual_review(error_text = "ERROR - 2 Databases Disagree"):
        for slot in range(1, 7):
            master_df.loc[i, f"agency_responsible_{slot}"] = ""
            master_df.loc[i, f"agency_responsible_abr_{slot}"] = ""

        master_df.loc[i, "agency_responsible_1"] = error_text

        # The unabbreviated names are what the human reviewer will read
        full_names = dict(zip(("fe", "mpv", "wp"), get_agencies_responsible(fe, mpv, wp, False)))
        for db_name, db_row in (("fe", fe), ("mpv", mpv), ("wp", wp)):
            if db_row is not None:
                master_df.loc[i, f"agency_responsible_{db_name}"] = stringify(full_names[db_name])

    fe_agencies, mpv_agencies, wp_agencies = get_agencies_responsible(fe, mpv, wp, True)

    # ---- Only two databases returned a row ----
    if fe is None or mpv is None or wp is None:
        available_pair_agrees = (
            (wp is None and fe_agencies == mpv_agencies)
            or (mpv is None and fe_agencies == wp_agencies)
            or (fe is None and wp_agencies == mpv_agencies)
        )
        if available_pair_agrees:
            return

        # Picks the two sets that are actually available
        if fe is None:
            first, second = mpv_agencies, wp_agencies
        else:
            first = fe_agencies
            second = mpv_agencies if mpv is not None else wp_agencies

        if verify_agency_discrepency(first, second):
            flag_for_manual_review("ERROR - 2 Databases Disagree")
        return

    # ---- All three databases returned a row ----
    assert(mpv_agencies is not None and wp_agencies is not None and fe_agencies is not None)
    if fe_agencies == mpv_agencies == wp_agencies:
        return

    # Not all three are equal, so whichever pair agrees identifies the third as the odd one out
    if fe_agencies == mpv_agencies:  # WP wrong
        if verify_agency_discrepency(fe_agencies, wp_agencies):
            master_df.loc[i, f"agency_responsible_wp"] = True

    elif fe_agencies == wp_agencies:  # MPV wrong
        if verify_agency_discrepency(fe_agencies, mpv_agencies):
            master_df.loc[i, f"agency_responsible_mpv"] = True

    elif mpv_agencies == wp_agencies:  # FE wrong
        if verify_agency_discrepency(mpv_agencies, fe_agencies):
            master_df.loc[i, f"agency_responsible_fe"] = True

            # master_df holds FE's agencies, so replace them with the majority (MPV) values
            for slot in range(1, 7):
                master_df.loc[i, f"agency_responsible_{slot}"] = mpv[f"agency_responsible_{slot}"]
                master_df.loc[i, f"agency_responsible_abr_{slot}"] = mpv[f"agency_responsible_abr_{slot}"]

    else:  # No two databases agree
        flag_for_manual_review("ERROR - 3 Databases Disagree")

In [45]:
# Takes in a row from each of the 3 databases and checks whether there is a discrepancy for a specific variable in that row (e.g., victim_age).
# First, it extracts the value from each of the 3 databases and stores it in list_of_vals. E.g., it takes age from FE, MPV, and WP.
# Second, it asks how many unique values there are in this list.
# If there are 3 unique values, none of the databases agree, so it runs a procedure that enables manual correction later
# If there are 2 unique values, 2 of the 3 databases agree, so it uses a simple majority-voting system to correct the abnormality
# Finally, if there is only 1 unique value, then there are NO DISCREPANCIES, hence nothing needs to be corrected
# 'master_df' is the dataframe that (i) stores the discrepancies and (ii) corrects them
# Thus, if there is only 1 unique value, that dataframe isn't modified; whereas if there are 2 or more, the discrepancy is recorded in that dataframe

def check_for_discrepency(master_df, i, fe_row, mpv_row, wp_row,
                         discrepency_name_dfs = "victim_age", discrepency_name_master_df = "age"):
    """Compare one variable across the available database rows and record any disagreement.

    Parameters
    ----------
    master_df : DataFrame
        Modified in place at row label ``i``.
    i : row label in ``master_df`` for the victim being checked.
    fe_row, mpv_row, wp_row : row or None
        The matching row from each database; ``None`` when that database has no match.
    discrepency_name_dfs : str
        Column holding the variable in the source rows and in ``master_df`` (e.g. ``"victim_age"``).
    discrepency_name_master_df : str
        Prefix of the flag columns in ``master_df`` (e.g. ``"age"`` -> ``age_fe``, ``age_mpv``, ``age_wp``).

    Behaviour
    ---------
    * 1 unique value: nothing is written.
    * 3 unique values: each database's value is stored in its flag column and the
      variable is replaced with ``"ERROR - 3 Databases Disagree"``.
    * 2 unique values, 3 rows available: majority vote. The odd database's flag is
      set to True and the variable is set to the majority value.
    * 2 unique values, 2 rows available: both values are stored in their flag
      columns and the variable is replaced with ``"ERROR - 2 Databases Disagree"``.
    """
    # Value reported by each database that actually returned a row (order: FE, MPV, WP)
    reported = {
        db_name: row[discrepency_name_dfs]
        for db_name, row in (("fe", fe_row), ("mpv", mpv_row), ("wp", wp_row))
        if row is not None
    }

    n_unique = len( np.unique( list(reported.values()) ) )

    # No discrepancy: leave master_df untouched
    if n_unique <= 1:
        return

    # Every database lists a different value: store all of them for manual correction
    if n_unique == 3:
        for db_name, value in reported.items():
            master_df.loc[i, f"{discrepency_name_master_df}_{db_name}"] = value
        master_df.loc[i, discrepency_name_dfs] = "ERROR - 3 Databases Disagree"
        return

    # From here on there are exactly 2 unique values
    if len(reported) == 3:
        fe_val, mpv_val, wp_val = reported["fe"], reported["mpv"], reported["wp"]

        # All 3 databases returned a hit, so go with the majority
        if fe_val == mpv_val:  # WP is the odd one out
            master_df.loc[i, f"{discrepency_name_master_df}_wp"] = True
            master_df.loc[i, discrepency_name_dfs] = fe_val
        elif fe_val == wp_val:  # MPV is the odd one out
            master_df.loc[i, f"{discrepency_name_master_df}_mpv"] = True
            master_df.loc[i, discrepency_name_dfs] = fe_val
        elif mpv_val == wp_val:  # FE is the odd one out
            master_df.loc[i, f"{discrepency_name_master_df}_fe"] = True
            master_df.loc[i, discrepency_name_dfs] = mpv_val
        return

    # Only two databases returned a hit: there is no majority, so record BOTH values.
    # (Previously WP was handled in an `elif` after MPV, so the WP value was never
    # recorded when the two available databases were MPV and WP.)
    for db_name, value in reported.items():
        master_df.loc[i, f"{discrepency_name_master_df}_{db_name}"] = value

    # The true value is unknown, so mark it for manual correction
    master_df.loc[i, discrepency_name_dfs] = "ERROR - 2 Databases Disagree"

In [46]:
# Burns through all 3 of the above check_for_discrepency methods
def check_for_all_discrepencies(master_df, i, fe_row, mpv_row, wp_row):
    """Run every discrepancy check for one victim, writing the results into master_df row ``i``.

    The simple per-variable checks run first (city, age, gender, race), followed
    by the agency check and finally the date check. Any of ``fe_row``,
    ``mpv_row`` and ``wp_row`` is passed through unchanged to each check.
    """
    # (column name in the source rows, prefix of the flag columns in master_df)
    simple_checks = (
        ("city_of_death", "city"),
        ("victim_age", "age"),
        ("victim_gender", "gender"),
        ("victim_race", "race"),
    )
    for source_column, flag_prefix in simple_checks:
        check_for_discrepency(master_df, i, fe_row, mpv_row, wp_row,
                              discrepency_name_dfs = source_column,
                              discrepency_name_master_df = flag_prefix)

    # Agency
    check_for_agency_discrepency(master_df, i, fe_row, mpv_row, wp_row)

    # Date (minor) (if it was major, a match wouldn't be found)
    check_for_date_discrepency(master_df, i, fe_row, mpv_row, wp_row)


### 4.4 | Other Functions

In [47]:
# Flags for manual review when there are multiple people (i) with similar names who (ii) died on similar dates (less than 3 days apart) (iii) in the same state 
# OR this occurs when the date_delta between the proposed match is more than 3 days apart
# In short, prints out the proposed match in DB1 and DB2, and the author manually flags as a match or not
# Output: (i) records decision in dataframe and (ii) records the candidate row (if match) or None (if no match)
 
def manually_verify_match(candidate_row, db2_index, date_delta, db1_name, db2_name):
    """Ask the user whether a proposed cross-database match is genuine, and log the answer.

    Parameters
    ----------
    candidate_row : row
        Proposed match from the second database; must provide ``name_abr``,
        ``city_of_death`` and ``date``.
    db2_index : index label of ``candidate_row`` in the second database.
    date_delta : number of days between the two death dates (printed and logged as given).
    db1_name, db2_name : str
        Names of the database being matched from and matched against.

    Returns
    -------
    ``candidate_row`` if the user confirms the match, otherwise ``None``.

    Notes
    -----
    Reads the notebook-level variables ``name``, ``city``, ``state``, ``date`` and
    ``i`` (set by the main loops for the row currently being processed) and appends
    one record to the notebook-level ``manual_match_verification_df``.
    """

    # Keeps prompting until the answer is exactly 'Y' or 'N' (case/whitespace-insensitive)
    def get_and_verify_user_input():
        while True:
            answer = input("Do these match? Type 'Y' for Yes or 'N' for No: ").strip().upper()
            if answer in ('Y', 'N'):
                break
            print("Invalid input. Please enter 'Y' for Yes or 'N' for No.")
        return answer

    # Shows both sides of the proposed match
    print(f"DB = {db1_name} & {db2_name} | Date Delta = {date_delta} | Names = {name} & {candidate_row['name_abr']} | City = {city} & {candidate_row['city_of_death']} | State = {state} | Death Date = {date.strftime('%Y-%m-%d')} & {candidate_row['date'].strftime('%Y-%m-%d')}")
    is_match = get_and_verify_user_input() != "N"

    # Logs the decision as the next row of manual_match_verification_df
    next_index = len(manual_match_verification_df)
    print(f"Adding result to manual_match_verification_df @ index [{next_index}]")
    decision_record = {
        "db1" : db1_name, "db2" : db2_name, "date_delta" : date_delta,
        "date_db1" : date, "date_db2" : candidate_row['date'],
        "name_db1" : name, "name_db2" : candidate_row['name_abr'],
        "city_db1" : city, "city_db2" : candidate_row['city_of_death'],
        "state_db1" : state, "state_db2" : state,  # candidates are pre-filtered to the same state
        "db1_index" : i, "db2_index" : db2_index, "match_verified" : is_match,
    }
    manual_match_verification_df.loc[next_index] = decision_record

    return candidate_row if is_match else None
    
            


In [48]:
# Called from initialize_row_to_search
# 

def init_candidate_row_from_multiple_viable_candidates(rows, low_date_delta_indices, low_date_delta_deltas, db1_name, db2_name):
    """Let the user choose at most one match among several close-in-date candidates.

    Parameters
    ----------
    rows : DataFrame
        All candidate rows from the second database.
    low_date_delta_indices : list
        Index labels (into ``rows``) of the candidates to review.
    low_date_delta_deltas : dict
        Maps each of those index labels to its date difference in days.
    db1_name, db2_name : str
        Names of the database being matched from and matched against.

    Returns
    -------
    The single candidate row the user accepted, or ``None`` if every candidate was rejected.

    Raises
    ------
    AssertionError
        If the user accepted more than one candidate. Raised explicitly (instead
        of ``assert False``) so the check is not stripped when Python runs with ``-O``.

    Notes
    -----
    Reads the notebook-level variables ``name``, ``city``, ``state`` and ``date``
    describing the row currently being processed.
    """
    # Selects the candidates once; they are listed first and then reviewed one by one
    candidates = rows.loc[low_date_delta_indices]

    # Displays all possible candidates
    print("Multiple candidates detected. First, we will display all possible candidate matches below: " )
    for row_index, candidate_row in candidates.iterrows():
        print(f"{row_index} | Delta Date = {low_date_delta_deltas[row_index]} | Names = {name} & {candidate_row['name_abr']} | City = {city} & {candidate_row['city_of_death']} | State = {state} |  Death Date = {date.strftime('%Y-%m-%d')} & {candidate_row['date'].strftime('%Y-%m-%d')}")

    # Loops through each candidate, and the user decides whether to keep it
    print(f"{'*'*50}")
    print("Second, we will go through the candidates one by one. Select which one (if any) you'd like to match, but only select ZERO or ONE")
    accepted_rows = []
    for row_index, candidate_row in candidates.iterrows():
        decision = manually_verify_match(candidate_row, row_index, low_date_delta_deltas[row_index], db1_name, db2_name)
        if decision is not None:
            accepted_rows.append(decision)

    # At most one candidate may be accepted
    if len(accepted_rows) > 1:
        message = "ERROR. Candidate_rows > 1. However only 0 or 1 candidate rows should be selected."
        print(message)
        raise AssertionError(message)

    return accepted_rows[0] if accepted_rows else None

In [49]:
# Called from the main loops. Reduces the candidate rows found in the second database (e.g., mpv_rows or wp_rows) to a single row that can be compared against the row being processed
# If there are only 0 or 1 rows, the function simply returns None or that row
# However, if there are 2 or more rows (i.e., 2 or more possible 'hits'), it selects the hit with the closest date
# Keep in mind that each hit already matches the (i) state and (ii) name of the fatal shooting victim, so all hits are viable candidates.

def initialize_row_to_search(rows, fe_date, db1_name, db2_name):
    """Reduce the candidate rows from the second database to one matched row (or None).

    Parameters
    ----------
    rows : DataFrame
        Candidate rows from the second database; each needs a ``date`` column.
    fe_date : datetime-like
        Death date of the row being matched (supports ``fe_date - row_date`` giving a ``.days``).
    db1_name, db2_name : str
        Names of the database being matched from and matched against.

    Returns
    -------
    The chosen candidate row, or ``None`` when there is no candidate or the match was rejected.

    Raises
    ------
    IndexError
        In replay mode, when a manual decision is required but none is recorded in
        ``manual_match_verification_df`` for this (db1, db2, db1_index).

    Notes
    -----
    Uses the notebook-level variables ``search_results_dict`` (one counter is
    incremented per call), ``USE_MANUAL_MATCHING_DATAFRAME``,
    ``MIN_DATE_DELTA_TOLLERANCE``, ``manual_match_verification_df`` and ``i``.
    """

    # *********************************************
    # HANDLES A SIMPLE ROW SEARCH (0 OR 1 POSSIBLE MATCHES)
    # *********************************************
    if rows.empty:
        search_results_dict["no_results_counter"] += 1
        return None
    elif len(rows) == 1:
        search_results_dict["1_result_counter"] += 1
        return rows.iloc[0]

    # *************************************************************************************
    # HANDLES MULTIPLE ROW SEARCHES (>=2 MATCHES) BY FINDING MATCHES WITH CLOSEST DATE
    # *************************************************************************************
    search_results_dict["2_or_more_result_counter"] += 1
    min_date_delta_days, min_date_index = 999_999_999, -1
    low_date_delta_indices = []  # Index labels of every candidate whose date is within 3 days
    low_date_delta_deltas = {}   # Index label -> date delta (days) for those candidates

    for row_index, ind_row in rows.iterrows():
        date_delta = abs((fe_date - ind_row["date"]).days)

        # Several similar hits close in time are ambiguous, so remember each of them.
        # NOTE(review): this threshold is a literal 3 rather than MIN_DATE_DELTA_TOLLERANCE;
        # the two only agree while the constant stays at 3.
        if date_delta <= 3:
            low_date_delta_indices.append(row_index)
            low_date_delta_deltas[row_index] = date_delta

        # Tracks the hit whose death date is closest to the target
        if date_delta < min_date_delta_days:
            min_date_delta_days = date_delta
            min_date_index = row_index

    # *****************************************************************************************************************
    # MANUALLY CORRECT ODD RESULTS: (I) IF MANY SIMILAR DEATHS ON SIMILAR DAYS OR (II) IF DEATH DATES > 3 DAYS APART
    # ******************************************************************************************************************
    has_multiple_close_candidates = len( low_date_delta_indices ) > 1
    closest_is_too_far = min_date_delta_days > MIN_DATE_DELTA_TOLLERANCE

    # Replay mode: reuse the decision(s) recorded during the original manual review
    if USE_MANUAL_MATCHING_DATAFRAME and (has_multiple_close_candidates or closest_is_too_far):
        recorded = manual_match_verification_df.query(f"db1 == '{db1_name}' & db2 == '{db2_name}' & db1_index == {i}")
        if recorded.empty:
            raise IndexError(f"No manual match decision recorded for db1='{db1_name}', db2='{db2_name}', db1_index={i}")

        # A multi-candidate review logs one record PER candidate, so the accepted candidate
        # is not necessarily the first record. Pick the verified one, if any.
        verified = recorded[ recorded["match_verified"].astype(bool) ]
        if verified.empty:
            return None
        return rows.loc[ verified["db2_index"].iloc[0] ]

    # Interactive mode (or nothing to review): several close candidates -> the user picks zero or one
    if has_multiple_close_candidates:
        return init_candidate_row_from_multiple_viable_candidates(rows, low_date_delta_indices, low_date_delta_deltas, db1_name, db2_name)

    candidate_row = rows.loc[min_date_index]

    # The closest candidate is beyond the tolerance: the user confirms or rejects it
    if closest_is_too_far:
        return manually_verify_match(candidate_row, min_date_index, min_date_delta_days, db1_name, db2_name)

    return candidate_row

In [50]:
# Every kind of discrepancy that is tracked per database.
# "omission" and "erroneous_inclusion" record whether a database missed a victim or
# listed one it should not have; the remaining entries name the variable that disagreed.
DISCREPENCY_LIST = [
    "omission",
    "erroneous_inclusion",
    "date",
    "city",
    "gender",
    "age",
    "race",
    "agency_responsible",
]
DB_NAMES = ["fe", "mpv", "wp"]

def add_discrepencies_to_df(df, filler_value = None):
    """Add one ``<discrepancy>_<db>`` entry per discrepancy type and database, in place.

    ``df`` can be anything supporting ``df[key] = value`` (a DataFrame gains
    columns, a Series gains entries). Every new entry is set to ``filler_value``.
    Nothing is returned.
    """
    flag_names = [f"{kind}_{db}" for kind in DISCREPENCY_LIST for db in DB_NAMES]
    for flag_name in flag_names:
        df[flag_name] = filler_value

# 5 | DM-FS Loop

This is the loop that merges all three databases together to create DM-FS. It works via the following logic.

First, DM-FS is a copy of Fatal Encounters because it's the most extensive of the three databases (see section 4.1). It then goes row-by-row and checks to see if each FE fatal shooting victim appears in MPV and WP. If it does appear, it checks for discrepancies.

Second, the remaining rows in MPV - those that were NOT matched to a FE row - are added one-at-a-time to DM-FS. Each time an MPV row is added, it's checked with WP, and if a match is found, it also checks that row for discrepancies between the two databases. In addition to checking for a match in WP, it also checks for a match in FE_Suicides to see if this fatal shooting was actually a suicide, which we flag as an erroneous inclusion.

Third, the only rows that are not matched / added to DM-FS at this point are those unmatched rows in Washington Post. We'll add each row to DM-FS. Just like MPV, however, we'll also check to see if each victim was a suicide by looking at FE_Suicides. If a suicide (match) was found, then we'll flag it as an erroneous inclusion.

Thus, in doing so, we'll include data from all three databases - FE, MPV, and WP - into DM-FS, checking each database against each other when possible.

### 5.1 | Initialize Key Variables

In [51]:
# When True, the author's recorded manual decisions are loaded from CSV and replayed,
# so the merge runs without prompting. When False, empty decision logs are created
# and the user is prompted to resolve every match conflict while the loops run.
USE_MANUAL_MATCHING_DATAFRAME = True
MATCH_DIR = "res/1.1 - DM-FS Loop Manual Matches/"

if not USE_MANUAL_MATCHING_DATAFRAME:
    # Fresh, empty logs that the matching functions fill in during the loops
    print("Creating a new manual_matches CSV file. This means that, every time there is a match conflict, the user will need to manually specify whether there's a match.")
    manual_match_verification_df = pd.DataFrame(
        columns=[
            "db1", "db2", "date_delta", "date_db1", "date_db2", "name_db1", "name_db2",
            "city_db1", "city_db2", "state_db1", "state_db2",
            "db1_index", "db2_index", "match_verified",
        ]
    )
    manual_fuzzy_match_verification_df = pd.DataFrame(
        columns=["db1", "db2", "db1_index", "db2_index", "db1_name", "db2_name", "match_verified"]
    )
else:
    # Previously recorded decisions
    print("Using the manual_matches CSV file to re-create how the author matched conflicting databases. All matching will be automated.")
    manual_match_verification_df = pd.read_csv(f"{MATCH_DIR}manual_matches.csv")
    manual_fuzzy_match_verification_df = pd.read_csv(f"{MATCH_DIR}fuzzy_search_name_matches.csv")

Using the manual_matches CSV file to re-create how the author matched conflicting databases. All matching will be automated.


In [52]:
# Counts how many candidate searches returned 0, exactly 1, or 2+ rows
search_results_dict = dict.fromkeys(("no_results_counter", "1_result_counter", "2_or_more_result_counter"), 0)

# Candidate matches whose dates are more than this many days apart are sent to manual review.
# Do not change this if USE_MANUAL_MATCHING_DATAFRAME is set to True.
MIN_DATE_DELTA_TOLLERANCE = 3

# Gives each source its own URL column name so both can live side by side in DM-FS
fe = fe.rename(columns={"URL": "URL_FE"})
mpv = mpv.rename(columns={"URL": "URL_MPV"})

# master_df is DM-FS: it starts empty with FE's columns plus the MPV URL column...
master_columns = list(fe.columns) + ["URL_MPV"]
master_df = pd.DataFrame( columns=master_columns ).rename(columns={"URL": "URL_FE"})

# ...then gains the per-database discrepancy flags and the date-difference column
add_discrepencies_to_df(master_df)
master_df["date_discrepancy_days"] = None

# Moves the two URL columns to the far right
url_columns = ["URL_FE", "URL_MPV"]
non_url_columns = [col for col in master_df.columns if col not in url_columns]
master_df = master_df[ non_url_columns + url_columns ]
master_df.head(1)

Unnamed: 0,date,victim_name,victim_age,victim_gender,victim_race,city_of_death,state_of_death,agency_responsible_1,agency_responsible_2,agency_responsible_3,agency_responsible_4,agency_responsible_5,agency_responsible_6,name_abr,agency_responsible_abr_1,agency_responsible_abr_2,agency_responsible_abr_3,agency_responsible_abr_4,agency_responsible_abr_5,agency_responsible_abr_6,city_of_death_abr,omission_fe,omission_mpv,omission_wp,erroneous_inclusion_fe,erroneous_inclusion_mpv,erroneous_inclusion_wp,date_fe,date_mpv,date_wp,city_fe,city_mpv,city_wp,gender_fe,gender_mpv,gender_wp,age_fe,age_mpv,age_wp,race_fe,race_mpv,race_wp,agency_responsible_fe,agency_responsible_mpv,agency_responsible_wp,date_discrepancy_days,URL_FE,URL_MPV


### 5.2 | Loop I: Add FE to DM-FS, Checking Against MPV and WP

In [53]:
# Row counts before Loop I removes matched rows; used afterwards to compute match rates
original_mpv_size, original_wp_size = len(mpv), len(wp)

In [54]:
# Loop I: appends every FE row to master_df and looks for the same victim in MPV and WP.
# master_df starts empty and rows are appended with ignore_index=True, so FE row i lands at master_df row i.
# NOTE: i, name, state, city and date are notebook-level variables on purpose - manually_verify_match,
# init_candidate_row_from_multiple_viable_candidates and initialize_row_to_search read them directly.
# Matched rows are dropped from mpv / wp, so after this loop those frames only hold victims not found in FE.
for i in range(0, len(fe) ) :
    print(f"Processing index {i} out of {len(fe)} ({round(i/len(fe) * 100, 2)}%)")
    fe_row = fe.iloc[i].copy()
    name, state, city, date = fe_row["name_abr"], fe_row["state_of_death"], fe_row["city_of_death_abr"], fe_row["date"]

    # Adds discrepancy flags (all False) to our FE row before adding it to master_df
    add_discrepencies_to_df(fe_row, filler_value = False)
    master_df = pd.concat([master_df,  pd.DataFrame([fe_row]) ], ignore_index=True) # Note that only dataframes (fe_row), not series, can be concated 

    # *************************************
    # Checks for a match in other DBs
    # *************************************

    # Name & State match (the city condition is commented out, so city is NOT part of the query)
    # NOTE(review): the values are spliced into the query string inside single quotes, so a name_abr
    # containing an apostrophe would break the query - presumably name_abr is already stripped of them; confirm.
    mpv_rows = mpv.query(f"name_abr == '{name}' & state_of_death == '{state}'")# & city_of_death_abr == '{city}'")
    wp_rows = wp.query(f"name_abr == '{name}' & state_of_death == '{state}'")# & city_of_death_abr == '{city}'")

    # If there is no exact match, try to salvage it with a fuzzy name search restricted to the same state
    if wp_rows.empty:      
        wp_rows = get_candidate_rows_via_fuzzzy_name_search (wp[ (wp['state_of_death'] == state) ], "fe", "wp")
    if mpv_rows.empty:      
        mpv_rows = get_candidate_rows_via_fuzzzy_name_search (mpv[ (mpv['state_of_death'] == state) ], "fe", "mpv")

    # *************************************
    # Prepares match & lack of match
    # *************************************

    # Reduces the candidates to a single row per database. Returns None if there is no (accepted) match.
    mpv_row = initialize_row_to_search(mpv_rows, fe_row["date"], "fe", "mpv")
    wp_row = initialize_row_to_search(wp_rows, fe_row["date"], "fe", "wp")
    
    # A database without a match is flagged as having omitted this victim
    if mpv_row is None:
        master_df.loc[i, "omission_mpv"] = True
    if wp_row is None:
        master_df.loc[i, "omission_wp"] = True

    # Exits the iteration if no matches were found: there is nothing to compare FE against
    if mpv_row is None and wp_row is None: 
        master_df.loc[i, "date_discrepancy_days"] = 0
        continue

    # Now that mpv_row is initialized, saves MPV URL
    if mpv_row is not None:
        master_df.loc[i, "URL_MPV"] = mpv_row["URL_MPV"]
        
    # *************************************************************************
    # Handles Match for either (i) all 3 databases or (ii) just 2 databases
    # *************************************************************************
    
    # If a match was found in all 3 databases...
    if not master_df.loc[i, "omission_mpv"] and not master_df.loc[i, "omission_wp"]:

        # Check for discrepancies in all columns
        check_for_all_discrepencies(master_df, i, fe_row, mpv_row, wp_row)

        # Drops the used rows from mpv and wp by index label (i.e., '.name') so they cannot be matched again
        mpv = mpv.drop(mpv_row.name)
        wp = wp.drop(wp_row.name)

    # If a match was found in 2 of the 3 databases
    # (always true here: the "no match at all" case already hit `continue` above)
    elif not master_df.loc[i, "omission_mpv"] or not master_df.loc[i, "omission_wp"]:
        
        # If MPV has a hit
        if not master_df.loc[i, "omission_mpv"]:
            check_for_all_discrepencies(master_df, i, fe_row, mpv_row, None)
            mpv = mpv.drop(mpv_row.name)
            
        # If WP has a hit
        elif not master_df.loc[i, "omission_wp"]:
            check_for_all_discrepencies(master_df, i, fe_row, None, wp_row)
            wp = wp.drop(wp_row.name)

Processing index 0 out of 6590 (0.0%)
Processing index 1 out of 6590 (0.02%)
Processing index 2 out of 6590 (0.03%)
Processing index 3 out of 6590 (0.05%)
Processing index 4 out of 6590 (0.06%)
Processing index 5 out of 6590 (0.08%)
Processing index 6 out of 6590 (0.09%)
Processing index 7 out of 6590 (0.11%)
Processing index 8 out of 6590 (0.12%)
Processing index 9 out of 6590 (0.14%)
Processing index 10 out of 6590 (0.15%)
Processing index 11 out of 6590 (0.17%)
Processing index 12 out of 6590 (0.18%)
Processing index 13 out of 6590 (0.2%)
Processing index 14 out of 6590 (0.21%)
Processing index 15 out of 6590 (0.23%)
Processing index 16 out of 6590 (0.24%)
Processing index 17 out of 6590 (0.26%)
Processing index 18 out of 6590 (0.27%)
Processing index 19 out of 6590 (0.29%)
Processing index 20 out of 6590 (0.3%)
Processing index 21 out of 6590 (0.32%)
Processing index 22 out of 6590 (0.33%)
Processing index 23 out of 6590 (0.35%)
Processing index 24 out of 6590 (0.36%)
Processing in

#### Loop I Statistics & Sanity Check

In [55]:
# Loop I drops every matched row from mpv, so the shrinkage equals the number of matches
matches = original_mpv_size - len(mpv)
mpv_match_rate = round( matches/original_mpv_size * 100, 2)
print(f"MPV Matches = {matches} / {original_mpv_size} ({mpv_match_rate}%)")

MPV Matches = 5789 / 6063 (95.48%)


In [56]:
# Loop I drops every matched row from wp, so the shrinkage equals the number of matches
matches = original_wp_size - len(wp)
wp_match_rate = round( matches/original_wp_size * 100, 2)
print(f"WP Matches = {matches} / {original_wp_size} ({wp_match_rate}%)")

WP Matches = 5601 / 5946 (94.2%)


In [57]:
# Shows how many candidate searches so far returned 0, exactly 1, or 2+ rows
# (each call to initialize_row_to_search increments exactly one of these counters)
print("Search Statistics (i.e., comparing every row of FE to MP and WP)")
search_results_dict

Search Statistics (i.e., comparing every row of FE to MP and WP)


{'no_results_counter': 1698,
 '1_result_counter': 11222,
 '2_or_more_result_counter': 260}

In [58]:
# Sanity check: every FE row triggers two searches (one in MPV, one in WP), and each
# search increments exactly one counter, so the counters must add up to 2 * len(fe).

intended_queries = 2 * len(fe)
print(f"FE should have produced {intended_queries} queries: one row shuld have queried MPV and WP, and FE is {len(fe)} rows so {len(fe)}*2. Let's make sure this is correct.")
recorded_queries = sum(search_results_dict.values())
assert(recorded_queries == intended_queries)
print("Assertion Passed")

FE should have produced 13180 queries: one row shuld have queried MPV and WP, and FE is 6590 rows so 6590*2. Let's make sure this is correct.
Assertion Passed


### 5.3 | Loop II: Add MPV to DM-FS, checking against WP

In [59]:
# Pre-loop statistics for Loop II.
# wp has already lost the rows matched in Loop I; remember its size to measure Loop II's matches.
original_wp_size = len(wp)
# Each leftover MPV row triggers exactly one search, against WP.
# NOTE(review): the message below names MPV as the queried database, but Loop II queries WP.
intended_queries = len(mpv)
print(f"MPV is expected to make {intended_queries} queries. In other words, each row will query one database: MPV. MPV contains {len(mpv)} rows, so there will be {len(mpv)} * 1 total queries.")

MPV is expected to make 274 queries. In other words, each row will query one database: MPV. MPV contains 274 rows, so there will be 274 * 1 total queries.


In [60]:
# Loop II: appends the leftover MPV cases (those not matched to FE in Loop I) to the end of master_df
# and looks for the same victim in WP.
# NOTE: i, name, state, city and date are notebook-level variables on purpose - the matching helpers read them directly.

mpv_index = 0 # Positional index into mpv. The loop variable i is the master_df row the MPV row is appended to.

# Guards the progress percentage against a ZeroDivisionError when exactly one MPV row is left
progress_denominator = max(len(mpv) - 1, 1)

for i in range(len(master_df), len(master_df) + len(mpv) ) : # New rows go after the existing master_df rows

    # Displays progression
    print(f"Processing index {mpv_index} out of {len(mpv)-1} ({round(mpv_index/ progress_denominator * 100, 2)}%) to be added to master_df index {i}")

    # Extracts the base row
    mpv_row = mpv.iloc[mpv_index].copy() 
    mpv_index += 1 # So that, on next iteration, the next row of MPV can be used
    name, state, city, date = mpv_row["name_abr"], mpv_row["state_of_death"], mpv_row["city_of_death_abr"], mpv_row["date"]

    # Adds discrepancy flags (all False) to our row AND adds it to master_df
    add_discrepencies_to_df(mpv_row, filler_value = False)
    master_df = pd.concat([master_df,  pd.DataFrame([mpv_row]) ], ignore_index=True) 

    # This victim was not matched to any FE row in Loop I, so FE is flagged as having omitted it
    master_df.loc[i, "URL_MPV"] = mpv_row["URL_MPV"]
    master_df.loc[i, "omission_fe"] = True
    master_df.loc[i, "URL_FE"] = None
    
    # *************************************
    # Checks for a match in other DBs
    # *************************************

    # Name & State match
    wp_rows = wp.query(f"name_abr == '{name}' & state_of_death == '{state}'")

    # If there is no exact match, try to salvage it with a fuzzy name search restricted to the same state
    if wp_rows.empty:      
        wp_rows = get_candidate_rows_via_fuzzzy_name_search (wp[ (wp['state_of_death'] == state) ], "mpv", "wp")

    # *************************************
    # Handles match
    # *************************************
    
    # Reduces the WP candidates to a single row. Returns None if there is no (accepted) match.
    wp_row = initialize_row_to_search(wp_rows, mpv_row["date"], "mpv", "wp")
            
    # *************************************
    # Prepares match & lack of match
    # *************************************

    # Handles WP: either (i) flags an omission or (ii) records discrepancies
    if wp_row is None:
        master_df.loc[i, "omission_wp"] = True
        master_df.loc[i, "date_discrepancy_days"] = 0
    else: # WP has a hit
        check_for_all_discrepencies(master_df, i, None, mpv_row, wp_row)
        wp = wp.drop(wp_row.name) # So the row cannot be matched or added again

Processing index 0 out of 273 (0.0%) to be added to master_df index 6590
Processing index 1 out of 273 (0.37%) to be added to master_df index 6591
Processing index 2 out of 273 (0.73%) to be added to master_df index 6592
Processing index 3 out of 273 (1.1%) to be added to master_df index 6593
Processing index 4 out of 273 (1.47%) to be added to master_df index 6594
Processing index 5 out of 273 (1.83%) to be added to master_df index 6595
Processing index 6 out of 273 (2.2%) to be added to master_df index 6596
Processing index 7 out of 273 (2.56%) to be added to master_df index 6597
Processing index 8 out of 273 (2.93%) to be added to master_df index 6598
Processing index 9 out of 273 (3.3%) to be added to master_df index 6599
Processing index 10 out of 273 (3.66%) to be added to master_df index 6600
Processing index 11 out of 273 (4.03%) to be added to master_df index 6601
Processing index 12 out of 273 (4.4%) to be added to master_df index 6602
Processing index 13 out of 273 (4.76%) t

#### Loop II Statistics

In [61]:
# Reports how many rows have left wp since original_wp_size was recorded, and how many are still in it
wp_rows_remaining = len(wp)
wp_rows_added = original_wp_size - wp_rows_remaining
print(f"WP: Added {wp_rows_added} to master DF. Have {wp_rows_remaining} rows remaining")

WP: Added 104 to master DF. Have 241 rows remaining


In [62]:
# Displays the running search counters (no result / one result / two or more results) at this point in the notebook
print("Search Statistics: FE, MPV Complete")
search_results_dict

Search Statistics: FE, MPV Complete


{'no_results_counter': 1862,
 '1_result_counter': 11321,
 '2_or_more_result_counter': 271}

### 5.4 | Loop III: Add unmatched WP victims to DM-FS

In [63]:
wp_index = 0 # Positional index into wp. Kept separate from i, which is the (much larger) master_df index
starting_index = len(master_df)
wp_progress_denominator = max(len(wp) - 1, 1) # Avoids a ZeroDivisionError in the progress display when exactly one WP row remains

for i in range(starting_index, len(wp)+starting_index) : # Adds the remaining WP rows at the very end of master_df (i.e., why it starts at len(master_df))
    
    # Displays progression
    print(f"Processing index {wp_index} out of {len(wp)-1} ({round(wp_index / wp_progress_denominator * 100, 2)}%) to be added to master_df index {i}")

    # Extracts row
    wp_row = wp.iloc[wp_index].copy()
    wp_index += 1 # So that, on the next iteration, the next row of WP is used
    # NOTE(review): these four names are not used again in this cell. They are kept because helper
    # functions elsewhere in the notebook may read them as globals -- confirm before removing.
    name, state, city, date = wp_row["name_abr"], wp_row["state_of_death"], wp_row["city_of_death_abr"], wp_row["date"]

    # Add row to master DF. ignore_index=True makes the new row land on label i
    add_discrepencies_to_df(wp_row, filler_value = False)
    master_df = pd.concat([master_df,  pd.DataFrame([wp_row]) ], ignore_index=True) 
    master_df.loc[i, "URL_MPV"], master_df.loc[i, "URL_FE"] = None, None
    master_df.loc[i, "date_discrepancy_days"] = 0

    # Flags the row as an omission from the other two databases (FE and MPV)
    master_df.loc[i, "omission_fe"] = True
    master_df.loc[i, "omission_mpv"] = True

Processing index 0 out of 240 (0.0%) to be added to master_df index 6864
Processing index 1 out of 240 (0.42%) to be added to master_df index 6865
Processing index 2 out of 240 (0.83%) to be added to master_df index 6866
Processing index 3 out of 240 (1.25%) to be added to master_df index 6867
Processing index 4 out of 240 (1.67%) to be added to master_df index 6868
Processing index 5 out of 240 (2.08%) to be added to master_df index 6869
Processing index 6 out of 240 (2.5%) to be added to master_df index 6870
Processing index 7 out of 240 (2.92%) to be added to master_df index 6871
Processing index 8 out of 240 (3.33%) to be added to master_df index 6872
Processing index 9 out of 240 (3.75%) to be added to master_df index 6873
Processing index 10 out of 240 (4.17%) to be added to master_df index 6874
Processing index 11 out of 240 (4.58%) to be added to master_df index 6875
Processing index 12 out of 240 (5.0%) to be added to master_df index 6876
Processing index 13 out of 240 (5.42%)

#### Loop III Statistics

In [67]:
# Removes (i) the "URL" column that was added to WP as a placeholder and
# (ii) the (now obsolete) "erroneous inclusion" criteria.
# errors="ignore" keeps this cell re-runnable: once the columns are gone, running it again
# is a no-op instead of raising a KeyError.
master_df = master_df.drop(
    columns=["URL", "erroneous_inclusion_fe", "erroneous_inclusion_mpv", "erroneous_inclusion_wp"],
    errors="ignore",
)

In [68]:
# Final search result statistics: the total number of queries, followed by the per-outcome counters
total_queries = sum(search_results_dict.values())
print("Search Statistics (i.e., comparing every row of MPV to WP and FE_Suicidies")
print(f"Total Queries = {total_queries }")
search_results_dict


Search Statistics (i.e., comparing every row of MPV to WP and FE_Suicidies
Total Queries = 13454


{'no_results_counter': 1862,
 '1_result_counter': 11321,
 '2_or_more_result_counter': 271}

In [69]:
# Number of civilians that were successfully matched, derived from the omission flags of every row

# Boolean view of each omission flag (astype(bool) applies the same truthiness rule as a plain `if`)
omitted_fe = master_df["omission_fe"].astype(bool)
omitted_mpv = master_df["omission_mpv"].astype(bool)
omitted_wp = master_df["omission_wp"].astype(bool)

# Sanity check: a row that is flagged as omitted from all 3 databases should not exist.
# Raised explicitly (rather than via `assert`) so the check also runs under `python -O`
# and the exception itself carries the explanation.
omitted_everywhere = omitted_fe & omitted_mpv & omitted_wp
if omitted_everywhere.any():
    i = omitted_everywhere.idxmax() # Index label of the first offending row
    raise AssertionError(f"ERROR. Row {i} is recorded as having been omitted from all 3 databases. This shouldn't be possible: a fatally shot civillian should not be in this database if he/she doens't appear in any database")

# A row is "unmatched" for a database when that database is the only one that contains it,
# i.e., when both of the other databases are flagged as omissions
fe_unmatched = int((omitted_mpv & omitted_wp).sum())
mpv_unmatched = int((omitted_fe & omitted_wp).sum())
wp_unmatched = int((omitted_fe & omitted_mpv).sum())

# Everything else appears in at least two databases
total_rows = master_df.shape[0]
total_unmatched = fe_unmatched + mpv_unmatched + wp_unmatched
matched_between_at_least_two_databases = total_rows - total_unmatched

# Prints results (all percentages are computed the same way: matched / total)
print(f"Total: {matched_between_at_least_two_databases} successfully matched ({total_unmatched} unmatched out of {total_rows}) ({ round( matched_between_at_least_two_databases / total_rows * 100, 2) }% match)")
print(f"Fatal Encounters: {fe_original_size - fe_unmatched} successfully matched ({fe_unmatched} unmatched out of {fe_original_size} total) ({ round( (fe_original_size - fe_unmatched) / fe_original_size * 100, 2) }% match)")
print(f"Mapping Police Violence: {mpv_original_size - mpv_unmatched} successfully matched ({mpv_unmatched} unmatched out of {mpv_original_size} total) ({ round( (mpv_original_size - mpv_unmatched) / mpv_original_size * 100, 2) }% match)")
print(f"Washington Post: {wp_original_size - wp_unmatched} successfully matched ({wp_unmatched} unmatched out of {wp_original_size} total) ({ round( (wp_original_size - wp_unmatched) / wp_original_size * 100, 2) }% match)")

Total: 6263 successfully matched (842 unmatched out of 7105) (88.15% match)
Fatal Encounters: 6159 successfully matched (431 unmatched out of 6590 total) (93.46% match)
Mapping Police Violence: 5893 successfully matched (170 unmatched out of 6063 total) (97.2% match)
Washington Post: 5705 successfully matched (241 unmatched out of 5946 total) (95.95% match)


In [70]:
# Verify WP additions: Loop III appends its rows at the end of master_df, so they should appear in the tail
print(master_df.shape)
master_df.tail(10)

(7105, 45)


Unnamed: 0,date,victim_name,victim_age,victim_gender,victim_race,city_of_death,state_of_death,agency_responsible_1,agency_responsible_2,agency_responsible_3,agency_responsible_4,agency_responsible_5,agency_responsible_6,name_abr,agency_responsible_abr_1,agency_responsible_abr_2,agency_responsible_abr_3,agency_responsible_abr_4,agency_responsible_abr_5,agency_responsible_abr_6,city_of_death_abr,omission_fe,omission_mpv,omission_wp,date_fe,date_mpv,date_wp,city_fe,city_mpv,city_wp,gender_fe,gender_mpv,gender_wp,age_fe,age_mpv,age_wp,race_fe,race_mpv,race_wp,agency_responsible_fe,agency_responsible_mpv,agency_responsible_wp,date_discrepancy_days,URL_FE,URL_MPV
7095,2015-09-14 00:00:00,David Todd Powell,28,Male,White,Barstow,CA,Barstow Police Department,,,,,,david_powell,barstow_police,none,none,none,none,,barstow,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,,
7096,2015-08-29 00:00:00,James Marcus Brown,25,Male,Black,North Las Vegas,NV,Las Vegas Metropolitan Police Department,,,,,,james_brown,las_vegas_police,none,none,none,none,,north las vegas,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,,
7097,2015-08-12 00:00:00,Richard Dean Shull Jr.,24,Male,White,Odessa,TX,Odessa Police Department,Texas Rangers,,,,,richard_jr,odessa_police,texas_rangers,none,none,none,,odessa,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,,
7098,2015-05-20 00:00:00,Anthony Gomez,29,Male,Black,Lancaster,PA,Lancaster City Bureau of Police,,,,,,anthony_gomez,lancaster_city_police,none,none,none,none,,lancaster,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,,
7099,2015-04-18 00:00:00,Grover Sapp,45,Male,White,Panama City,FL,Panama City Police Department,,,,,,grover_sapp,panama_city_police,none,none,none,none,,panama city,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,,
7100,2015-04-15 00:00:00,Frank Shephard,41,Male,Black,Houston,TX,Houston Police Department,,,,,,frank_shephard,houston_police,none,none,none,none,,houston,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,,
7101,2015-04-09 00:00:00,Don Smith,29,Male,Black,Monon,IN,Tippecanoe County Sheriff's Department,Wasketa Police Department,Indiana State Police,,,,don_smith,tippecanoe_sheriff,wasketa_police,indiana_state_police,none,none,,monon,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,,
7102,2015-02-16 00:00:00,Lawrence Caldwell,56,Male,White,Marana,AZ,Marana Police Department,,,,,,lawrence_caldwell,marana_police,none,none,none,none,,marana,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,,
7103,2015-02-09 00:00:00,Desmond Luster,45,Male,Black,Dallas,TX,Dallas Police Department,,,,,,desmond_luster,dallas_police,none,none,none,none,,dallas,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,,
7104,2015-01-07 00:00:00,Ron Sneed,31,Male,Black,Freeport,TX,Freeport Police Department,,,,,,ron_sneed,freeport_police,none,none,none,none,,freeport,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,,


### 5.5 | Exports Our Manual Corrections Dataframe (if applicable)

If we made new manual corrections (i.e., if USE_MANUAL_MATCHING_DATAFRAME was set to False), this cell saves the manual-correction dataframes so that the corrections can be replicated later.

In [71]:
# Persists the manual matching decisions made during this run so that later runs can replay them.
# When the pre-existing decisions were used instead, nothing changed and nothing needs saving.
if not USE_MANUAL_MATCHING_DATAFRAME:
    print("Saving our manual match dataframe")
    # Writes the in-memory decisions to disk (DataFrame.to_csv); pd.read_csv would read rather than
    # save, and it does not accept an `index` argument.
    # NOTE(review): assumes both names hold the in-memory decision dataframes built earlier -- confirm.
    manual_match_verification_df.to_csv(f"{MATCH_DIR}manual_matches.csv", index = False)
    manual_fuzzy_match_verification_df.to_csv(f"{MATCH_DIR}fuzzy_search_name_matches.csv", index = False)
else:
    print("USE_MANUAL_MATCHING_DATAFRAME was set to True. This means that you used the decisions contained in the pre-existing dataframe to handle manual matches.")
    print("Thus, no changes were made to the dataframe, hence, there is no need to save them.\nNo action is needed. Please proceed with the program.")

USE_MANUAL_MATHING_DATAFRAME was set to false. This means that you used the decisions contained in the pre-existing dataframe to handle manual matches.
Thus, no changes were made to the dataframe, hence, there is no need to save them.
No action is needed. Please proceed with the program.


# 6 | Export (i) DM-FS and (ii) a version of DM-FS that Needs Manual Corrections

"Needing manual corrections" is defined as any value in which the databases disagree.

### 6.1 | Aux Functions

In [72]:
def print_manual_corrections(df, col):
    """Prints how many values in one column are flagged for manual correction.

    A value is flagged when it equals one of the two database-disagreement
    markers. The column name is printed first, then one line per marker that
    occurs (with its count), then the total.

    Args:
        df: Dataframe to inspect.
        col: Name of the column to inspect.

    Returns:
        A tuple of (i) the number of flagged rows and (ii) a list with the
        index labels of those rows.
    """
    disagreement_markers = ["ERROR - 2 Databases Disagree", "ERROR - 3 Databases Disagree"]

    # Rows whose value in this column is one of the disagreement markers
    flagged_rows = df.loc[df[col].isin(disagreement_markers)]
    flagged_total = flagged_rows.shape[0]

    # Per-marker breakdown, followed by the total
    print(col)
    for marker, occurrences in flagged_rows[col].value_counts().items():
        print(f"{marker} : {occurrences}")
    print(f"TOTAL = {flagged_total}")

    return flagged_total, flagged_rows.index.tolist()

### 6.2 | Explore number of manual corrections needed by column

In [73]:
# Prints the discrepancies of each of the first 8 columns of master_df, followed by their grand total
columns_to_review = master_df.columns[0:8]

# One (total, index-list) pair per column; the per-column report is printed as each pair is computed
per_column_results = [print_manual_corrections(master_df, column_name) for column_name in columns_to_review]

error_indices = [flagged_indices for _, flagged_indices in per_column_results] # List of index-lists, one per column
grand_total = sum(column_total for column_total, _ in per_column_results) # Grand total number of discrepancies
print(f"{'*'*20}\nTotal Manual Corrections = {grand_total}")

date
ERROR - 2 Databases Disagree : 50
ERROR - 3 Databases Disagree : 2
TOTAL = 52
victim_name
TOTAL = 0
victim_age
ERROR - 2 Databases Disagree : 74
ERROR - 3 Databases Disagree : 4
TOTAL = 78
victim_gender
ERROR - 2 Databases Disagree : 4
ERROR - 3 Databases Disagree : 1
TOTAL = 5
victim_race
ERROR - 2 Databases Disagree : 100
ERROR - 3 Databases Disagree : 20
TOTAL = 120
city_of_death
ERROR - 2 Databases Disagree : 111
ERROR - 3 Databases Disagree : 54
TOTAL = 165
state_of_death
TOTAL = 0
agency_responsible_1
ERROR - 3 Databases Disagree : 141
ERROR - 2 Databases Disagree : 81
TOTAL = 222
********************
Total Manual Corrections = 642


### 6.3 | Create dataframe of just manual corrections

In [74]:
# Collapses the per-column index-lists into one sorted list of unique row indices.
# Empty sublists contribute nothing, and the set removes rows that are flagged in several columns.
unique_error_indices = {row_index for column_indices in error_indices for row_index in column_indices}
error_indices = sorted(unique_error_indices)

print(f"Rows that need to undergo manual review = {len(error_indices)}")

Rows that need to undergo manual review = 552


In [75]:
# Subsets master_df to just the rows flagged for manual review; .loc keeps their original index labels
manual_correction_df = master_df.loc[error_indices]
manual_correction_df.head(5)

Unnamed: 0,date,victim_name,victim_age,victim_gender,victim_race,city_of_death,state_of_death,agency_responsible_1,agency_responsible_2,agency_responsible_3,agency_responsible_4,agency_responsible_5,agency_responsible_6,name_abr,agency_responsible_abr_1,agency_responsible_abr_2,agency_responsible_abr_3,agency_responsible_abr_4,agency_responsible_abr_5,agency_responsible_abr_6,city_of_death_abr,omission_fe,omission_mpv,omission_wp,date_fe,date_mpv,date_wp,city_fe,city_mpv,city_wp,gender_fe,gender_mpv,gender_wp,age_fe,age_mpv,age_wp,race_fe,race_mpv,race_wp,agency_responsible_fe,agency_responsible_mpv,agency_responsible_wp,date_discrepancy_days,URL_FE,URL_MPV
3,2020-12-30 00:00:00,David Jacob Rigsby,39,Male,ERROR - 2 Databases Disagree,Waverly,TN,Waverly Police Department,Humphreys County Sheriff's Office,,,,,david_rigsby,waverly_police,humphreys_sheriff,,,,,waverly,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,Unknown,False,White,False,False,False,0,https://www.newschannel5.com/news/tbi-investig...,
10,2020-12-29 00:00:00,Samuel Lazaro,18,Male,ERROR - 2 Databases Disagree,ERROR - 2 Databases Disagree,NY,City of New York Police Department,,,,,,samuel_lazaro,city_of_new_york_police,,,,,,brooklyn,False,True,False,False,False,False,Brooklyn,False,New York,False,False,False,False,False,False,Unknown,False,Hispanic,False,False,False,0,https://www.radio.com/1010wins/news/local/nypd...,
19,2020-12-28 00:00:00,Not Disclosed,-1,ERROR - 2 Databases Disagree,Unknown,Holyoke,CO,Holyoke Police Department,,,,,,not_disclosed,holyoke_police,,,,,,holyoke,False,False,True,False,False,False,False,False,False,Unknown,Male,False,False,False,False,False,False,False,False,False,False,0,https://denver.cbslocal.com/2020/12/30/holyoke...,https://denver.cbslocal.com/2020/12/30/holyoke...
44,2020-12-18 00:00:00,Not Disclosed,ERROR - 2 Databases Disagree,Male,Unknown,Lucerne Valley,CA,San Bernardino County Sheriff's Office,,,,,,not_disclosed,san_bernardino_sheriff,,,,,,lucerne valley,False,True,False,False,False,False,False,False,False,False,False,False,-1,False,65,False,False,False,False,False,False,0,https://www.sbsun.com/2020/12/19/deputies-shoo...,
45,2020-12-17 00:00:00,Andrew Mansilla,25,Male,ERROR - 2 Databases Disagree,Daytona Beach,FL,Daytona Beach Police Department,,,,,,andrew_mansilla,daytona_beach_police,,,,,,daytona beach,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,White,False,Hispanic,False,False,False,0,https://www.clickorlando.com/news/local/2020/1...,


### 6.4 | Export both dataframes 

In [76]:
CORRECTIONS_DIR = "res/1.2 - DM-FS Before Manual Corrections/"

# Writes (i) the rows that need manual review and (ii) the full DM-FS, both with their index column
frames_to_export = (
    ("manual_correction_to_make.csv", manual_correction_df),
    ("DMFS_no_manual_corrections.csv", master_df),
)
for export_file_name, export_frame in frames_to_export:
    export_frame.to_csv(f"{CORRECTIONS_DIR}{export_file_name}", index = True)