In [13]:
import pandas as pd
from data_tweak.converters import convert_race, convert_sex
# --- Paths -------------------------------------------------------------
# Root of the data directory, and the folder that receives every CSV this
# notebook writes.
base_file_path = "../files"
out_file_path = base_file_path + "/events/officer_id_merged/stops/filter_v2"

# --- Load --------------------------------------------------------------
profiles_csv = base_file_path + "/profiles/officers_index_plus_kiefer.csv"
stops_csv = base_file_path + "/events/stops_by_fo_birth_year.csv"

officer_prof_df = pd.read_csv(profiles_csv)
# Reading with stop_id as the index and resetting it straight away leaves
# stop_id as the leading ordinary column of the frame.
# NOTE(review): the saved cell output echoes this read as the source of a
# pandas warning (presumably a mixed-dtype warning) -- consider passing an
# explicit dtype= mapping once the affected columns are confirmed.
stops_df = pd.read_csv(stops_csv, index_col="stop_id").reset_index()

# --- Row counts before any filtering -------------------------------------
print("number of profiles:", len(officer_prof_df))
# Profiles without a birth year are discarded, and the remainder is ordered
# by birth year.
profiles_with_birth_year = officer_prof_df.dropna(subset=["off_year_of_birth"])
officer_prof_df = profiles_with_birth_year.sort_values("off_year_of_birth")
print("number of profiles with a birth year:", len(officer_prof_df))
print("number of stops:", len(stops_df))

# Columns identifying the first officer on a stop, and the officer-profile
# columns they are matched against (same order in both lists).
join_left_cols = ["first_off_first_name", "first_off_last_name", "first_off_sex", "first_off_race"]
join_right_cols = ["off_first_name", "off_last_name", "off_sex", "off_race"]
age_cols = ["first_off_age", "first_off_latest_birth_year"]

# A stop is kept only when every identity column AND both age columns are
# present. The same column list drives the rejected-rows file and the drop
# below, so the file accounts for every stop that is removed (previously rows
# dropped only for a missing age / birth year were not written anywhere).
first_off_required_cols = join_left_cols + age_cols

first_off_incomplete = stops_df[first_off_required_cols].isna().any(axis=1)
stops_df[first_off_incomplete].to_csv(out_file_path + "/first_off_key_missing.csv", index=False)

stops_df.dropna(axis="index", how="any", subset=first_off_required_cols, inplace=True)
print("number of stops with a birth year:", stops_df.shape[0])
# Profiles need every identity column as well; birth year was already
# required when the profiles were loaded.
officer_prof_df.dropna(axis="index", how="any", subset=join_right_cols, inplace=True)

# No further stop rows are removed between the previous stop count and this
# one, so the two stop figures are expected to be identical.
print("number of stops after dropping empty keys", stops_df.shape[0])
print("number of officers after dropping empty keys", officer_prof_df.shape[0])

# Run the project's sex / race converters over both officer slots of the stop
# records. The return values are discarded, so the helpers presumably modify
# the frames in place -- TODO confirm against data_tweak.converters.
stop_sex_cols = ["first_off_sex", "second_off_sex"]
stop_race_cols = ["first_off_race", "second_off_race"]
convert_sex(stops_df, stop_sex_cols)
convert_race(stops_df, stop_race_cols)

# Same conversion for the officer profiles.
# TODO: just do this in the profile generation instead.
profile_sex_cols = ["off_sex"]
profile_race_cols = ["off_race"]
convert_sex(officer_prof_df, profile_sex_cols)
convert_race(officer_prof_df, profile_race_cols)


  stops_df = pd.read_csv(base_file_path + "/events/stops_by_fo_birth_year.csv", index_col="stop_id")


number of profiles: 36323
number of profiles with a birth year: 36321
number of stops: 2865566
number of stops with a birth year: 2258098
number of stops after dropping empty keys 2258098
number of officers after dropping empty keys 35813


In [14]:

# Columns written to disk: every original stop column plus the matched
# first-officer profile id.
out_cols = stops_df.columns.values.tolist() + ["off_uniq_id"]

# merge_asof requires the left frame to be sorted on its `left_on` key and
# raises otherwise. Nothing earlier in the notebook sorts stops_df, so sort
# explicitly instead of relying on the input CSV's row order. A stable sort
# leaves an already-sorted frame in exactly the same order.
stops_sorted_df = stops_df.sort_values("first_off_latest_birth_year", kind="stable")

# Match each stop to a profile with identical name / sex / race. With the
# default backward direction and tolerance=1, the profile's birth year must be
# equal to, or at most one year before, the stop's latest-birth-year value.
# NOTE: join_left_cols must still hold the first-officer columns here; a later
# cell rebinds it to the second-officer columns, so run the cells in order.
stops_merged_df = pd.merge_asof(
    stops_sorted_df,
    officer_prof_df,
    left_on=["first_off_latest_birth_year"],
    right_on=["off_year_of_birth"],
    left_by=join_left_cols,
    right_by=join_right_cols,
    tolerance=1,
)
stops_merged_df.loc[:, out_cols].to_csv(out_file_path + "/half_merged.csv", index=False)
print("initial merge rows", stops_merged_df.shape[0])
print("rows with an uid for the first officer", stops_merged_df[stops_merged_df["off_uniq_id"].notna()].shape[0])


['stop_id', 'date_time', 'stop_address', 'district', 'beat', 'contact_type', 'first_off_last_name', 'first_off_first_name', 'first_off_sex', 'first_off_race', 'first_off_age', 'second_off_last_name', 'second_off_first_name', 'second_off_sex', 'second_off_race', 'second_off_age', 'civ_sex', 'civ_race', 'civ_age', 'date', 'time', 'contact_card', 'first_officer_id', 'second_officer_id', 'search', 'pat_down', 'first_off_latest_birth_year', 'second_off_latest_birth_year', 'off_uniq_id']
initial merge rows 2258098
rows with an uid for the first officer 2171873


In [15]:
# Rebind the join keys to the second officer on each stop; the cells below
# read these names.
join_left_cols = ["second_off_first_name", "second_off_last_name", "second_off_sex", "second_off_race"]
age_cols = ["second_off_age", "second_off_latest_birth_year"]

# Original note: "@TODO output rejected second off rows" -- the write below
# saves every stop that lacks any second-officer identity or age column.
second_off_needed_cols = join_left_cols + age_cols
second_off_incomplete = stops_merged_df[second_off_needed_cols].isna().any(axis=1)
stops_merged_df[second_off_incomplete].to_csv(out_file_path + "/second_off_key_missing.csv", index=False)

In [16]:

# Keep only stops whose second officer has every identity column AND both age
# columns. Dropping on the identity columns alone would let a row with a
# missing second_off_latest_birth_year through, and merge_asof raises on null
# merge keys. This is also the exact complement of the rows written to
# second_off_key_missing.csv.
second_off_required_cols = join_left_cols + age_cols
second_off_stops = (
    stops_merged_df
    .sort_values("second_off_latest_birth_year")  # merge_asof needs the left key sorted
    .dropna(subset=second_off_required_cols, how="any")
)
print("processing stops with a second officer", second_off_stops.shape[0])

# Same matching rule as for the first officer: identical name / sex / race,
# profile birth year equal to or at most one year before the stop's value.
# Profile columns that collide with ones already present from the first merge
# receive the "_second_off" suffix.
second_off_stops_merged = pd.merge_asof(
    second_off_stops,
    officer_prof_df,
    left_on=["second_off_latest_birth_year"],
    right_on=["off_year_of_birth"],
    left_by=join_left_cols,
    right_by=join_right_cols,
    suffixes=("", "_second_off"),
    tolerance=1,
)

# Output columns: the original stop columns plus both matched profile ids.
out_cols = stops_df.columns.values.tolist() + ["off_uniq_id", "off_uniq_id_second_off"]
second_off_stops_merged.loc[:, out_cols].to_csv(out_file_path + "/second_off_stops_merged.csv", index=False)
print("number of stops matched with a second officer", second_off_stops_merged[second_off_stops_merged["off_uniq_id_second_off"].notna()].shape[0])


processing stops with a second officer 1978911
number of stops matched with a second officer 1899089


In [18]:
""" Consider, this chooses the closest match looking backwards in terms of birth date. Is this what we want? """

# Stack the second-officer results on top of the first-officer results and
# keep the first row seen per stop_id, so a stop that went through the
# second-officer merge keeps that (richer) row.
stacked = pd.concat([second_off_stops_merged, stops_merged_df])
full_merge = stacked.drop_duplicates(subset=["stop_id"])
full_merge.loc[:, out_cols].to_csv(out_file_path + "/full_merge.csv", index=False)
print("Total number of rows in the full merge", full_merge.shape[0])

# Stops whose first officer found no profile.
first_off_unmatched = full_merge["off_uniq_id"].isna()
print("Total number of rows missing a UID for the first officer", full_merge[first_off_unmatched].shape[0])
full_merge[first_off_unmatched].to_csv(out_file_path + "/missing_fo_id.csv", index=False)

# Stops that name a second officer but have no profile id for them.
has_second_off = full_merge["second_off_first_name"].notna()
second_off_unmatched = has_second_off & full_merge["off_uniq_id_second_off"].isna()
print("Total number of rows missing a UID for the second officer", full_merge[second_off_unmatched].shape[0])
full_merge[second_off_unmatched].to_csv(out_file_path + "/missing_so_id.csv", index=False)



Total number of rows in the full merge 2258098
Total number of rows missing a UID for the first officer 86225
Total number of rows missing a UID for the second officer 79822


In [19]:
# Stops where the first officer was matched and a second officer is named,
# but the second officer has no profile id.
first_off_matched = full_merge["off_uniq_id"].notna()
second_off_named = full_merge["second_off_first_name"].notna()
second_off_without_uid = full_merge["off_uniq_id_second_off"].isna()
only_second_missing = full_merge[first_off_matched & second_off_named & second_off_without_uid]
print("Total number of rows missing a UID for just the second officer", only_second_missing.shape[0])

Total number of rows missing a UID for just the second officer 62935
