In [2]:
import pandas as pd
from data_tweak.converters import convert_race, convert_sex
# Root of the data tree, and the directory every report of this notebook is
# written to.
base_file_path = "../files"
out_file_path = base_file_path + "/events/officer_id_merged/stops/filter_1/kiefer"

# Officer profiles: the lookup table whose `off_uniq_id` gets attached to stops.
officer_prof_df = pd.read_csv(base_file_path + "/profiles/officers_index_plus_kiefer.csv")

# Stops: keep the original row position as an explicit `stops_index` column so
# that rows duplicated by the merges can be traced back to their source stop.
# NOTE(review): the saved cell output shows a warning attributed to this
# read_csv call (its text is not preserved) - possibly a mixed-dtype warning;
# confirm and consider passing explicit dtypes.
stops_df = (
    pd.read_csv(base_file_path + "/events/stops_full.csv")
    .rename_axis("stops_index")
    .reset_index()
)

# Reported before any row is dropped, so this is the raw stop count.
print("Number of stops to be processed: ", len(stops_df))

# Key columns for matching the first officer of a stop against a profile.
# The two lists are positionally aligned (first name, last name, sex, race).
join_left_cols = [
    "first_off_first_name",
    "first_off_last_name",
    "first_off_sex",
    "first_off_race",
]
join_right_cols = [
    "off_first_name",
    "off_last_name",
    "off_sex",
    "off_race",
]

# Rows with any missing key value are removed from both sides before merging.
# Only the first-officer keys are required on the stops side; second-officer
# columns may still be empty.
stops_df.dropna(subset=join_left_cols, how="any", axis=0, inplace=True)
officer_prof_df.dropna(subset=join_right_cols, how="any", axis=0, inplace=True)

# Normalise the sex/race encodings used as join keys on the stops side.
# The return values are not used, so the helpers presumably modify the frame
# in place - confirm in data_tweak.converters.
convert_sex(stops_df, ["first_off_sex", "second_off_sex"])
convert_race(stops_df, ["first_off_race", "second_off_race"])

# Same normalisation on the profile side. Original author's note: this should
# just be done in the profile generation step instead of here.
convert_sex(officer_prof_df, ["off_sex"])
convert_race(officer_prof_df, ["off_race"])
# Columns exported to the report files: every stops column plus the unique id
# of the officer profile matched to the first officer.
out_cols = stops_df.columns.tolist() + ["off_uniq_id"]

# Left merge: every remaining stop is kept; stops without a matching profile
# get NaN in the profile columns, and stops matching several profiles are
# duplicated. validate="m:m" is accepted by pandas but performs no check.
stops_merged_df = stops_df.merge(
    officer_prof_df,
    how="left",
    left_on=join_left_cols,
    right_on=join_right_cols,
    suffixes=("_stop", "_first_off"),
    validate="m:m",
)

# Snapshot after the first-officer merge only.
stops_merged_df[out_cols].to_csv(out_file_path + "/half_merged.csv", index=False)
print("initial merge rows", len(stops_merged_df))
# Key columns for the second officer of a stop; `join_right_cols` (the profile
# side) is reused unchanged.
join_left_cols = [
    "second_off_first_name",
    "second_off_last_name",
    "second_off_sex",
    "second_off_race",
]

# Merge the profiles a second time. The left frame already carries the profile
# columns from the first merge, so the columns of this second copy receive the
# "_second_off" suffix (e.g. `off_uniq_id_second_off`) while the first-officer
# columns keep their names.
full_merge = stops_merged_df.merge(
    officer_prof_df,
    how="left",
    left_on=join_left_cols,
    right_on=join_right_cols,
    suffixes=("", "_second_off"),
    validate="m:m",
)

# From here on the reports also include the second officer's profile id.
out_cols = [*out_cols, "off_uniq_id_second_off"]
full_merge[out_cols].to_csv(out_file_path + "/full_merge_attempt.csv", index=False)
print("full_merge rows", len(full_merge))

# Report: rows whose first officer matched no profile.
first_off_unmatched = full_merge["off_uniq_id"].isna()
full_merge.loc[first_off_unmatched, out_cols].to_csv(
    out_file_path + "/missing_1st_off.csv", index=False
)

# A row is treated as having a second officer when that officer's first name
# is present. The count is taken over merged rows, so stops duplicated by the
# merges are counted once per duplicate.
has_second_off = full_merge["second_off_first_name"].notna()
print("number of rows w/ a 2nd off", int(has_second_off.sum()))

# Report: rows that have a second officer who matched no profile.
second_off_unmatched = has_second_off & full_merge["off_uniq_id_second_off"].isna()
full_merge.loc[second_off_unmatched, out_cols].to_csv(
    out_file_path + "/missing_2nd_off.csv", index=False
)
# Count how many merged rows each original stop produced. A count above 1
# means at least one of its officers matched more than one profile.
matches_by_stops_index = full_merge.groupby("stops_index").size().reset_index(name="count")

# `stops_index` values of the stops that were duplicated by the merges.
stops_index_multiple = matches_by_stops_index.loc[
    matches_by_stops_index["count"] > 1, "stops_index"
].to_numpy()

# Test membership on the single relevant column. The previous
# `full_merge.isin({...}).any(1)` built a boolean frame over every column and
# passed `axis` positionally, which pandas 2.0 rejects with a TypeError.
row_mask = full_merge["stops_index"].isin(stops_index_multiple)

# Report: every row belonging to an ambiguously matched stop.
full_merge.loc[row_mask, out_cols].to_csv(out_file_path + "/multiple_matches.csv", index=False)


  stops_df = pd.read_csv(base_file_path + "/events/stops_full.csv")


Number of stops to be processed:  2865566
initial merge rows 2524993
full_merge rows 2827393
number of rows w/ a 2nd off 2514988


In [4]:

""" Take out missing matches, and duplicate matches so we have a decent set of data """
# Stops that produced exactly one merged row, i.e. no ambiguous profile match.
# Depends on `matches_by_stops_index`, `full_merge`, `out_cols` and
# `out_file_path` from the merge cell above.
stops_index_single = matches_by_stops_index.loc[
    matches_by_stops_index["count"] == 1, "stops_index"
].to_numpy()

# Membership is tested on the `stops_index` column directly. The previous
# `full_merge.isin({...}).any(1)` passed `axis` positionally, which pandas 2.0
# rejects with a TypeError, and allocated a boolean frame over every column.
row_mask = full_merge["stops_index"].isin(stops_index_single)

# The first officer must have matched a profile.
first_off_matched = full_merge["off_uniq_id"].notna()

# Either the stop has no second officer, or that officer matched a profile.
# (Equivalent to the former `isna | (notna & matched)` expression.)
second_off_resolved = (
    full_merge["second_off_first_name"].isna()
    | full_merge["off_uniq_id_second_off"].notna()
)

only_matches = full_merge[row_mask & first_off_matched & second_off_resolved]
print("1-to-1 matches found:", only_matches.shape[0])

# Same destination as before, now built from `out_file_path` like every other
# report in this notebook.
# NOTE(review): unlike the other reports this file is written WITH the frame
# index as an unnamed first column (no index=False). Kept as-is so existing
# consumers are not broken - confirm whether that is intended.
only_matches.loc[:, out_cols].to_csv(out_file_path + "/1_to_1_merges.csv")


1-to-1 matches found: 1805218
