In [119]:
import pandas as pd
import numpy as np

cols = ["date_time", "time", "date", "add_of_arrest", "officer_role", "off_first", "off_last", "off_star", "fbi_code", "statute", "civ_sex", "civ_race", "civ_age", "civ_first_name", "civ_last_name"]

out_file_path = "../files/events/officer_id_merged/arrests/filter_1"
arrests_df = pd.read_csv("../files/events/arrest_file.csv", parse_dates=["date_time"])

print("Total arrest records", arrests_df.shape[0])

""" Arrests are duplicated by fbi_code. Filter out the dupes """
group_key = ["date_time", "time", "date", "add_of_arrest", "officer_role", "off_first", "off_last", "civ_sex", "civ_race", "civ_age", "civ_first_name", "civ_last_name"]
grouped_arrests = arrests_df.groupby(by=group_key, dropna=False).agg(
  off_star=("off_star", "first"),
  fbi_codes=("fbi_code", lambda codes: ",".join(codes.unique())),
  statutes=("statute", lambda statutes: ",".join(statutes.unique())),
  charges= ("fbi_code", lambda codes: codes.unique().shape[0])
)

grouped_arrests = grouped_arrests.reset_index().sort_values(by="date_time")
grouped_arrests.to_csv(out_file_path + "/arrests_deduped.csv")

  arrests_df = pd.read_csv("../files/events/arrest_file.csv", parse_dates=["date_time"])


Total arrest records 4853961


In [120]:
print("Arrest records after being grouped by arresting officer", grouped_arrests.shape[0])
print("Arrest records without an officer star", grouped_arrests[grouped_arrests["off_star"].isna()].shape[0])
grouped_arrests.index.name = "arrest_id"

grouped_arrests = grouped_arrests.reset_index()


Arrest records after being grouped by arresting officer 2682060
Arrest records without an officer star 1324


In [121]:
grouped_arrests[grouped_arrests["off_star"].isna()].to_csv(out_file_path + "/no_off_star.csv", index=False)
grouped_arrests = grouped_arrests.dropna(subset=["off_star"])
redacted_arrests = grouped_arrests[(grouped_arrests["off_first"] == "Redacted") | (grouped_arrests["off_last"] == "Redacted") | (grouped_arrests["off_star"] == "Redacted")]
redacted_arrests.to_csv(out_file_path + "/redacted_arrests.csv", index=False)
redacted_arrest_ids = redacted_arrests["arrest_id"].to_numpy()
print("Number of redacted arrest records", redacted_arrest_ids.size)
unredacted_row_mask = ~grouped_arrests.isin({"arrest_id": redacted_arrest_ids}).any(1)
unredacted_arrests = grouped_arrests[unredacted_row_mask]
unredacted_arrests.to_csv(out_file_path + "/unredacted_arrests.csv", index=False)
print("Number of un-redacted arrest records", unredacted_arrests.shape[0])
unredacted_arrests.loc[:, "off_star"] = pd.to_numeric(unredacted_arrests["off_star"])


Number of redacted arrest records 2735
Number of un-redacted arrest records 2678001


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unredacted_arrests.loc[:, "off_star"] = pd.to_numeric(unredacted_arrests["off_star"])


In [122]:
""" @TODO just write the correct dtype to the roster file """
officer_roster = pd.read_csv("../files/profiles/officers_index_step_4.csv", dtype={"off_star_0": np.float64})

left_on = ["off_first", "off_last", "off_star"]
right_on = ["off_first_name", "off_last_name"]
right_star_field_stub = "off_star_"

star_merges = []
out_cols = list(unredacted_arrests.columns.tolist()) + ["merged_by_star", "off_uniq_id"]

for i in range(6):
  off_star_field = right_star_field_stub + str(i)
  star_merge = pd.merge(unredacted_arrests, officer_roster, left_on=left_on, right_on=right_on + [off_star_field], how="left")
  star_merge.dropna(subset="off_uniq_id", inplace=True)
  star_merge.to_csv(out_file_path + "/star_{}_merge.csv".format(i))
  star_merge.loc[:, "merged_by_star"] = i
  star_merges.append(star_merge.loc[:, out_cols])

full_star_merge = pd.concat(star_merges, ignore_index=True)
full_star_merge.sort_values(by="arrest_id", inplace=True, ignore_index=True)
print("Number of records merged by officer stars", full_star_merge.shape[0])

full_star_merge.to_csv(out_file_path + "/all_star_merges.csv")

Number of records merged by officer stars 2592163


In [None]:
grouped = full_star_merge.groupby("arrest_id").size().sort_values(ascending=False).to_frame("size")
ids_with_multiple_matches = grouped[grouped["size"] > 1].index.to_numpy()
row_mask = full_star_merge.isin({"arrest_id": ids_with_multiple_matches}).any(1)
multiple_matches = full_star_merge[row_mask]


In [127]:
multiple_matches.to_csv(out_file_path + "/matched_to_multiple_officers.csv", index=False)

single_matches = full_star_merge[~row_mask]
single_matches.to_csv(out_file_path + "/matched_to_single_officer.csv", index=False)

In [128]:
# right_on = ["off_first_name", "off_last_name", "off_star_1"]
# star_1_merge = pd.merge(unmatched_star_0, officer_roster, left_on=left_on, right_on=right_on, how="left")
# unmatched_star_1 = star_1_merge[star_1_merge["off_uniq_id"].isna()].loc[:, unredacted_arrests.columns]

In [129]:
# right_on = ["off_first_name", "off_last_name", "off_star_2"]
# star_2_merge = pd.merge(unmatched_star_1, officer_roster, left_on=left_on, right_on=right_on, how="left")
# star_2_merge[star_2_merge["off_uniq_id"].isna()]