In [2]:
import pandas as pd
import numpy as np

cols = ["date_time", "time", "date", "add_of_arrest", "officer_role", "off_first", "off_last", "off_star", "fbi_code", "statute", "civ_sex", "civ_race", "civ_age", "civ_first_name", "civ_last_name"]

out_file_path = "../files/events/officer_id_merged/arrests/filter_1"
arrests_df = pd.read_csv("../files/events/arrest_file.csv", parse_dates=["date_time"])

print("Total arrest records", arrests_df.shape[0])

""" Arrests are duplicated by fbi_code. Filter out the dupes """
group_key = ["date_time", "time", "date", "add_of_arrest", "officer_role", "off_first", "off_last", "civ_sex", "civ_race", "civ_age", "civ_first_name", "civ_last_name"]
grouped_arrests = arrests_df.groupby(by=group_key, dropna=False).agg(
  off_star=("off_star", "first"),
  fbi_codes=("fbi_code", lambda codes: ",".join(codes.unique())),
  statutes=("statute", lambda statutes: ",".join(statutes.unique())),
  charges= ("fbi_code", lambda codes: codes.unique().shape[0])
)

grouped_arrests = grouped_arrests.reset_index().sort_values(by="date_time")
grouped_arrests.to_csv(out_file_path + "/arrests_deduped.csv")

  arrests_df = pd.read_csv("../files/events/arrest_file.csv", parse_dates=["date_time"])


Total arrest records 4853961


In [3]:
print("Arrest records after being grouped by arresting officer", grouped_arrests.shape[0])
print("Arrest records without an officer star", grouped_arrests[grouped_arrests["off_star"].isna()].shape[0])
grouped_arrests.index.name = "arrest_id"

grouped_arrests = grouped_arrests.reset_index()


Arrest records after being grouped by arresting officer 2682060
Arrest records without an officer star 1324


In [4]:
grouped_arrests[grouped_arrests["off_star"].isna()].to_csv(out_file_path + "/no_off_star.csv", index=False)
grouped_arrests = grouped_arrests.dropna(subset=["off_star"])
redacted_arrests = grouped_arrests[(grouped_arrests["off_first"] == "Redacted") | (grouped_arrests["off_last"] == "Redacted") | (grouped_arrests["off_star"] == "Redacted")]
redacted_arrests.to_csv(out_file_path + "/redacted_arrests.csv", index=False)
redacted_arrest_ids = redacted_arrests["arrest_id"].to_numpy()
print("Number of redacted arrest records", redacted_arrest_ids.size)
unredacted_row_mask = ~grouped_arrests.isin({"arrest_id": redacted_arrest_ids}).any(1)
unredacted_arrests = grouped_arrests[unredacted_row_mask]
unredacted_arrests.to_csv(out_file_path + "/unredacted_arrests.csv", index=False)
print("Number of un-redacted arrest records", unredacted_arrests.shape[0])
unredacted_arrests.loc[:, "off_star"] = pd.to_numeric(unredacted_arrests["off_star"])


Number of redacted arrest records 2735
Number of un-redacted arrest records 2678001


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unredacted_arrests.loc[:, "off_star"] = pd.to_numeric(unredacted_arrests["off_star"])


In [5]:
""" @TODO just write the correct dtype to the roster file """
officer_roster = pd.read_csv("../files/profiles/officer_roster.csv", dtype={"off_star_0": np.float64})

left_on = ["off_first", "off_last", "off_star"]
right_on = ["off_first_name", "off_last_name"]
right_star_field_stub = "off_star_"

star_merges = []
out_cols = list(unredacted_arrests.columns.tolist()) + ["merged_by_star", "off_uniq_id"]

for i in range(6):
  off_star_field = right_star_field_stub + str(i)
  star_merge = pd.merge(unredacted_arrests, officer_roster, left_on=left_on, right_on=right_on + [off_star_field], how="left")
  # drop unmatched records. But we need to merge them back in eventually
  star_merge.dropna(subset="off_uniq_id", inplace=True)
  star_merge.to_csv(out_file_path + "/star_{}_merge.csv".format(i))
  star_merge.loc[:, "merged_by_star"] = i
  star_merges.append(star_merge.loc[:, out_cols])

full_star_merge = pd.concat(star_merges, ignore_index=True)
full_star_merge.sort_values(by="arrest_id", inplace=True, ignore_index=True)
print("Number of records merged by officer stars", full_star_merge.shape[0])

full_star_merge.to_csv(out_file_path + "/matched_by_off_star.csv")

Number of records merged by officer stars 2592163


In [6]:
grouped = full_star_merge.groupby("arrest_id").size().sort_values(ascending=False).to_frame("size")
ids_with_multiple_matches = grouped[grouped["size"] > 1].index.to_numpy()
row_mask = full_star_merge.isin({"arrest_id": ids_with_multiple_matches}).any(1)
multiple_matches = full_star_merge[row_mask]


In [7]:
multiple_matches.to_csv(out_file_path + "/matched_to_multiple_officers.csv", index=False)

single_matches = full_star_merge[~row_mask]
single_matches.to_csv(out_file_path + "/matched_to_single_officer.csv", index=False)

In [8]:
from metrics.analyze_df import matches, unmatched
import numpy as np

duplicates_remaining = multiple_matches.sort_values(by="date_time")
duplicates_remaining["off_uniq_id"] = duplicates_remaining.loc[:, "off_uniq_id"].astype(np.int64)
duplicates_remaining["date_time"] = pd.to_datetime(duplicates_remaining["date_time"])
original_cols = duplicates_remaining.columns
confirmed_match_dfs = []
i = 0

with pd.read_csv("../files/events/officer_id_merged/assignments/assignments_full_merge.csv", chunksize=10000) as reader:
  for chunk in reader:
    """
      In the arrests file, we have officer last, first, and star. 
      With this limited matching criteria, we have a number of duplicate matches that
      we can attempt to weed out using the assignment file. The idea is that given
      an arrest matched with both off_uniq_id_a and off_uniq_id_b, we attempt to
      find an assignment record corresponding to that officer by off_uniq_id and off_star.
      If we can't find a record of a given officer in the assignments file, then we
      ignore that match
    """
    print("processing chunk", i)
    i += 1
    chunk.dropna(subset=["shift_end_corrected"], inplace=True)
    chunk["shift_end_corrected"] = pd.to_datetime(chunk["shift_end_corrected"], format="%Y-%m-%d %H:%M:%S", utc=True)
    chunk.sort_values(by="shift_end_corrected", inplace=True)
    merged_chunk = pd.merge(duplicates_remaining, chunk, on=["off_uniq_id", "off_star"], suffixes=("", "_assignments"), how="left").drop_duplicates(subset=["arrest_id", "off_uniq_id"])
    confirmed_match_dfs.append(matches(merged_chunk, signifier="off_appointed"))
    duplicates_remaining = unmatched(merged_chunk, signifier="off_appointed").loc[:, original_cols]

confirmed_matches = pd.concat(confirmed_match_dfs)

processing chunk 0
Based on the join key []:
	Number of matches: 7657
Based on the join key []:
	Number of unmatched records: 18858
processing chunk 1
Based on the join key []:
	Number of matches: 939
Based on the join key []:
	Number of unmatched records: 17919
processing chunk 2
Based on the join key []:
	Number of matches: 3
Based on the join key []:
	Number of unmatched records: 17916
processing chunk 3
Based on the join key []:
	Number of matches: 195
Based on the join key []:
	Number of unmatched records: 17721
processing chunk 4
Based on the join key []:
	Number of matches: 0
Based on the join key []:
	Number of unmatched records: 17721
processing chunk 5
Based on the join key []:
	Number of matches: 0
Based on the join key []:
	Number of unmatched records: 17721
processing chunk 6
Based on the join key []:
	Number of matches: 0
Based on the join key []:
	Number of unmatched records: 17721
processing chunk 7
Based on the join key []:
	Number of matches: 0
Based on the join key [

In [12]:
matched_arrest_ids = full_star_merge["arrest_id"].to_numpy()
rmask = ~grouped_arrests.isin({"arrest_id": matched_arrest_ids}).any(1)
missing_arrests = grouped_arrests[rmask]
missing_arrests

Unnamed: 0,arrest_id,date_time,time,date,add_of_arrest,officer_role,off_first,off_last,civ_sex,civ_race,civ_age,civ_first_name,civ_last_name,off_star,fbi_codes,statutes,charges
21,23,2014-01-01 00:05:00+00:00,0,2014-01-01,31XX W WALNUT ST,Second Arresting Officer,CLIFFORD,HALL,MALE,BLACK,61.0,ARTHUR,ROBERTS,12115,15,720 ILCS 5.0/24-1.5-A,1
27,22,2014-01-01 00:05:00+00:00,0,2014-01-01,31XX W WALNUT ST,First Arresting Officer,MARIO,CRUZ,MALE,BLACK,61.0,ARTHUR,ROBERTS,16659,15,720 ILCS 5.0/24-1.5-A,1
56,86,2014-01-01 00:10:00+00:00,0,2014-01-01,52XX S INDIANA AVE,Second Arresting Officer,FRANK,RAMAGLIA,MALE,BLACK,26.0,RODNEY,HILL,1775,15,720 ILCS 5.0/24-1.1-A,1
79,111,2014-01-01 00:10:00+00:00,0,2014-01-01,64XX S WHIPPLE ST,Second Arresting Officer,ROBERT,GALLAS,FEMALE,WHITE HISPANIC,27.0,JESSICA,JUAREZ,17815,26,720 ILCS 5.0/31-1-A,1
80,112,2014-01-01 00:10:00+00:00,0,2014-01-01,64XX S WHIPPLE ST,Second Arresting Officer,ROBERT,GALLAS,MALE,WHITE HISPANIC,21.0,JAMIE,OROZCO,17815,15,"720 ILCS 5.0/24-1.5-A,720 ILCS 5.0/24-1.6-A-2,...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2681832,2681832,2021-08-18 16:40:00+00:00,16,2021-08-18,62XX W NORTH AVE,First Arresting Officer,PIERRE,PARGO,MALE,BLACK,27.0,TYRON,BROOKS,15096.0,"15,WRT","720 ILCS 5.0/24-1.1-A,725 ILCS 5.0/110-3",2
2681871,2681873,2021-08-18 23:00:00+00:00,23,2021-08-18,49XX W MADISON ST,First Arresting Officer,NICHOLAS,CARTER,FEMALE,BLACK,29.0,BESSIE,CONNERS,15536.0,26,720 ILCS 5.0/21-3-A-1,1
2681949,2681949,2021-08-19 10:40:00+00:00,10,2021-08-19,51XX N MILWAUKEE AVE,Second Arresting Officer,ANGELA,OLIFER,FEMALE,WHITE,48.0,RENATA,FLIG,13640,26,510 ILCS 70.0/3.01,1
2681960,2681960,2021-08-19 11:47:00+00:00,11,2021-08-19,28XX W IRVING PARK RD,Assisting Arresting Officer,NIKI,TEWS,MALE,WHITE,61.0,THOMAS,HIGHAM,1543.0,"26,08A","720 ILCS 5.0/21-3-A-2,720 ILCS 5.0/12-1-A",2


In [17]:
confirmed_arrest_ids = confirmed_matches.loc[:, "arrest_id"].to_numpy()
rmask = ~duplicates_remaining.isin({"arrest_id": confirmed_arrest_ids}).any(1)
duplicates_remaining[rmask]



Unnamed: 0,arrest_id,date_time,time,date,add_of_arrest,officer_role,off_first,off_last,civ_sex,civ_race,civ_age,civ_first_name,civ_last_name,off_star,fbi_codes,statutes,charges,merged_by_star,off_uniq_id


In [20]:

all_arrests = pd.concat([single_matches, confirmed_matches, missing_arrests], ignore_index=True).loc[:, out_cols].sort_values(by="arrest_id", ignore_index=True)
all_arrests.drop_duplicates(subset=["arrest_id", "off_uniq_id"]).groupby("arrest_id").size().sort_values(ascending=False)

arrest_id
2578350    2
2078005    2
2250078    2
1482133    2
1867388    2
          ..
894329     1
894330     1
894331     1
894332     1
2682059    1
Length: 2680736, dtype: int64

In [11]:
# right_on = ["off_first_name", "off_last_name", "off_star_2"]
# star_2_merge = pd.merge(unmatched_star_1, officer_roster, left_on=left_on, right_on=right_on, how="left")
# star_2_merge[star_2_merge["off_uniq_id"].isna()]