In [57]:
import pandas as pd
import numpy as np
from data_tweak.converters import convert_race, convert_sex
from metrics.analyze_df import matches, unmatched, dupes

out_file_path = "../files/events/officer_id_merged/uof/filter_1"
original_uof_df = pd.read_csv("../files/events/uof_full.csv")
original_cols = list(original_uof_df.columns.to_list())
out_cols = original_cols + ["off_uniq_id", "off_original_last_name", "notes"]
profs_df = pd.read_csv("../files/profiles/officer_roster.csv")

convert_race(original_uof_df, ["off_race", "civ_race"])
convert_sex(original_uof_df, ["off_sex", "civ_sex"])

profs_df = profs_df.rename(columns={"off_year_of_birth": "off_birth_year"})

missing_names = original_uof_df[original_uof_df[["off_first_name", "off_last_name"]].isna().all(axis=1)]
missing_names.to_csv(out_file_path + "/missing_names.csv")

print("Number of UoF records", original_uof_df.shape[0])
print("number of missing first/last names", missing_names.shape[0])

""" merge these using the assignments file """
uof_df = original_uof_df.dropna(subset=["off_first_name", "off_last_name"], how="all")

""" combine suffixes if they exist """
uof_df.loc[:, "off_original_last_name"] = uof_df.loc[:, "off_last_name"]
uof_df.loc[:, "off_last_name"] = np.where(uof_df["off_suffix"].notna(), uof_df["off_last_name"] + " " + uof_df["off_suffix"], uof_df["off_last_name"])

join_key_1 = ["off_first_name", "off_last_name", "off_middle_initial", "off_sex", "off_race", "off_birth_year"]

print("merging on [" + ", ".join(join_key_1) + "]")
merged_df = pd.merge(uof_df, profs_df, on=join_key_1, how="left", suffixes=("", "_prof"))
successfully_merged = matches(merged_df, "off_uniq_id", merged_on=join_key_1)
successfully_merged.loc[:,"notes"] = "merged on [" + ", ".join(join_key_1) + "] with off_suffix appended"

unjoined = unmatched(merged_df, "off_uniq_id", merged_on=join_key_1).loc[:, uof_df.columns]
# unjoined.to_csv(out_file_path + "/unjoined.csv")

multi_matches = dupes(successfully_merged, "uof_id", merged_on=join_key_1)

  original_uof_df = pd.read_csv("../files/events/uof_full.csv")


Number of UoF records 91955
number of missing first/last names 10477
merging on [off_first_name, off_last_name, off_middle_initial, off_sex, off_race, off_birth_year]
Based on the join key [off_first_name, off_last_name, off_middle_initial, off_sex, off_race, off_birth_year]:
	Number of matches: 65956
Based on the join key [off_first_name, off_last_name, off_middle_initial, off_sex, off_race, off_birth_year]:
	Number of unmatched records: 15561
Based on the join key [off_first_name, off_last_name, off_middle_initial, off_sex, off_race, off_birth_year]:
	Number of records with more than one match: 39


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uof_df.loc[:, "off_original_last_name"] = uof_df.loc[:, "off_last_name"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uof_df.loc[:, "off_last_name"] = np.where(uof_df["off_suffix"].notna(), uof_df["off_last_name"] + " " + uof_df["off_suffix"], uof_df["off_last_name"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

In [58]:
join_key_2 = ["off_first_name", "off_last_name", "off_sex", "off_race", "off_birth_year"]
second_merge_df = pd.merge(unjoined, profs_df, on=join_key_2, how="left", suffixes=("", "_prof"))

second_merge_successes = matches(second_merge_df, "off_uniq_id", merged_on=join_key_2)
second_merge_successes["notes"] = "merged on [" + ", ".join(join_key_2) + "] with off_suffix appended"
successfully_merged = pd.concat([successfully_merged, second_merge_successes])

dupes_so_far = dupes(successfully_merged, "uof_id")

second_merge_unmatched = unmatched(second_merge_df, "off_uniq_id", merged_on=join_key_2)
second_merge_unmatched.loc[:, join_key_2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  second_merge_successes["notes"] = "merged on [" + ", ".join(join_key_2) + "] with off_suffix appended"


Based on the join key [off_first_name, off_last_name, off_sex, off_race, off_birth_year]:
	Number of matches: 14358
Based on the join key []:
	Number of records with more than one match: 63
Based on the join key [off_first_name, off_last_name, off_sex, off_race, off_birth_year]:
	Number of unmatched records: 1227


Unnamed: 0,off_first_name,off_last_name,off_sex,off_race,off_birth_year
38,RODNEY,JACKSON,MALE,BLACK,1964.0
145,KENNETH,GALVIN,MALE,BLACK,1960.0
204,MARK,HERNANDEZ,MALE,HISPANIC,1974.0
528,HAROLD,WHITE,MALE,,1988.0
584,HAROLD,WHITE,MALE,,1988.0
...,...,...,...,...,...
15575,JAIME,VELEZ JR,MALE,HISPANIC,1965.0
15578,RAPHAEL,MITCHEM,MALE,BLACK,1962.0
15581,CARLOS,RAMOS JR,MALE,HISPANIC,1975.0
15582,RONALD,JACKSON JR,MALE,BLACK,1984.0


In [59]:
second_merge_unmatched.loc[:, "off_last_name"] = second_merge_unmatched.loc[:, "off_original_last_name"]
third_merge = pd.merge(second_merge_unmatched.loc[:, uof_df.columns], profs_df, on=join_key_1, how="left", suffixes=("", "_prof"))
third_merge_matched = matches(third_merge, "off_uniq_id", join_key_1)
third_merge_matched.loc[:,"notes"] = "merged on [" + ", ".join(join_key_1) + "] without off_suffix appended"
third_merge_unmatched = unmatched(third_merge, "off_uniq_id", join_key_1)


Based on the join key [off_first_name, off_last_name, off_middle_initial, off_sex, off_race, off_birth_year]:
	Number of matches: 665
Based on the join key [off_first_name, off_last_name, off_middle_initial, off_sex, off_race, off_birth_year]:
	Number of unmatched records: 562


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  second_merge_unmatched.loc[:, "off_last_name"] = second_merge_unmatched.loc[:, "off_original_last_name"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  third_merge_matched.loc[:,"notes"] = "merged on [" + ", ".join(join_key_1) + "] without off_suffix appended"


In [60]:
fourth_merge = pd.merge(third_merge_unmatched.loc[:, uof_df.columns], profs_df, on=join_key_2, how="left", suffixes=("", "_prof"))
fourth_merge_matched = matches(fourth_merge, "off_uniq_id", join_key_2)
fourth_merge_matched.loc[:,"notes"] = "merged on [" + ", ".join(join_key_2) + "] without off_suffix appended"
fourth_merge_unmatched = unmatched(fourth_merge, "off_uniq_id", join_key_2)
fourth_merge_unmatched

Based on the join key [off_first_name, off_last_name, off_sex, off_race, off_birth_year]:
	Number of matches: 9
Based on the join key [off_first_name, off_last_name, off_sex, off_race, off_birth_year]:
	Number of unmatched records: 553


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fourth_merge_matched.loc[:,"notes"] = "merged on [" + ", ".join(join_key_2) + "] without off_suffix appended"


Unnamed: 0,uof_id,report_number,date_time,uof_address,subject_cb_no,off_first_name,off_last_name,unit,watch,off_height,...,off_middle_initial_prof,off_appointed,ranks_held,off_star_0,off_star_1,off_star_2,off_star_3,off_star_4,off_star_5,off_source
0,10244,2017-00066,2017-10-20 23:20:00,70XX COTTAGE GROVE AVE,19552937,RODNEY,JACKSON,3,,,...,,,,,,,,,,
1,10381,2017-00214,2017-11-01 16:04:00,1XX 104TH ST,,KENNETH,GALVIN,189,,508.0,...,,,,,,,,,,
2,10462,2017-00328,2017-11-08 12:13:00,11XX GARFIELD BLVD,19560964,MARK,HERNANDEZ,189,,511.0,...,,,,,,,,,,
3,10860,2017-00784,2017-12-16 16:14:00,63XX COTTAGE GROVE AVE,19577383,HAROLD,WHITE,3,,508.0,...,,,,,,,,,,
4,10936,2017-00865,2017-12-24 20:25:00,61XX ELLIS AVE,19580777,HAROLD,WHITE,3,,507.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
557,91355,,2016-02-23 00:15:00,5XX VAN BUREN ST,,JAMES,POLASKI,189,,,...,,,,,,,,,,
558,91359,,2016-02-23 11:05:00,70XX STEWART AVE,,MARK,HERNANDEZ,7,,,...,,,,,,,,,,
559,91395,,2016-02-27 12:38:00,88XX WOOD ST,,RAPHAEL,MITCHEM,189,,,...,,,,,,,,,,
560,91799,,2016-04-01 20:14:00,26XX 63RD ST,,RAPHAEL,MITCHEM,189,,,...,,,,,,,,,,


In [1]:
left_by = ["off_assigned_beat", "off_sex", "off_race", "off_birth_year"]
right_by = ["beat", "off_sex", "off_race", "off_year_of_birth"]

missing_names["date_time"] = pd.to_datetime(missing_names["date_time"])
missing_names.sort_values(by="date_time", inplace=True)

i = 0
match_sets = []
missing_names_remaining = missing_names
missing_names_remaining.dropna(subset=["date_time", "off_birth_year"], inplace=True)
missing_names_remaining["off_birth_year"] = missing_names["off_birth_year"].astype(np.int64)

with pd.read_csv("../files/events/officer_id_merged/assignments/assignments_full_merge.csv", chunksize=10000) as reader:
  for chunk in reader:
    print("processing chunk", i)
    i += 1
    chunk["off_race"] = np.where(chunk["off_race"].isna(), chunk["rdesc"], chunk["off_race"])
    convert_race(chunk, ["off_race"])
    chunk["shift_end_corrected"] = pd.to_datetime(chunk["shift_end_corrected"])
    chunk.dropna(subset=["shift_end_corrected", "off_year_of_birth"], inplace=True)
    chunk.sort_values(by="shift_end_corrected", inplace=True)
    merged_chunk = pd.merge_asof(missing_names_remaining, chunk, left_on="date_time", right_on="shift_end_corrected", left_by=left_by, right_by=right_by, tolerance=pd.Timedelta("12 hours"), suffixes=("", "_assignments"))
    match_sets.append(matches(merged_chunk, signifier="off_uniq_id"))
    missing_names_remaining = unmatched(merged_chunk, signifier="off_uniq_id").loc[:, original_cols]

matched_from_assignments = pd.concat(match_sets)
matched_from_assignments["notes"] = "missing officer name; merged from assignments file"

NameError: name 'pd' is not defined

In [62]:
dfs_to_concat = [successfully_merged, second_merge_successes, third_merge_matched, fourth_merge, matched_from_assignments, missing_names_remaining]
full_merge_df = pd.concat(dfs_to_concat)
ids_merged = full_merge_df["uof_id"].to_numpy()
m = ~original_uof_df.isin({"uof_id": ids_merged})
missed_uof_entries = original_uof_df[m]
full_merge_df = pd.concat([full_merge_df, missed_uof_entries])
full_merge_df.sort_values(by=["date_time", "off_last_name", "off_first_name"], inplace=True)
full_merge_df.drop_duplicates(subset=["uof_id", "off_uniq_id"], inplace=True)
full_merge_df.reset_index(drop=True, inplace=True)
full_merge_df.loc[:, out_cols].to_csv(out_file_path + "/full_merge.csv")
full_merge_dupes = dupes(full_merge_df, "uof_id")


Based on the join key []:
	Number of records with more than one match: 63


In [73]:
# full_merge_df.groupby("uof_id").size()
# missing_names.shape[0]
m = full_merge_df.isin({"uof_id": full_merge_dupes.reset_index()["uof_id"].to_numpy()}).any(1)
full_merge_df[m].loc[:, "off_uniq_id"]
# full_merge_dupes.reset_index()

22192    33146.0
22193    33145.0
23250    33146.0
23251    33145.0
27852    33146.0
          ...   
91126     1444.0
91209    10624.0
91210    10627.0
91213      480.0
91214      479.0
Name: off_uniq_id, Length: 126, dtype: float64