In [1]:
import pandas as pd

unique_key = ["off_last_name","off_first_name","off_middle_initial","off_rank","off_star","off_race","off_sex","off_year_of_birth","off_appointed"]
profs_df = pd.DataFrame(columns=unique_key)
intermediary_dfs = []

with pd.read_csv("../files/events/assignment_cleaned.csv", chunksize=10000) as reader:
  for chunk in reader:
    profs_df = pd.concat([profs_df, chunk.loc[:,unique_key]]).drop_duplicates(subset=unique_key)

profs_df.sort_values(by=["off_last_name", "off_first_name", "off_middle_initial"], inplace=True, ignore_index=True)
profs_df.to_csv("../files/profiles/officer_profs_test.csv")

In [2]:
import pandas as pd
import numpy as np
from functools import partial

def star_at(index: int, stars: pd.Series) -> int:
  try:
    return int(stars.dropna().sort_values().reset_index(drop=True).iat[index])
  except:
    return np.nan

profs = pd.read_csv("../files/profiles/officer_profs_test.csv")
# profs[profs["off_race"].isna()].to_csv("../files/profiles/missing_race.csv")
cols = ["off_last_name","off_first_name","off_middle_initial","off_rank","off_star","off_race","off_sex","off_year_of_birth","off_appointed"]
group_key = ["off_last_name","off_first_name","off_middle_initial","off_sex","off_year_of_birth","off_appointed"]
# for the off_star_x cols to be useful we probably want to order by date desc
# otherwise this is just useful for disambiguation
profs_df = profs.groupby(group_key, dropna=False).agg(
  off_race=("off_race", "first"),
  ranks_held=("off_rank", lambda ranks: ', '.join(ranks.dropna().unique())),
  off_star_0=("off_star", partial(star_at, 0)),
  off_star_1=("off_star", partial(star_at, 1)),
  off_star_2=("off_star", partial(star_at, 2)),
  off_star_3=("off_star", partial(star_at, 3)),
  off_star_4=("off_star", partial(star_at, 4)),
  off_star_5=("off_star", partial(star_at, 5)),
).reset_index()

""" just for bookkeeping """
profs_df["off_source"] = "assignments_file"

profs_df.index.name = "off_uniq_id"

profs_df.to_csv("../files/profiles/officers_index.csv")

In [3]:
print(profs_df.shape[0])
print(profs_df[profs_df["off_race"].isna()].shape[0])
profs_df.dropna(subset=["off_star_5"])


15906
507


Unnamed: 0_level_0,off_last_name,off_first_name,off_middle_initial,off_sex,off_year_of_birth,off_appointed,off_race,ranks_held,off_star_0,off_star_1,off_star_2,off_star_3,off_star_4,off_star_5,off_source
off_uniq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
5064,GOLDEN,MICHAEL,A,M,1991,2015-12-14,WHITE,POLICE OFFICER,3156.0,16833.0,18771.0,19217.0,19686.0,19686.0,assignments_file
13727,STEVENS,JILL,M,F,1978,2002-03-25,WHITE,COMMANDER,24.0,178.0,626.0,627.0,1222.0,1222.0,assignments_file


In [57]:
import pandas as pd
import datetime
from data_tweak.converters import convert_race, convert_sex

# fix a weird date conversion issue that happens w/ 2-digit dates
def fix_future_dates(date: datetime) -> datetime:
  if pd.isnull(date):
    return pd.NaT
  elif date.year > 2022:
    return date - pd.DateOffset(years=100)
  else:
    return date

officers_index_df = pd.read_csv("../files/profiles/officers_index.csv", index_col="off_uniq_id", parse_dates=["off_appointed"], date_parser=lambda d: pd.to_datetime(d, format="%Y-%m-%d"))
kiefer_roster_df = pd.read_csv("../files/profiles/kiefer_roster.csv", parse_dates=["Appointed Date", "Resignation Date"], date_parser=lambda d: pd.to_datetime(d, format="%d-%b-%y"))
kiefer_roster_df["Appointed Date"] = kiefer_roster_df["Appointed Date"].apply(fix_future_dates)
kiefer_roster_df["Resignation Date"] = kiefer_roster_df["Resignation Date"].apply(fix_future_dates)
convert_race(officers_index_df, ["off_race"])
convert_sex(officers_index_df, ["off_sex"])
convert_race(kiefer_roster_df, ["Race"])
convert_sex(kiefer_roster_df, ["Gender"])

# first, assign birthdate ranges
kiefer_roster_df = kiefer_roster_df.rename(columns={
  "Last Name": "off_last_name",
  "First Name": "off_first_name",
  "Middle Initital": "off_middle_initial",
  "Gender": "off_sex",
  "Race": "off_race",
  "D.O.B.": "off_year_of_birth",
  "Appointed Date": "off_appointed",
  "Resignation Date": "off_resignation",
  "Description": "ranks_held",
  "Star 1": "off_star_0",
  "Star 2": "off_star_1",
  "Star 3": "off_star_2",
  "Star 4": "off_star_3",
  "Star 5": "off_star_4",
  "Star 6": "off_star_5"
})

kiefer_roster_df["off_source"] = "kiefer"
kiefer_roster_df.index.name = "off_uniq_id"
kiefer_roster_df["off_year_of_birth"] = kiefer_roster_df["off_year_of_birth"].astype(int, errors="ignore")

# kiefer_roster_df
kiefer_roster_df = kiefer_roster_df.loc[:, officers_index_df.columns]
print(type(officers_index_df.off_appointed.iat[0]))
print(type(kiefer_roster_df.off_appointed.iat[0]))

output_df = pd.concat([officers_index_df, kiefer_roster_df], ignore_index=True).drop_duplicates(subset=["off_last_name","off_first_name","off_middle_initial","off_sex", "off_year_of_birth", "off_appointed", "off_race"]).sort_values(by=["off_last_name", "off_first_name", "off_middle_initial"], ignore_index=True)
output_df.index.name = "off_uniq_id"
output_df.to_csv("../files/profiles/officer_index_plus_kiefer.csv")

<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
