In [1]:
import sys; sys.path.append("../")

import pandas as pd

from lib.explore import show_unique, print_df
from lib.path import data_file_path
from lib.match import (
    ColumnsIndex, StringSimilarity, JaroWinklerSimilarity, DateSimilarity, ThresholdMatcher
)
from lib.date import combine_date_columns
from clean.new_orleans_pd_pprr import clean
from clean.new_orleans_csd_pprr import clean_2014 as clean_csd_2014, clean_2009 as clean_csd_2009

# New Orleans PPRR

## Cleaning 1999-2020 NO PD PPRR

In [2]:
# Load the New Orleans PD PPRR frame returned by clean(), then preview only the
# rows whose data_production_year equals the string "2009".
pd_pprr = clean()
pd_pprr.loc[pd_pprr["data_production_year"] == "2009"]

Unnamed: 0,employee_id,birth_year,department_code,rank_code,rank_desc,badge_no,last_name,first_name,middle_initial,data_production_year,agency,uid,middle_name,department_desc,hire_year,hire_month,hire_day
0,569,1963,2702736,C7134,senior police officer,00911,conway,corliss,d,2009,New Orleans PD,8b16ebc056e613024c057be590b542eb,,,,,
1,1142,1958,2702740,C7134,senior police officer,,plustache,daniel,,2009,New Orleans PD,8ce6790cc6a94e65f17f908f462fae85,,,,,
2,1391,1978,2702735,C7132,police officer 2,,marshall,tianay,m,2009,New Orleans PD,7f53f8c6c730af6aeb52e66eb74d8507,,,,,
3,1573,1970,2702731,C7133,police officer 3,,austin,larissa,j,2009,New Orleans PD,a1afc58c6ca9540d057299ec3016d726,,,,,
4,1599,1975,2702750,C7110,police officer,,weber,tamara,l,2009,New Orleans PD,83e8ef518174e1eb6be4a0778d050c9d,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1417,19778,1970,2702750,C7110,police officer,,powell,jeffrey,,2009,New Orleans PD,9d203879c2650125412c8c1e894be590,,,,,
1418,19782,1977,2702733,C7110,police officer,,cummings,eugene,,2009,New Orleans PD,8651920f3ba2e5f8bcd3e58ba0b48584,,,,,
1419,19805,1970,2702736,C7110,police officer,,burnette,jonathan,,2009,New Orleans PD,2aa49677fddf127f4230c74ece40b8a7,,,,,
1420,19938,1977,2702732,C7110,police officer,,moore,michael,,2009,New Orleans PD,bea274dd9c29a248928ccf211b52750f,,,,,


## Matching 2014 PD PPRR against 2014 CSD PPRR 

In [3]:
# Set up the 2014 PD-vs-CSD comparison: one row per person on each side,
# keyed by that side's identifier (uid for PD, mid for CSD).
csd_pprr_14 = clean_csd_2014()

# Attributes carried on both sides of the comparison.
name_year_cols = ["first_name", "last_name", "birth_year"]

# PD side: rows from the "2014" production year only, de-duplicated on uid.
is_2014 = pd_pprr.data_production_year == "2014"
pddf = (
    pd_pprr.loc[is_2014, ["uid"] + name_year_cols]
    .drop_duplicates(["uid"])
    .set_index("uid", drop=True)
)

# CSD side: de-duplicated on mid.
# NOTE: a hire_date column built with combine_date_columns was sketched here
# at one point but is disabled, so no hire date takes part in this comparison.
csddf = (
    csd_pprr_14[["mid"] + name_year_cols]
    .drop_duplicates(["mid"])
    .set_index("mid", drop=True)
)

# Both name columns are scored with Jaro-Winkler similarity.
# NOTE(review): ColumnsIndex(["birth_year"]) presumably limits candidate pairs
# to records sharing a birth_year -- confirm against lib.match.
name_similarities = {
    "last_name": JaroWinklerSimilarity(),
    "first_name": JaroWinklerSimilarity(),
}
matcher = ThresholdMatcher(pddf, csddf, ColumnsIndex(["birth_year"]), name_similarities)

In [4]:
# Eyeball a sample of candidate pairs for the 2014 comparison.
# NOTE(review): lower_bound=0.9 presumably drops pairs scoring below 0.9 -- confirm.
sample_pairs_2014 = matcher.get_sample_pairs(lower_bound=0.9)
print_df(sample_pairs_2014)

                                                                  first_name last_name birth_year
score_range pair_idx sim_score row_key                                                           
1.00-0.95   0        1.000000  c0d447b768342332a35134881a57bef6       jeremy    wilcox       1979
                               31aba9fa946fd7d6dd581abff346d82b       jeremy    wilcox       1979
            1        1.000000  60a6c4002cc7b29142def8871531281a         gary       lee       1951
                               e5e77467097e3373f9021d7a2ad9443f         gary       lee       1951
            2        1.000000  6e96be832cf8bc6b35a956e8fb66c76a    christian      hart       1972
                               b69d075343200394cbbf4a4cef2c95df    christian      hart       1972
            3        0.993770  6504909ab81edf8086a8ec6982a1099a      michael  kitchens       1970
                               50a6219da7b5ce39635575821a844c4b      michael   kitchen       1970
            4       

In [5]:
# Count the pairs returned for lower_bound=0.97 and express that count as a
# share of each input frame. %d truncates the float percentages toward zero.
pairs = matcher.get_index_pairs_within_thresholds(lower_bound=0.97)
num_pairs = len(pairs)

pd_share = num_pairs / len(pddf) * 100
csd_share = num_pairs / len(csddf) * 100
summary_2014 = "%d matched pairs for 2014 (%d%% of PD records, %d%% of CSD records)" % (num_pairs, pd_share, csd_share)
print(summary_2014)

13 matched pairs for 2014 (1% of PD records, 0% of CSD records)


## Matching 2009 PD PPRR against 2009 CSD PPRR 

In [6]:
# Set up the 2009 PD-vs-CSD comparison. Both frames are reduced to one row per
# identifier (uid for PD, mid for CSD) and get an "fc" column holding the first
# character of first_name, which is the column handed to ColumnsIndex.
csd_pprr_09 = clean_csd_2009()

# The first character is taken with the vectorised .str accessor rather than a
# Python lambda: .str[:1] yields NaN for a missing first name, whereas
# `x[:1]` on a NaN (float) raises TypeError and aborts the whole cell.
# NOTE(review): how ColumnsIndex treats a NaN "fc" key is not visible here --
# confirm if rows without a first name can occur.
pddf2 = (
    pd_pprr.loc[
        pd_pprr.data_production_year == "2009",
        ["uid", "first_name", "last_name", "rank_code", "rank_desc"],
    ]
    .drop_duplicates(["uid"])
    .set_index("uid", drop=True)
    .assign(fc=lambda df: df.first_name.str[:1])
)

csddf2 = (
    csd_pprr_09[["mid", "first_name", "last_name", "rank_code", "rank_desc"]]
    .drop_duplicates(["mid"])
    .set_index("mid", drop=True)
    .assign(fc=lambda df: df.first_name.str[:1])
)

# Both name columns are scored with Jaro-Winkler similarity; rank_code and
# rank_desc are carried along for inspection only (they have no scorer).
matcher2 = ThresholdMatcher(pddf2, csddf2, ColumnsIndex(["fc"]), {
    "last_name": JaroWinklerSimilarity(),
    "first_name": JaroWinklerSimilarity(),
})

In [7]:
# Eyeball a sample of candidate pairs for the 2009 comparison.
# NOTE(review): step=0.025 presumably sets the width of each score_range
# bucket and lower_bound=0.8 the lowest score shown -- confirm.
sample_pairs_2009 = matcher2.get_sample_pairs(step=0.025, lower_bound=0.8)
print_df(sample_pairs_2009)

                                                                  first_name        last_name rank_code                       rank_desc fc
score_range pair_idx sim_score row_key                                                                                                    
1.00-0.97   0        0.990951  d60743aab4b625940d39b3b51c3c6a78     sterling         williams     C7111                 police sergeant  s
                               f237583945fdd1e7c083ee4e9f5a04c1     sterling      williams jr     C7111                 police sergeant  s
            1        0.990951  6613167eafbef4130c088f48920bfdea        eddie         williams     C7110                  police officer  e
                               63e3bf02e9b027ddc4f40c59cd37504e        eddie      williams sr     C7143     police technical specialist  e
            2        0.985297  16d459c4b261f3015061a7cc7362851c       monica         mclaurin     C7110                  police officer  m
                           

In [20]:
# Print the match summary for the 2009 comparison at a 0.97 threshold.
decision_threshold_2009 = 0.97
matcher2.print_decision(decision_threshold_2009)

for threshold 0.970:
  572 matched pairs (40% of A, 29% of B)


In [9]:
# Export the 2009 comparison pairs to an Excel workbook for manual review.
# NOTE(review): the second positional argument is 0.96 here, while
# matcher2.print_decision is called with 0.97 and the matcher4 export passes
# 0.97 in the same position -- confirm which cut-off is intended.
pairs_workbook_2009 = data_file_path("new_orleans_2.xlsx")
matcher2.save_pairs_to_excel(pairs_workbook_2009, 0.96, lower_bound=0.9)

## Matching 1999-2020 PD PPRR (excluding the 2018 production year) against 2014 CSD PPRR

In [10]:
# Compare PD rows from every production year except "2018" against the 2014
# CSD roster, one row per identifier on each side.
name_year_cols = ["first_name", "last_name", "birth_year"]

not_2018 = pd_pprr.data_production_year != "2018"
pddf3 = (
    pd_pprr.loc[not_2018, ["uid"] + name_year_cols]
    .drop_duplicates(["uid"])
    .set_index("uid", drop=True)
)

csddf3 = (
    csd_pprr_14[["mid"] + name_year_cols]
    .drop_duplicates(["mid"])
    .set_index("mid", drop=True)
)

# Both name columns are scored with Jaro-Winkler similarity.
# NOTE(review): ColumnsIndex(["birth_year"]) presumably limits candidate pairs
# to records sharing a birth_year -- confirm against lib.match.
name_similarities = {
    "last_name": JaroWinklerSimilarity(),
    "first_name": JaroWinklerSimilarity(),
}
matcher3 = ThresholdMatcher(pddf3, csddf3, ColumnsIndex(["birth_year"]), name_similarities)

In [11]:
# Eyeball a sample of candidate pairs for the all-years PD vs 2014 CSD comparison.
# NOTE(review): lower_bound=0.8 presumably drops pairs scoring below 0.8 -- confirm.
sample_pairs_all_vs_2014 = matcher3.get_sample_pairs(lower_bound=0.8)
print_df(sample_pairs_all_vs_2014)

                                                                   first_name    last_name birth_year
score_range pair_idx sim_score row_key                                                               
1.00-0.95   0        0.985297  16d459c4b261f3015061a7cc7362851c        monica     mclaurin       1982
                               e8479c7a46d4edde888f348df4e167c7        monica    mc laurin       1982
            1        0.983475  967c2ae04b169f07e7fa8fdfd110551e        currin  guillory jr       1953
                               560c02d765705f8d642eb55c7818970e      currin c  guillory jr       1953
            2        0.978030  33beffd09a1b020d1187c6b4b264014a         henry       newton       1944
                               cf2c24441fa36a5ed959c0902edbb0de         henry    newton jr       1944
            3        0.978030  c6ae9174774e254650073722e5b92a8f          john       brunet       1954
                               8f68dd8f3eb97136dfdc4e208c1619b8          john    b

In [12]:
# Count the pairs returned for lower_bound=0.97 and express that count as a
# share of each input frame. %d truncates the float percentages toward zero.
pairs = matcher3.get_index_pairs_within_thresholds(lower_bound=0.97)
num_pairs = len(pairs)

pd_share = num_pairs / len(pddf3) * 100
csd_share = num_pairs / len(csddf3) * 100
summary_all_vs_2014 = "%d matched pairs for PD all years vs CSD 2014 (%d%% of PD records, %d%% of CSD records)" % (
    num_pairs,
    pd_share,
    csd_share,
)
print(summary_all_vs_2014)

1544 matched pairs for PD all years vs CSD 2014 (50% of PD records, 42% of CSD records)


## Matching 1999-2020 PD PPRR (excluding the 2018 production year) against 2009 CSD PPRR

In [13]:
# Compare PD rows from every production year except "2018" against the 2009
# CSD roster. Both frames are reduced to one row per identifier (uid for PD,
# mid for CSD) and get an "fc" column holding the first character of
# first_name, which is the column handed to ColumnsIndex.
#
# The first character is taken with the vectorised .str accessor rather than a
# Python lambda: .str[:1] yields NaN for a missing first name, whereas
# `x[:1]` on a NaN (float) raises TypeError and aborts the whole cell.
# NOTE(review): how ColumnsIndex treats a NaN "fc" key is not visible here --
# confirm if rows without a first name can occur.
pddf4 = (
    pd_pprr.loc[
        pd_pprr.data_production_year != "2018",
        ["uid", "first_name", "last_name", "rank_code", "rank_desc"],
    ]
    .drop_duplicates(["uid"])
    .set_index("uid", drop=True)
    .assign(fc=lambda df: df.first_name.str[:1])
)

csddf4 = (
    csd_pprr_09[["mid", "first_name", "last_name", "rank_code", "rank_desc"]]
    .drop_duplicates(["mid"])
    .set_index("mid", drop=True)
    .assign(fc=lambda df: df.first_name.str[:1])
)

# Both name columns are scored with Jaro-Winkler similarity; rank_code and
# rank_desc are carried along for inspection only (they have no scorer).
matcher4 = ThresholdMatcher(pddf4, csddf4, ColumnsIndex(["fc"]), {
    "last_name": JaroWinklerSimilarity(),
    "first_name": JaroWinklerSimilarity(),
})

In [14]:
# Eyeball a sample of candidate pairs for the all-years PD vs 2009 CSD comparison.
# NOTE(review): step=0.025 presumably sets the width of each score_range
# bucket and lower_bound=0.9 the lowest score shown -- confirm.
sample_pairs_all_vs_2009 = matcher4.get_sample_pairs(step=0.025, lower_bound=0.9)
print_df(sample_pairs_all_vs_2009)

                                                                first_name   last_name rank_code                       rank_desc fc
score_range pair_idx sim_score row_key                                                                                             
1.00-0.97   0        0.985114  84d2f3b46b8baf9d28b497f803e7f46d       john     richard     C7134           senior police officer  j
                               fc14dde188e0a0e173bdb00367428768       john  richardson       NaN  protective services officer iv  j
            1        0.980204  629cfb1750e1aafd9fd8b37d5fa6e982      erica    williams     C7110                  police officer  e
                               507498fd06ac7f9ceb34d6d2e0a31d2f       eric    williams     C7111                 police sergeant  e
            2        0.978030  c8194823937cbafcfcef5b0c21de59bd    jeffrey      vappie     C7111                 police sergeant  j
                               159899c80f7097283c2299862d948fde    jeffrey  

In [15]:
# Print the match summary for the all-years PD vs 2009 CSD comparison at a
# 0.97 threshold.
decision_threshold_all_vs_2009 = 0.97
matcher4.print_decision(decision_threshold_all_vs_2009)

for threshold 0.970:
  656 matched pairs (21% of A, 34% of B)


In [16]:
# Export the all-years PD vs 2009 CSD pairs to an Excel workbook for manual
# review. The second positional argument (0.97) is the same value passed to
# matcher4.print_decision.
pairs_workbook_all_vs_2009 = data_file_path("new_orleans_4.xlsx")
matcher4.save_pairs_to_excel(pairs_workbook_all_vs_2009, 0.97, lower_bound=0.9)

In [17]:
# Distinct rank_code values in the 2009 CSD roster (NaN is included if present).
csd_pprr_09["rank_code"].unique()

array(['C7134', 'C7109', 'C7133', 'U0018', 'C7110', 'C7132', 'C0020',
       'C7104', 'C7113', 'C7112', 'C7101', 'C7111', 'C7142', 'C7108',
       'C0363', 'C0402', 'C0070', 'C6170', 'C7102', 'C7162', 'C1116',
       'C7141', 'C6026', 'C0145', 'C7122', 'C7106', 'C7143', 'C0061',
       'C0029', 'C0364', 'C7107', 'C7105', nan, 'C7103', 'C0365', 'C5251',
       'C0032', 'C0060', 'C6120', 'C5250', 'C2323', 'C7144', 'C0171',
       'C2501', 'C0420', 'C6027', 'C0161', 'C0410', 'C0238', 'C2502'],
      dtype=object)

In [18]:
# Distinct rank_code values in the 2014 CSD roster (NaN is included if present).
csd_pprr_14["rank_code"].unique()

array(['C0025', 'C5201', 'C7134', 'C7112', 'C7132', 'C7110', 'C7109',
       'C7162', 'C0413', 'C7107', 'C7133', 'C7101', 'C7111', 'C6205',
       'C7106', 'C7141', 'C7105', 'C7170', 'C7113', 'C0020', 'C0100',
       'C7150', 'C0043', 'C0140', 'C7108', 'C7171', 'C0322', 'C0420',
       'C7143', 'C5250', 'C0141', 'C6025', 'C0023', 'C0061', 'C0022',
       'C0102', 'C7142', 'C7159', 'C7102', 'C6026', 'C0101', 'C7172',
       'C0132', 'C7122', 'C2320', 'C0030', 'C7173', 'C0021', 'C1116',
       'C2501', 'C0238', 'C0244', 'C4048', 'C0246', 'C6033', 'R9999',
       'C1741', 'U0138', 'U0018', 'C0070', 'C0243', 'C6027', 'C0162',
       'C5251', 'C7186', 'C7185', 'C2500', 'C0145', 'C7103', 'C0403',
       'C0249', 'C0409', 'C2502', 'C7104', 'C0363', 'C2322', 'C0421',
       'U0003', 'C0360', 'C6170', 'C5253', 'C0032', 'C0060', 'C2323',
       'C0364', 'C0361', 'C0402', 'C5256', 'C0366', 'C0410', 'C5252',
       'C2321', 'U6080', 'C0178', 'C0362', 'C7144'], dtype=object)

In [19]:
# How many distinct 2009 CSD employees (by mid) carry a rank code that also
# appears somewhere in the PD PPRR?
#
# Missing rank codes are dropped from the PD list first: Series.isin treats
# NaN as equal to NaN, and csd_pprr_09.rank_code does contain NaN, so a single
# NaN on the PD side would make every CSD row without a rank code count as
# sharing a PD rank code and inflate the percentage.
pd_rank_code = pd_pprr.rank_code.dropna().unique().tolist()

# Filter first, then de-duplicate, so an employee is counted if any of their
# rows carries a PD rank code.
has_pd_rank = csd_pprr_09.rank_code.isin(pd_rank_code)
l1 = csd_pprr_09[has_pd_rank].drop_duplicates(["mid"]).shape[0]
l2 = csd_pprr_09.drop_duplicates(["mid"]).shape[0]
print("%d (out of %d so %d%%)" % (l1, l2, l1 / l2 * 100))

1491 (out of 1910 so 78%)
