In [1]:
import pandas as pd
from assign_unique_ids_functions import *
import inspect
import itertools
import numpy as np

In [2]:
def intersect(a, b):
     return list(set(a) & set(b))
def setdiff(a,b):
    return list(set(a) - set(b))
def union(a,b):
    return list(set(a) | set(b))

In [3]:
asd = pd.read_csv("input/all-sworn_demographics.csv.gz")
asud = pd.read_csv("input/all-sworn-units_demographics.csv.gz")
print(asd.sort_values('Last.Name').head())
print(asd.sort_values('Last.Name').head())

   all_sworn_ID First.Name Last.Name Suffix.Name Appointed.Date  Birth.Year  \
0         12735    JEFFERY     AARON         NaN     2005-09-26        1971   
1         16413     KARINA     AARON         NaN     2005-09-26        1980   
2          4767     DANIEL     ABATE         NaN     1970-06-15        1942   
3          1207    ANTHONY    ABBATE         NaN     1994-12-05        1968   
4          3134     CARMEL    ABBATE         NaN     1969-01-06        1942   

   Gender      Race Middle.Initial  
0    MALE     WHITE              M  
1  FEMALE  HISPANIC            NaN  
2    MALE     WHITE              P  
3    MALE     WHITE              G  
4    MALE     WHITE              G  
   all_sworn_ID First.Name Last.Name Suffix.Name Appointed.Date  Birth.Year  \
0         12735    JEFFERY     AARON         NaN     2005-09-26        1971   
1         16413     KARINA     AARON         NaN     2005-09-26        1980   
2          4767     DANIEL     ABATE         NaN     1970-06-15   

In [None]:
asd[asd['Last.Name'].str.contains('JUR')].sort_values('First.Name')

In [4]:
def take_first_four(x):
    return x[:4]


def BY_to_CA(x):
    return 2016 - x


def add_columns(df, add_cols = ["F4FN", "F4LN", "Current.Age", "BY_to_CA"]):
    if "F4FN" in add_cols and "First.Name" in df.columns:
            df['F4FN'] = df['First.Name'].map(take_first_four)
    if "F4LN" in add_cols and 'Last.Name' in df.columns:
            df['F4LN'] = df['Last.Name'].map(take_first_four)
    if "Current.Age" in add_cols and "Current.Age" in df.columns:
        df['Current.Age.p1'] = df['Current.Age']
        df['Current.Age.m1'] = df['Current.Age']
    if "BY_to_CA" in add_cols and "Birth.Year" in df.columns:
        by_to_ca = lambda x: 2016 - x
        df['Current.Age.p1'] = df['Birth.Year'].map(BY_to_CA)
        df['Current.Age.m1'] = df['Birth.Year'].map(BY_to_CA) - 1  
    return df

In [5]:
def generate_on_lists(data_cols, base_lists):
    merge_list = []
    
    for col_list in base_lists:
        if intersect(col_list, data_cols):
            ml = intersect(col_list, data_cols)
            if '' in col_list:
                ml.append('')
            merge_list.append(sorted(ml, reverse=True))
            
    merge_list = list(itertools.product(*reversed(merge_list)))
    merge_list = [[i for i in ml if i != ''] for ml in merge_list]
    
    return merge_list

In [6]:
def loop_merge(df1, df2, on_lists, keep_columns, return_unmatched = True):
    dfm = pd.DataFrame(columns = keep_columns + ['Match'])
    for mc in on_lists:
        df1t = remove_duplicates(df1[keep_columns[:1] + mc], mc)
        df2t = remove_duplicates(df2[keep_columns[1:] + mc], mc)
        dfmt = df1t.merge(df2t, on=mc, how='inner')
        if dfmt.shape[0] > 0:
            print('******')
            print(mc)
            print(dfmt.shape[0])
            print('******')
            dfmt['Match'] = '-'.join(mc)
            dfm = dfm.append(dfmt[keep_columns + ['Match']].reset_index(drop=True))
            df1 = df1.loc[~df1[keep_columns[0]].isin(dfm[keep_columns[0]])]
            df2 = df2.loc[~df2[keep_columns[1]].isin(dfm[keep_columns[1]])]
    print(dfm.shape[0], df1.shape[0], df2.shape[0])
    if return_unmatched:
        return (dfm.reset_index(drop=True), df1, df2)
    else:
        return dfm.reset_index(drop=True)

In [7]:
def merge_datasets(df1, df2, keep_columns,
                   custom_matches = [], return_unmatched = True, name_changes=True):
    df1 = df1.dropna(axis=1, how='all')
    df2 = df2.dropna(axis=1, how='all')
    
    if "Birth.Year" not in intersect(df1.columns, df2.columns):
        add_cols = ["F4FN", "F4LN", "BY_to_CA", "Current.Age"]
    else:
        add_cols = ["F4FN", "F4LN"]
        
    df1 = add_columns(df1, add_cols)
    df2 = add_columns(df2, add_cols)
    
    
    cols = intersect(df1.columns, df2.columns)
    
    df1 = df1[[col for col in df1.columns
               if col in cols or col == keep_columns[0]]]
    df2 = df2[[col for col in df2.columns
               if col in cols or col == keep_columns[1]]]
    
    base_lists = [
        ['Current.Star', 'Star1', 'Star2', 'Star3', 'Star4', 'Star5','Star6', 'Star7', 'Star8', 'Star9', 'Star10'],
        ['First.Name', 'F4FN'], ['Last.Name', 'F4LN'], ['Appointed.Date'],
        ['Birth.Year', 'Current.Age', 'Current.Age.p1', 'Current.Age.m1', ''],
        ['Middle.Initial', ''], ['Gender', ''], ['Race', ''], ['Suffix.Name', '']
    ]
    
    on_lists = generate_on_lists(cols, base_lists)
    
    if custom_matches:
        on_lists.append(custom_matches)
    
    if name_changes:
        nc_lists = generate_on_lists(cols, [ml for ml in base_lists if "Last.Name" not in ml])
        nc_lists = [nc_list for nc_list in nc_lists if len(nc_list) > 3]
        on_lists.extend(nc_lists)
    
    merged_data = loop_merge(df1, df2, on_lists=on_lists, keep_columns=keep_columns, return_unmatched=return_unmatched)
        
    return(merged_data)

In [8]:
def append_to_reference(df1, df2, keep_columns, custom_matches = [], return_unmatched=False, name_changes=True):
    ml = merge_datasets(df1, df2, keep_columns=keep_columns, 
                        custom_matches=custom_matches, name_changes=name_changes)

    ref = pd.concat([ml[0][keep_columns],
                    ml[1][[keep_columns[0]]],
                    ml[2][[keep_columns[1]]]])[
        keep_columns].reset_index(drop=True)

    if "UID" not in ref.columns:
        ref.insert(0, 'UID', ref.index + 1)
    else:
        ref = ref.sort_values('UID', na_position='last')
        ref['UID'] = ref.index + 1
    df1 = df1.merge(ref, on=keep_columns[0], how='left')
    df2 = df2.merge(ref, on=keep_columns[1], how='left')

    ref = pd.concat([df1, df2]).reset_index(drop=True)
    if return_unmatched:
        return (ref, ml[1], ml[2])
    else:
        return ref

In [9]:
t1 = append_to_reference(asd, asud, ["all_sworn_ID", "all_sworn_units_ID"])

******
['Suffix.Name', 'Race', 'Gender', 'Middle.Initial', 'Birth.Year', 'Appointed.Date', 'Last.Name', 'First.Name']
32006
******


  stride //= shape[i]


32006 0 0


In [10]:
ref = t1
profile_cols = ["First.Name", "Last.Name", "Middle.Initial", "Suffix.Name",
                "Appointed.Date", "Birth.Year", "Current.Unit", "Race", "Gender",
               "all_sworn_ID","all_sworn_units_ID"]
mode_cols = profile_cols
profiles = aggregate_data(ref, "UID", mode_cols = mode_cols)
print(profiles.shape)

(32006, 12)


In [13]:
profiles[profiles['Last.Name']=='AARON']

Unnamed: 0,UID,First.Name,Last.Name,Middle.Initial,Suffix.Name,Appointed.Date,Birth.Year,Current.Unit,Race,Gender,all_sworn_ID,all_sworn_units_ID
0,25766,JEFFERY,AARON,M,,2005-09-26,1971,3.0,WHITE,MALE,12735,12735
1,9516,KARINA,AARON,,,2005-09-26,1980,15.0,HISPANIC,FEMALE,16413,16413


In [32]:
amd = pd.read_csv("input/all-members_demographics.csv.gz")
print(amd.shape)
print(len(amd['all_members_ID'].unique()))
amd.head()

(31925, 9)
31925


Unnamed: 0,all_members_ID,First.Name,Last.Name,Suffix.Name,Appointed.Date,Birth.Year,Gender,Race,Middle.Initial
0,12696,JEFFERY,AARON,,2005-09-26,1971,MALE,WHITE,M
1,16366,KARINA,AARON,,2005-09-26,1980,FEMALE,HISPANIC,
2,4746,DANIEL,ABATE,,1970-06-15,1942,MALE,WHITE,P
3,1201,ANTHONY,ABBATE,,1994-12-05,1968,MALE,WHITE,G
4,3122,CARMEL,ABBATE,,1969-01-06,1942,MALE,WHITE,G


In [15]:
profiles.head()

Unnamed: 0,UID,First.Name,Last.Name,Middle.Initial,Suffix.Name,Appointed.Date,Birth.Year,Current.Unit,Race,Gender,all_sworn_ID,all_sworn_units_ID
0,25766,JEFFERY,AARON,M,,2005-09-26,1971,3.0,WHITE,MALE,12735,12735
1,9516,KARINA,AARON,,,2005-09-26,1980,15.0,HISPANIC,FEMALE,16413,16413
2,26592,DANIEL,ABATE,P,,1970-06-15,1942,543.0,WHITE,MALE,4767,4767
3,20300,ANTHONY,ABBATE,G,,1994-12-05,1968,20.0,WHITE,MALE,1207,1207
4,20068,CARMEL,ABBATE,G,,1969-01-06,1942,640.0,WHITE,MALE,3134,3134


In [16]:
t2 = append_to_reference(profiles, amd, ["UID", "all_members_ID"],return_unmatched=False)

******
['Suffix.Name', 'Race', 'Gender', 'Middle.Initial', 'Birth.Year', 'Appointed.Date', 'Last.Name', 'First.Name']
31911
******
******
['Suffix.Name', 'Race', 'Gender', 'Middle.Initial', 'Birth.Year', 'Appointed.Date', 'F4LN', 'First.Name']
3
******
******
['Suffix.Name', 'Race', 'Gender', 'Middle.Initial', 'Appointed.Date', 'Last.Name', 'First.Name']
1
******
******
['Suffix.Name', 'Gender', 'Middle.Initial', 'Birth.Year', 'Appointed.Date', 'Last.Name', 'First.Name']
1
******
******
['Suffix.Name', 'Race', 'Gender', 'Middle.Initial', 'Birth.Year', 'Appointed.Date', 'First.Name']
8
******
31924 82 1


In [17]:
profile_cols = ["First.Name", "Last.Name", "Middle.Initial", "Suffix.Name",
                "Appointed.Date", "Birth.Year", "Current.Unit", "Race", "Gender",
               "all_sworn_ID","all_sworn_units_ID", "all_members_ID"]
mode_cols = profile_cols

In [18]:
print(mode_cols)
t2.head()

['First.Name', 'Last.Name', 'Middle.Initial', 'Suffix.Name', 'Appointed.Date', 'Birth.Year', 'Current.Unit', 'Race', 'Gender', 'all_sworn_ID', 'all_sworn_units_ID', 'all_members_ID']


Unnamed: 0,Appointed.Date,Birth.Year,Current.Unit,First.Name,Gender,Last.Name,Middle.Initial,Race,Suffix.Name,UID,all_members_ID,all_sworn_ID,all_sworn_units_ID
0,2005-09-26,1971,3.0,JEFFERY,MALE,AARON,M,WHITE,,25766,24612,12735,12735
1,2005-09-26,1980,15.0,KARINA,FEMALE,AARON,,HISPANIC,,9516,13138,16413,16413
2,1970-06-15,1942,543.0,DANIEL,MALE,ABATE,P,WHITE,,26592,11291,4767,4767
3,1994-12-05,1968,20.0,ANTHONY,MALE,ABBATE,G,WHITE,,20300,19792,1207,1207
4,1969-01-06,1942,640.0,CARMEL,MALE,ABBATE,G,WHITE,,20068,23136,3134,3134


In [19]:
profiles2 = aggregate_data(t2, "UID", mode_cols = mode_cols)

First.Name




Last.Name
Middle.Initial
Suffix.Name
Appointed.Date
Birth.Year
Race
Gender


In [20]:
profiles.head()

Unnamed: 0,UID,First.Name,Last.Name,Middle.Initial,Suffix.Name,Appointed.Date,Birth.Year,Current.Unit,Race,Gender,all_sworn_ID,all_sworn_units_ID
0,25766,JEFFERY,AARON,M,,2005-09-26,1971,3.0,WHITE,MALE,12735,12735
1,9516,KARINA,AARON,,,2005-09-26,1980,15.0,HISPANIC,FEMALE,16413,16413
2,26592,DANIEL,ABATE,P,,1970-06-15,1942,543.0,WHITE,MALE,4767,4767
3,20300,ANTHONY,ABBATE,G,,1994-12-05,1968,20.0,WHITE,MALE,1207,1207
4,20068,CARMEL,ABBATE,G,,1969-01-06,1942,640.0,WHITE,MALE,3134,3134


In [21]:
profiles2.head()

Unnamed: 0,UID,First.Name,Last.Name,Middle.Initial,Suffix.Name,Appointed.Date,Birth.Year,Current.Unit,Race,Gender,all_sworn_ID,all_sworn_units_ID,all_members_ID
0,25766,JEFFERY,AARON,M,,2000-02-28,1971,3.0,WHITE,MALE,12735,12735,24612
1,9516,JESSICA,AARON,,,2005-09-26,1980,15.0,HISPANIC,FEMALE,16413,16413,13138
2,26592,DANIEL,ABATE,P,,1970-06-15,1942,543.0,WHITE,MALE,4767,4767,11291
3,20300,ANTHONY,ABBATE,G,,1994-12-05,1968,20.0,WHITE,MALE,1207,1207,19792
4,20068,CARMEL,ABBATE,G,,1969-01-06,1942,640.0,WHITE,MALE,3134,3134,23136


In [26]:
t2[t2['all_members_ID']==24612]

Unnamed: 0,Appointed.Date,Birth.Year,Current.Unit,First.Name,Gender,Last.Name,Middle.Initial,Race,Suffix.Name,UID,all_members_ID,all_sworn_ID,all_sworn_units_ID
0,2005-09-26,1971,3.0,JEFFERY,MALE,AARON,M,WHITE,,25766,24612,12735.0,12735.0
57766,2000-02-28,1975,,RICHARD,MALE,SCHLECHT,M,WHITE,,25766,24612,,


In [29]:
t2[t2['all_members_ID']==12696]

Unnamed: 0,Appointed.Date,Birth.Year,Current.Unit,First.Name,Gender,Last.Name,Middle.Initial,Race,Suffix.Name,UID,all_members_ID,all_sworn_ID,all_sworn_units_ID
2000,1998-12-14,1969,44.0,STEVEN,MALE,BIENEMAN,M,WHITE,,25685,12696,28154.0,28154.0
32006,2005-09-26,1971,,JEFFERY,MALE,AARON,M,WHITE,,25685,12696,,


In [27]:
amd[amd['all_members_ID']==24612]

Unnamed: 0,all_members_ID,First.Name,Last.Name,Suffix.Name,Appointed.Date,Birth.Year,Gender,Race,Middle.Initial
25760,24612,RICHARD,SCHLECHT,,2000-02-28,1975,MALE,WHITE,M


In [30]:
amd[amd['all_members_ID']==12696]

Unnamed: 0,all_members_ID,First.Name,Last.Name,Suffix.Name,Appointed.Date,Birth.Year,Gender,Race,Middle.Initial
0,12696,JEFFERY,AARON,,2005-09-26,1971,MALE,WHITE,M


In [28]:
amd[(amd['First.Name']=='JEFFERY') & (amd['Last.Name']=='AARON')]

Unnamed: 0,all_members_ID,First.Name,Last.Name,Suffix.Name,Appointed.Date,Birth.Year,Gender,Race,Middle.Initial
0,12696,JEFFERY,AARON,,2005-09-26,1971,MALE,WHITE,M


In [None]:
# Wtf... ok so check id matching within the
# append to reference step to make sure matches are right
# Then look at aggregate data again

In [34]:
uhd = pd.read_csv("input/unit-history_demographics.csv.gz")

In [35]:
uhd.columns

Index(['unit_history_ID', 'First.Name', 'Last.Name', 'Middle.Initial',
       'Suffix.Name', 'Appointed.Date', 'Current.Age', 'Gender', 'Race',
       'Star1', 'Star2', 'Star3', 'Star4', 'Star5', 'Star6', 'Star7', 'Star8',
       'Star9', 'Star10', 'Current.Unit'],
      dtype='object')

In [38]:
t3 = append_to_reference(profiles, uhd, ["UID", "unit_history_ID"],return_unmatched=False)

******
['Suffix.Name', 'Race', 'Gender', 'Middle.Initial', 'Current.Age.p1', 'Appointed.Date', 'Last.Name', 'First.Name']
1311
******
******
['Suffix.Name', 'Race', 'Gender', 'Middle.Initial', 'Current.Age.m1', 'Appointed.Date', 'Last.Name', 'First.Name']
4284
******
******
['Suffix.Name', 'Race', 'Gender', 'Middle.Initial', 'Current.Age.m1', 'Appointed.Date', 'F4LN', 'First.Name']
2
******
******
['Suffix.Name', 'Race', 'Gender', 'Middle.Initial', 'Appointed.Date', 'Last.Name', 'First.Name']
1
******
******
['Suffix.Name', 'Race', 'Gender', 'Current.Age.p1', 'Appointed.Date', 'Last.Name', 'First.Name']
6014
******
******
['Suffix.Name', 'Race', 'Gender', 'Current.Age.m1', 'Appointed.Date', 'Last.Name', 'First.Name']
19937
******
******
['Suffix.Name', 'Race', 'Gender', 'Current.Age.m1', 'Appointed.Date', 'Last.Name', 'F4FN']
1
******
******
['Suffix.Name', 'Race', 'Gender', 'Current.Age.m1', 'Appointed.Date', 'F4LN', 'First.Name']
4
******
******
['Suffix.Name', 'Race', 'Gender', 'App

In [39]:
profile_cols = ["First.Name", "Last.Name", "Middle.Initial", "Suffix.Name",
                "Appointed.Date", "Birth.Year", "Current.Unit", "Race", "Gender",
               "all_sworn_ID","all_sworn_units_ID", "unit_history_ID"]
mode_cols = profile_cols
profiles2 = aggregate_data(t3, "UID", mode_cols = mode_cols)

First.Name




Last.Name
Middle.Initial
Suffix.Name
Appointed.Date
Current.Unit
Race
Gender


In [40]:
profiles2.head()

Unnamed: 0,UID,First.Name,Last.Name,Middle.Initial,Suffix.Name,Appointed.Date,Birth.Year,Current.Unit,Race,Gender,all_sworn_ID,all_sworn_units_ID,unit_history_ID
0,25766,JEFFERY,AARON,M,,1970-06-15,1971.0,3.0,WHITE,MALE,12735,12735,1617
1,9516,AUGUST,AARON,,,1976-10-18,1980.0,9.0,HISPANIC,FEMALE,16413,16413,720
2,26592,DANIEL,ABATE,P,,1970-06-15,1942.0,1.0,WHITE,MALE,4767,4767,26462
3,20300,ANTHONY,ABBATE,G,,1994-12-05,1968.0,7.0,WHITE,MALE,1207,1207,18549
4,20068,CARMEL,ABBATE,G,,1969-01-06,1942.0,123.0,WHITE,FEMALE,3134,3134,5678


In [41]:
uhd[uhd['unit_history_ID']==1617]

Unnamed: 0,unit_history_ID,First.Name,Last.Name,Middle.Initial,Suffix.Name,Appointed.Date,Current.Age,Gender,Race,Star1,Star2,Star3,Star4,Star5,Star6,Star7,Star8,Star9,Star10,Current.Unit
1616,1617,THOMAS,BECKER,,,1970-06-15,66,MALE,WHITE,,,251,43,-999,-999,-999,-999,-999,-999,19.0
