In [161]:
import pandas as pd
from assign_unique_ids_functions import *
import inspect
import itertools

In [162]:
def intersect(a, b):
    """Return the elements present in both ``a`` and ``b`` as a list.

    Both arguments are converted to sets first, so duplicates collapse and the
    order of the returned list is arbitrary.
    """
    left, right = set(a), set(b)
    return list(left.intersection(right))
def setdiff(a,b):
    """Return the elements of ``a`` that do not occur in ``b`` as a list.

    Both arguments are converted to sets first, so duplicates collapse and the
    order of the returned list is arbitrary.
    """
    left, right = set(a), set(b)
    return list(left.difference(right))
def union(a,b):
    """Return every distinct element found in ``a`` or ``b`` as a list.

    Both arguments are converted to sets first, so duplicates collapse and the
    order of the returned list is arbitrary.
    """
    left, right = set(a), set(b)
    return list(left.union(right))

In [163]:
# Load the two demographic extracts that are linked to each other below
# (asd = all-sworn, asud = all-sworn-units), read from gzip-compressed CSV files.
asd = pd.read_csv("input/all-sworn_demographics.csv.gz")
asud = pd.read_csv("input/all-sworn-units_demographics.csv.gz")

In [251]:
def take_first_four(x):
    """Return the first four items of ``x`` (all of ``x`` when it is shorter).

    ``x`` must support slicing; add_columns maps this over name columns.
    """
    leading_four = slice(None, 4)
    return x[leading_four]


def BY_to_CA(x, reference_year=2016):
    """Convert a birth year into an age by subtracting it from a reference year.

    Parameters
    ----------
    x : number
        Birth year.
    reference_year : number, default 2016
        Year the age is measured against. The default keeps the value that was
        previously hard-coded, so existing one-argument calls (including
        ``Series.map(BY_to_CA)``) behave exactly as before.

    Returns
    -------
    number
        ``reference_year - x``.
    """
    return reference_year - x


def add_columns(df, add_cols = ["F4FN", "F4LN", "Current.Age", "BY_to_CA"]):
    """Add derived matching columns to ``df`` and return it.

    NOTE: ``df`` is modified in place; the returned object is the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. A requested column is only added when its source
        column exists in ``df``.
    add_cols : list of str
        Which derivations to apply (the default list is only read, never
        mutated):

        * ``"F4FN"``        -> ``F4FN`` = take_first_four of ``First.Name``
        * ``"F4LN"``        -> ``F4LN`` = take_first_four of ``Last.Name``
        * ``"Current.Age"`` -> ``Current.Age.p1`` and ``Current.Age.m1`` both
          copied from ``Current.Age``
        * ``"BY_to_CA"``    -> ``Current.Age.p1`` = BY_to_CA(``Birth.Year``)
          and ``Current.Age.m1`` = that value minus one

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the new columns attached.
    """
    if "F4FN" in add_cols and "First.Name" in df.columns:
        df['F4FN'] = df['First.Name'].map(take_first_four)
    if "F4LN" in add_cols and 'Last.Name' in df.columns:
        df['F4LN'] = df['Last.Name'].map(take_first_four)
    if "Current.Age" in add_cols and "Current.Age" in df.columns:
        # A frame that already has an age exposes it under both helper names so
        # it can be joined against either candidate age from a birth-year frame.
        df['Current.Age.p1'] = df['Current.Age']
        df['Current.Age.m1'] = df['Current.Age']
    if "BY_to_CA" in add_cols and "Birth.Year" in df.columns:
        # Map once and reuse. Runs after the Current.Age branch, so when a frame
        # has both source columns the Birth.Year-derived values win.
        age_from_birth_year = df['Birth.Year'].map(BY_to_CA)
        df['Current.Age.p1'] = age_from_birth_year
        df['Current.Age.m1'] = age_from_birth_year - 1
    return df

In [226]:
def generate_on_lists(data_cols, base_lists):
    """Build the ordered list of candidate merge-key combinations.

    Parameters
    ----------
    data_cols : iterable of str
        Column names that are actually available for merging.
    base_lists : list of list of str
        Column groups. Each group lists alternative columns for one attribute;
        exactly one column per group is used in any given key list. A group
        that contains ``''`` is optional: one of its choices is "leave this
        attribute out of the key".

    Returns
    -------
    list of list of str
        One key list per combination of choices. Within a group the choices are
        tried in reverse-sorted name order (so ``''`` comes last), and the first
        group in ``base_lists`` varies fastest. Groups with no column in
        ``data_cols`` are ignored. Combinations that end up with no column at
        all are dropped, because an empty key list cannot be merged on; in
        particular the result is ``[]`` when nothing in ``base_lists`` is
        available.
    """
    available = set(data_cols)
    choices_per_group = []

    for col_list in base_lists:
        present = set(col_list) & available
        if not present:
            # None of this group's columns exist in the data: skip the group.
            continue
        if '' in col_list:
            present.add('')
        choices_per_group.append(sorted(present, reverse=True))

    on_lists = []
    for combo in itertools.product(*reversed(choices_per_group)):
        keys = [col for col in combo if col != '']
        if keys:
            on_lists.append(keys)

    return on_lists

In [227]:
def loop_merge(df1, df2, on_lists, keep_columns = ['ID1', 'ID2'], return_unmatched = True):
    """Match rows of ``df1`` to rows of ``df2`` in successive rounds.

    For every key list in ``on_lists`` (in order) both frames are reduced to
    their ID column plus the key columns, passed through ``remove_duplicates``
    and inner-joined on the keys. Rows matched in a round are removed from both
    frames before the next round, so each ID is matched at most once and earlier
    key lists take priority. Progress is printed for every round that finds
    matches, followed by a final summary line.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to link. They are not modified; filtering rebinds local names.
    on_lists : list of list of str
        Key lists to try, most trusted first. Empty key lists are skipped.
    keep_columns : list of str
        ``[id column of df1, id column of df2]``. The default list is only read.
    return_unmatched : bool
        When true, also return the rows left unmatched on each side.

    Returns
    -------
    pandas.DataFrame or tuple
        The matches (``keep_columns`` plus ``'Match'``, the key names joined by
        ``'-'``), or ``(matches, df1_unmatched, df2_unmatched)`` when
        ``return_unmatched`` is true.
    """
    id1, id2 = keep_columns[0], keep_columns[1]
    out_cols = list(keep_columns) + ['Match']
    # Per-round results are collected and concatenated once at the end:
    # DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
    rounds = []
    for mc in on_lists:
        if not mc:
            # An empty key list cannot identify anything; skip it.
            continue
        # remove_duplicates comes from assign_unique_ids_functions.
        # NOTE(review): presumably it drops rows whose key values are not
        # unique, so every join is one-to-one - confirm.
        df1t = remove_duplicates(df1[keep_columns[:1] + mc], mc)
        df2t = remove_duplicates(df2[keep_columns[1:] + mc], mc)
        dfmt = df1t.merge(df2t, on=mc, how='inner')
        if dfmt.shape[0] > 0:
            print('******')
            print(mc)
            print(dfmt.shape[0])
            print('******')
            dfmt['Match'] = '-'.join(mc)
            rounds.append(dfmt[out_cols])
            # Earlier matches are already gone from df1/df2, so removing only
            # this round's IDs is the same as removing everything matched so far.
            df1 = df1.loc[~df1[id1].isin(dfmt[id1])]
            df2 = df2.loc[~df2[id2].isin(dfmt[id2])]
    if rounds:
        dfm = pd.concat(rounds, ignore_index=True)
    else:
        dfm = pd.DataFrame(columns=out_cols)
    print(dfm.shape[0], df1.shape[0], df2.shape[0])
    if return_unmatched:
        return (dfm, df1, df2)
    else:
        return dfm

In [237]:
def _first_id_column(df):
    """Return the first column of ``df`` whose name ends with '_ID'."""
    for col in df.columns:
        if col.endswith('_ID'):
            return col
    # IndexError is kept for backward compatibility: the previous
    # list(filter(...))[0] lookup raised it too, just without a useful message.
    raise IndexError("no column ending in '_ID' among %s" % list(df.columns))


def merge_datasets(df1, df2, add_cols, base_lists, custom_matches = None, return_unmatched = True, name_changes=False):
    """Link two demographic frames by trying progressively looser key sets.

    Steps: drop all-empty columns, locate each frame's ``*_ID`` column, add the
    derived columns requested in ``add_cols`` (see add_columns), restrict both
    frames to their shared columns plus their own ID, build the key lists with
    generate_on_lists and run loop_merge over them.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to link; each needs a column whose name ends in ``'_ID'``. The
        caller's frames are not modified (work happens on the ``dropna`` result).
    add_cols : list of str
        Derivations forwarded to add_columns.
    base_lists : list of list of str
        Column groups forwarded to generate_on_lists.
    custom_matches : list of str, optional
        A single extra key list, appended after the generated ones.
        NOTE(review): it is appended as one key list, not a list of key lists,
        and its columns must be shared by both frames - confirm intended usage.
    return_unmatched : bool
        Forwarded to loop_merge.
    name_changes : bool
        When true, additionally try key lists built without the group that
        contains ``"Last.Name"``, keeping only those with more than three keys.

    Returns
    -------
    Whatever loop_merge returns: the match table, or
    ``(matches, df1_unmatched, df2_unmatched)`` when ``return_unmatched`` is true.

    Raises
    ------
    IndexError
        If either frame has no column ending in ``'_ID'``.
    """
    df1 = df1.dropna(axis=1, how='all')
    df2 = df2.dropna(axis=1, how='all')

    keep_columns = [_first_id_column(df1), _first_id_column(df2)]

    df1 = add_columns(df1, add_cols)
    df2 = add_columns(df2, add_cols)

    cols = intersect(df1.columns, df2.columns)
    shared = set(cols)

    # Keep only columns usable as keys on both sides, plus each frame's own ID.
    df1 = df1[[col for col in df1.columns
               if col in shared or col == keep_columns[0]]]
    df2 = df2[[col for col in df2.columns
               if col in shared or col == keep_columns[1]]]

    on_lists = generate_on_lists(cols, base_lists)

    if custom_matches:
        on_lists.append(custom_matches)

    if name_changes:
        without_last_name = [ml for ml in base_lists if "Last.Name" not in ml]
        nc_lists = generate_on_lists(cols, without_last_name)
        # Without a last name, require more than three remaining keys.
        nc_lists = [nc_list for nc_list in nc_lists if len(nc_list) > 3]
        on_lists.extend(nc_lists)

    merged_data = loop_merge(df1, df2, on_lists=on_lists, keep_columns=keep_columns, return_unmatched=return_unmatched)

    return merged_data

In [238]:
# Column groups used to build merge keys (consumed by generate_on_lists):
# each inner list holds alternative columns for one attribute, and a group
# containing '' is optional, i.e. that attribute may be left out of the key.
base_lists = [
        ['Current.Star', 'Star1', 'Star2', 'Star3', 'Star4', 'Star5','Star6', 'Star7', 'Star8', 'Star9', 'Star10'],
        ['First.Name', 'F4FN'], ['Last.Name', 'F4LN'], ['Appointed.Date'],
        ['Birth.Year', 'Current.Age', 'Current.Age.p1', 'Current.Age.m1', ''],
        ['Middle.Initial', ''], ['Gender', ''], ['Race', ''], ['Suffix.Name', '']
    ]
# Link asd to asud; only the first-four-letter name columns are derived here.
t1 = merge_datasets(asd, asud, ["F4FN", "F4LN"], base_lists)

******
['Suffix.Name', 'Race', 'Gender', 'Middle.Initial', 'Birth.Year', 'Appointed.Date', 'Last.Name', 'First.Name']
31170
******
31170 839 0


In [239]:
# Load the unit-history demographics extract (linked against asd further down).
uhd = pd.read_csv("input/unit-history_demographics.csv.gz")

In [249]:
# List uhd's columns to see which fields are available as merge keys.
uhd.columns

Index(['unit_history_ID', 'First.Name', 'Last.Name', 'Middle.Initial',
       'Suffix.Name', 'Appointed.Date', 'Current.Age', 'Gender', 'Race',
       'Star1', 'Star2', 'Star3', 'Star4', 'Star5', 'Star6', 'Star7', 'Star8',
       'Star9', 'Star10', 'Current.Unit'],
      dtype='object')

In [250]:
# List asd's columns for comparison with the frame it will be merged with.
asd.columns

Index(['all_sworn_ID', 'First.Name', 'Last.Name', 'Middle.Initial',
       'Suffix.Name', 'Appointed.Date', 'Birth.Year', 'Gender', 'Race'],
      dtype='object')

In [252]:
# uhd carries Current.Age while asd carries Birth.Year, so request the
# "BY_to_CA" / "Current.Age" derivations to give both frames the shared
# Current.Age.p1 / Current.Age.m1 columns. name_changes=True additionally
# tries key lists built without the last-name group.
t2 = merge_datasets(asd, uhd, ["F4FN", "F4LN", "BY_to_CA", "Current.Age"], base_lists, name_changes=True)

******
['Suffix.Name', 'Race', 'Gender', 'Middle.Initial', 'Current.Age.p1', 'Appointed.Date', 'Last.Name', 'First.Name']
1311
******
******
['Suffix.Name', 'Race', 'Gender', 'Middle.Initial', 'Current.Age.m1', 'Appointed.Date', 'Last.Name', 'First.Name']
4287
******
******
['Suffix.Name', 'Race', 'Gender', 'Middle.Initial', 'Current.Age.m1', 'Appointed.Date', 'F4LN', 'First.Name']
2
******
******
['Suffix.Name', 'Race', 'Gender', 'Middle.Initial', 'Appointed.Date', 'Last.Name', 'First.Name']
1
******
******
['Suffix.Name', 'Race', 'Gender', 'Current.Age.p1', 'Appointed.Date', 'Last.Name', 'First.Name']
6014
******
******
['Suffix.Name', 'Race', 'Gender', 'Current.Age.m1', 'Appointed.Date', 'Last.Name', 'First.Name']
19934
******
******
['Suffix.Name', 'Race', 'Gender', 'Current.Age.m1', 'Appointed.Date', 'Last.Name', 'F4FN']
1
******
******
['Suffix.Name', 'Race', 'Gender', 'Current.Age.m1', 'Appointed.Date', 'F4LN', 'First.Name']
4
******
******
['Suffix.Name', 'Race', 'Gender', 'App

In [253]:
# Load the all-members demographics extract.
amd = pd.read_csv("input/all-members_demographics.csv.gz")
# Keep only rows that have a first name.
# NOTE(review): presumably required because take_first_four slices every
# First.Name value and would fail on a missing (NaN) one - confirm.
amd = amd[amd['First.Name'].notnull()]
# Link asd to amd, also trying key lists without the last-name group.
t3 = merge_datasets(asd, amd, ["F4FN", "F4LN"], base_lists, name_changes=True)

******
['Suffix.Name', 'Race', 'Gender', 'Middle.Initial', 'Birth.Year', 'Appointed.Date', 'Last.Name', 'First.Name']
31914
******
******
['Suffix.Name', 'Race', 'Gender', 'Middle.Initial', 'Birth.Year', 'Appointed.Date', 'F4LN', 'First.Name']
3
******
******
['Suffix.Name', 'Race', 'Gender', 'Middle.Initial', 'Appointed.Date', 'Last.Name', 'First.Name']
1
******
******
['Suffix.Name', 'Gender', 'Middle.Initial', 'Birth.Year', 'Appointed.Date', 'Last.Name', 'First.Name']
1
******
******
['Suffix.Name', 'Race', 'Gender', 'Middle.Initial', 'Birth.Year', 'Appointed.Date', 'First.Name']
8
******
31927 82 1


In [242]:
def StarMerge(df1, df2, on_list, keep_columns, return_unmerged = True):
    """Match ``df1`` to ``df2`` on shared keys plus a star number.

    ``df1`` holds up to ten star numbers per row (``Star1`` .. ``Star10``) and
    ``df2`` a single ``Star`` column. For every key list in ``on_list`` and
    every ``StarN`` column, rows are inner-joined on the key list plus
    ``StarN`` == ``Star``. Matched rows are removed from both frames before the
    next attempt. Progress is printed for every attempt that finds matches.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Must contain ``Star1`` .. ``Star10``, the key columns and
        ``keep_columns[0]``.
    df2 : pandas.DataFrame
        Must contain ``Star``, the key columns and ``keep_columns[1]``. Rows
        whose ``Star`` is null or not positive are discarded up front, so they
        are also absent from the returned unmatched ``df2``.
    on_list : list of list of str
        Key lists (without the star column) to try, in order.
    keep_columns : list of str
        ``[id column of df1, id column of df2]``.
    return_unmerged : bool
        When true, also return the rows left unmatched on each side.

    Returns
    -------
    pandas.DataFrame or tuple
        The matches (``keep_columns`` plus ``'Match'``, e.g.
        ``'Last.Name-Star1/Star'``), or ``(matches, df1_rest, df2_rest)`` when
        ``return_unmerged`` is true.
    """
    id1, id2 = keep_columns[0], keep_columns[1]
    out_cols = list(keep_columns) + ['Match']
    df2 = df2.loc[(df2["Star"].notnull()) & (df2["Star"] > 0)]
    stars = ["Star" + str(i) for i in range(1,11)]
    # Per-attempt results are concatenated once at the end; DataFrame.append
    # no longer exists in pandas 2.x.
    rounds = []
    for mc_cols in on_list:
        mc2 = mc_cols + ["Star"]
        for star in stars:
            mc1 = mc_cols + [star]
            has_star = (df1[star].notnull()) & (df1[star] > 0)
            # NOTE(review): the original called RemoveDuplicates, a name used
            # nowhere else in this notebook; remove_duplicates is the helper
            # loop_merge relies on - confirm they are the same function.
            df1t = remove_duplicates(df1.loc[has_star, keep_columns[:1] + mc1], mc1)
            df2t = remove_duplicates(df2[keep_columns[1:] + mc2], mc2)
            dfmt = df1t.merge(df2t, left_on = mc1, right_on = mc2, how='inner')
            if dfmt.shape[0] > 0:
                print('******')
                print(mc1)
                print(dfmt.shape[0])
                print('******')
                # Fixed: this line referenced the undefined name `mc`.
                dfmt['Match'] = '-'.join(mc_cols + [star + "/" + "Star"])
                rounds.append(dfmt[out_cols])
                # Earlier matches are already removed, so dropping this
                # attempt's IDs equals dropping everything matched so far.
                df1 = df1.loc[~df1[id1].isin(dfmt[id1])]
                df2 = df2.loc[~df2[id2].isin(dfmt[id2])]
    if rounds:
        dfm = pd.concat(rounds, ignore_index=True)
    else:
        dfm = pd.DataFrame(columns=out_cols)
    if return_unmerged:
        return (dfm, df1, df2)
    else:
        return dfm

In [221]:
# Preview the column groups merge_datasets uses for its name_changes pass:
# every group of base_lists except the one containing "Last.Name".
[ml for ml in base_lists if "Last.Name" not in ml]

[['Current.Star',
  'Star1',
  'Star2',
  'Star3',
  'Star4',
  'Star5',
  'Star6',
  'Star7',
  'Star8',
  'Star9',
  'Star10'],
 ['First.Name', 'F4FN'],
 ['Appointed.Date'],
 ['Birth.Year', 'Current.Age', 'Current.Age.p1', 'Current.Age.m1', ''],
 ['Middle.Initial', ''],
 ['Gender', ''],
 ['Race', ''],
 ['Suffix.Name', '']]