In [None]:
import os

# Move from the notebook's directory to its parent so that relative paths such
# as "src/merge.py" (run in a later cell) resolve. The guard keeps this cell
# idempotent: re-running it, or starting the kernel in the parent directory,
# no longer climbs one level too far.
if not os.path.exists(os.path.join("src", "merge.py")):
    os.chdir("..")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme to the figures drawn in this notebook.
sns.set_theme()

In [None]:
# Execute src/merge.py and load its globals into this notebook's namespace.
# Later cells use `rd`, `cons`, `ReferenceData` and `ref_df` without defining
# them, so they are presumably created by this script — verify in src/merge.py.
%run src/merge.py

# Analysis

Look at the specific instances that were matched through custom merges.

Look at all instances where demographic data changed.





In [None]:
# Number of rows in rd.sup_df for each distinct `matched_on` value
# (assumes sup_df is a pandas DataFrame; value_counts omits missing values by default).
rd.sup_df.matched_on.value_counts()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

def get_fig_ax(title, xlabel, ylabel, plot_size=(20, 10)):
    """Create a figure with one labelled axes.

    Parameters
    ----------
    title, xlabel, ylabel : str
        Text for the axes title and the two axis labels.
    plot_size : tuple
        Figure size in inches, passed to ``Figure.set_size_inches``.

    Returns
    -------
    tuple
        The ``(fig, ax)`` pair returned by ``plt.subplots()``.
    """
    figure, axes = plt.subplots()
    figure.set_size_inches(plot_size)

    # Set title and both axis labels in one call.
    axes.set(title=title, xlabel=xlabel, ylabel=ylabel)

    return figure, axes

def plot_appointed_date(df, title="Officer Profiles - Appointed Dates", xlabel="Appointed Date", 
                        ylabel="Officer Count", period='Y', plot_size=(20, 10)):
    """Draw a bar chart of how many rows fall in each appointed-date period.

    Parameters
    ----------
    df : DataFrame
        Must have a datetime-like ``appointed_date`` column (the ``.dt``
        accessor is used on it).
    title, xlabel, ylabel : str
        Figure title and axis labels.
    period : str
        Frequency alias passed to ``Series.dt.to_period`` (default ``'Y'``).
    plot_size : tuple
        Figure size in inches.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    fig, ax = get_fig_ax(title, xlabel, ylabel, plot_size)

    counts = df.appointed_date.dt.to_period(period).value_counts().sort_index()
    counts.plot(kind="bar", ax=ax)

    # pandas uses the index name as the x-axis label when the index is named,
    # which can replace the label set in get_fig_ax; restore the requested one.
    ax.set_xlabel(xlabel)

    plt.show()

def plot_appointed_date_by_merge(df, title="Officer Profiles - Appointed Dates Stacked by Merge",
                                 xlabel="Appointed Date", ylabel="Officer Count", plot_size=(20,10),
                                 period="Y"):
    """Draw a stacked bar chart of unique UIDs per appointed-date period.

    Each bar is split by a boolean ``merged`` flag, which is True where
    ``matched_on`` is not null.

    Parameters
    ----------
    df : DataFrame
        Must have ``UID``, ``matched_on`` and a datetime-like
        ``appointed_date`` column.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    plot_size : tuple
        Figure size in inches.
    period : str
        Frequency alias passed to ``Series.dt.to_period`` (default ``"Y"``).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    fig, ax = get_fig_ax(title, xlabel, ylabel, plot_size)

    # Select UID before aggregating so that only the column actually plotted
    # is counted, instead of computing nunique for every column first.
    uid_counts = (
        df.assign(merged=df.matched_on.notnull())
        .groupby([df['appointed_date'].dt.to_period(period), 'merged'])['UID']
        .nunique()
        .unstack()
    )
    uid_counts.plot.bar(ax=ax, stacked=True)

    # The unstacked index is named "appointed_date" and pandas uses that name
    # as the x-axis label; restore the label requested by the caller.
    ax.set_xlabel(xlabel)

    plt.show()

# Columns returned by get_match_rows when rows are pulled from the reference data.
cols = [
    'UID',
    'first_name_NS',
    'last_name_NS',
    'suffix_name',
    'middle_initial',
    'appointed_date',
    'star',
    'birth_year',
    'race',
    'gender',
    'resignation_date',
    'unit',
    'file',
    'matched_on',
]

# Presumably adds the 'file' column listed above — verify in src/merge.py.
rd.add_file_column()


In [None]:
# Unique UIDs in rd.sup_df per appointed year, stacked by whether `matched_on` is set.
plot_appointed_date_by_merge(rd.sup_df)

In [None]:
# Plot one row per distinct (UID, appointed_date) pair from the reference data.
plot_appointed_date(
    (
        rd.ref_df[['UID', 'appointed_date']]
        .sort_values('appointed_date')
        .dropna()
        .drop_duplicates()
    ),
    title='Officer Profiles: All Reference',
)

# False Positive Merges

### Custom Merges

Check the matches made on the explicitly given merge columns.

In [None]:
def matched_query(ls):
    """Return the query expression selecting rows whose ``matched_on`` equals
    the given column names joined with "-"."""
    joined = '-'.join(ls)
    return f"matched_on == '{joined}'"

def get_match_rows(sup_df, ref_df, match_cols, columns=None):
    """Return the reference rows for UIDs that were matched on ``match_cols``.

    Parameters
    ----------
    sup_df : DataFrame
        Needs ``UID`` and ``matched_on`` columns; ``matched_on`` is compared
        against the entries of ``match_cols`` joined with "-".
    ref_df : DataFrame
        Needs a ``UID`` column plus every requested output column.
    match_cols : sequence of str
        Column names that make up the ``matched_on`` label.
    columns : sequence of str, optional
        Columns to return. Defaults to the notebook-level ``cols`` list.

    Returns
    -------
    DataFrame
        Rows of ``ref_df`` whose UID is among the matched UIDs, restricted to
        ``columns`` and sorted by ``UID``.
    """
    if columns is None:
        columns = cols

    # Compare directly instead of building a query string, so labels that
    # contain quote characters cannot break the expression.
    match_label = "-".join(match_cols)
    uids = sup_df.loc[sup_df["matched_on"] == match_label, "UID"]

    return ref_df.loc[ref_df["UID"].isin(uids), list(columns)].sort_values("UID")


In [None]:
# An entry of cons.loop_merge['custom_merges'] that is a list is used as-is;
# for any other entry the column list is read from its 'cols' item.
custom_merge_cols = [
    spec if isinstance(spec, list) else spec['cols']
    for spec in cons.loop_merge['custom_merges']
]
custom_merge_cols

In [None]:
# Map each custom merge label ("col1-col2-...") to the reference rows matched on it.
custom_merges = {
    '-'.join(merge_cols): get_match_rows(rd.sup_df, rd.ref_df, merge_cols)
    for merge_cols in custom_merge_cols
}

In [None]:
# Report how many distinct UIDs each custom merge produced.
for key, matches in custom_merges.items():
    print(f"{key}: {matches.UID.nunique()} matches")

# False Negatives

Officers that should have matched but did not.

In [None]:
# `rd.sup_um` is presumably the supplementary rows left unmatched — verify in src/merge.py.
possible_false_negatives = rd.sup_um

In [None]:
# Row counts per appointed year for the candidate false negatives.
plot_appointed_date(possible_false_negatives, period='Y', title="Unmatched Officer Profiles")

### Relax merge constraints, remerge sup_um to ref_df

This is similar to the multirow loop merge but without the mask at the end; use it to look at the possible forms of merges.

In [None]:
import logging
import sys

# Logger handed to the relaxed re-merge below; it writes INFO and above to stdout.
local_log = logging.getLogger("test")
local_log.setLevel(logging.INFO)

formatter = logging.Formatter(
    '%(asctime)s[%(levelname)s]: %(message)s',
    datefmt='%Y-%m-%dT%H:%M:%S')

stream_out = logging.StreamHandler(sys.stdout)
stream_out.setFormatter(formatter)

# logging.getLogger returns the same logger object on every call, so
# re-running this cell would otherwise stack another handler and print each
# message several times. Drop any handlers already attached before adding ours.
for stale_handler in list(local_log.handlers):
    local_log.removeHandler(stale_handler)

local_log.addHandler(stream_out)

In [None]:
# Pairs of (field, candidate columns) passed to add_sup_data as `base_OD`.
# Most lists end with '' — presumably meaning the field may be left out of a
# merge; verify against ReferenceData in src/merge.py.
base_OD = [
    ('star', ['star', '']),
    ('first_name', ['first_name_NS', 'F4FN']),
    ('last_name', ['last_name_NS', 'F4LN']),
    ('appointed_date', ['appointed_date', '']),
    ('birth_year', ['birth_year', 'current_age', 'current_age_p1', 'current_age_m1', '']),
    ('middle_initial', ['middle_initial', '']),
    ('middle_initial2', ['middle_initial2', '']),
    ('gender', ['gender', '']),
    ('race', ['race', '']),
    ('suffix_name', ['suffix_name', '']),
    ('current_unit', ['current_unit', '']),
]

# NOTE(review): `ref_df` is used bare here while other cells use `rd.ref_df`;
# confirm that src/merge.py defines a module-level `ref_df`.
rd_test = (
    ReferenceData(ref_df, uid=cons.universal_id, log=local_log)
    .add_sup_data(possible_false_negatives, add_cols=cons.add_cols, base_OD=base_OD)
    .loop_merge(custom_merges=[])
    .append_to_reference()
    .add_file_column()
)

In [None]:
# Map each non-null `matched_on` label of the relaxed merge to the reference rows matched on it.
relaxed_matches = {
    label: get_match_rows(rd_test.sup_df, rd_test.ref_df, label.split("-"))
    for label in rd_test.sup_df.matched_on.dropna().unique()
}

def get_changes(d, key):
    """Collect, per UID and match key, the set of values seen in every other column.

    Parameters
    ----------
    d : dict
        Maps a "-"-joined label of column names to a DataFrame.
    key : str
        Label to look up in ``d``; its "-"-separated parts are also used as
        grouping columns.

    Returns
    -------
    DataFrame
        Indexed by ``UID`` plus the key columns, with one ``"<column>s"``
        column for each column of the frame that is not a key column. Each
        cell holds the set of distinct values in that group.
    """
    frame = d[key]
    key_cols = key.split("-")

    def unique_values(values):
        # De-duplicate in pandas first, then hand back a plain set.
        return set(values.drop_duplicates())

    # Only the key columns are skipped, so UID itself is aggregated as well.
    named_aggs = {}
    for column in frame.columns:
        if column in key_cols:
            continue
        named_aggs[f"{column}s"] = (column, unique_values)

    grouped = frame.groupby(['UID'] + key_cols)
    return grouped.agg(**named_aggs)

In [None]:
# Report how many distinct UIDs each relaxed merge label produced.
for key, matches in relaxed_matches.items():
    print(f"{key}: {matches.UID.nunique()} matches")