In [444]:
# import os
# os.chdir("../")

In [445]:
import pandas as pd
from datamatch import ThresholdMatcher, ColumnsIndex
from pandas_dedupe import dedupe_dataframe
import re
from lib.rows import duplicate_row

In [446]:
def read_data():
    """Read ``data/clean/cprr_new_orleans_da_2016_2020.csv`` into a DataFrame.

    The path is relative, so it is resolved against the kernel's current
    working directory.

    Returns:
        pandas.DataFrame: the parsed CSV contents, using ``pd.read_csv``
        defaults.
    """
    return pd.read_csv("data/clean/cprr_new_orleans_da_2016_2020.csv")

In [447]:
# Load the source rows once; the clustering and merge cells below both read
# this `df` from the kernel namespace.
df = read_data()

In [448]:
def create_clusters(
    df,
    decision=0.1,
    output_path="notebooks/analysis/nopd_allegation_clusters_kh.xlsx",
):
    """Cluster tracking ids by the uids they share and export the clusters.

    Rows without a ``tracking_id`` are ignored, as are tracking ids that appear
    on fewer than two rows. Each remaining tracking id is reduced to the set of
    uids attached to it, and pairs of tracking ids are scored by how much those
    sets overlap.

    Args:
        df: DataFrame with at least ``uid`` and ``tracking_id`` columns.
        decision: similarity threshold passed to the matcher, both when saving
            the Excel workbook and when collecting the returned clusters.
            Defaults to 0.1.
        output_path: where the cluster workbook is written. The default is the
            path the rest of this notebook reads back with ``pd.read_excel``.

    Returns:
        Whatever ``ThresholdMatcher.get_clusters_within_threshold(decision)``
        returns for the scored tracking ids.

    Side effects:
        Writes an Excel workbook to ``output_path``.
    """
    # keep rows that have a tracking id, then keep only tracking ids that
    # occur on at least two rows
    tracked = (
        df.loc[df.tracking_id.notna(), ["uid", "tracking_id"]]
        .groupby("tracking_id")
        .filter(lambda rows: len(rows) >= 2)
    )

    # One row per tracking id holding the *set* of its uids; set operations
    # are very fast compared to string operations.
    # NOTE: group by the scalar column name, not ['tracking_id']. Iterating a
    # groupby built from a one-element list yields 1-tuple keys on pandas >= 2,
    # which would turn every index label (and the exported row key) into a
    # tuple such as ('2018-0103-p',).
    uid_sets = pd.DataFrame(
        (
            (tracking_id, set(uids.tolist()))
            for tracking_id, uids in tracked.groupby("tracking_id")["uid"]
        ),
        columns=["tracking_id", "uids"],
    ).set_index("tracking_id", drop=True)

    def scorer(a: pd.Series, b: pd.Series) -> float:
        """Score two tracking ids by the overlap of their uid sets."""
        # number of uids the two tracking ids have in common
        shared = len(a.uids & b.uids)
        # sharing fewer than 2 uids is never considered a match
        if shared < 2:
            return 0
        # ratio of shared uids: 1 when both sets are identical, approaching 0
        # as the overlap shrinks (pairs with no real overlap were already
        # rejected above)
        return shared * 2 / (len(a.uids) + len(b.uids))

    matcher = ThresholdMatcher(
        # Split the "uids" column into its distinct uids and index on those
        # (index_elements=True), so only tracking ids that share a uid are
        # ever compared.
        index=ColumnsIndex("uids", index_elements=True),
        scorer=scorer,
        dfa=uid_sets,
        show_progress=True,
    )
    # the same threshold is used for both threshold arguments of the export
    matcher.save_clusters_to_excel(output_path, decision, decision)

    return matcher.get_clusters_within_threshold(decision)

In [449]:
# create_clusters() scores the tracking-id pairs and, as a side effect, writes
# the cluster workbook to the same path that is read back below.
match = create_clusters(df)
# Reload the exported clusters from that workbook as a plain DataFrame.
clusters = pd.read_excel("notebooks/analysis/nopd_allegation_clusters_kh.xlsx")

scoring pairs: 4626it [00:00, 4711.19it/s]


In [450]:
# Drop the matcher bookkeeping columns and give the row key its real name,
# `tracking_id`, in a single chained step.
clusters = (
    clusters
    .drop(columns=["cluster_idx", "pair_idx", "sim_score"])
    .rename(columns={"row_key": "tracking_id"})
)

In [451]:
def split_rows_with_multiple_uids(df):
    """Expand rows whose ``uids`` cell lists several uids into one row per uid.

    The ``uids`` column is expected to hold strings such as
    ``"{'a', 'b'}"``. Braces and single quotes are stripped, the remainder is
    split on commas (ignoring whitespace around each comma), and every
    resulting uid gets its own row with the other columns repeated.

    Unlike a row-by-row duplication, this works for any index (the result is
    always renumbered 0..n-1) and does not modify the frame passed in.

    Args:
        df: DataFrame with a string column named ``uids``.

    Returns:
        pandas.DataFrame: a new frame with the same columns, except that
        ``uids`` is renamed to ``uid`` and holds exactly one uid per row.
    """
    uid_lists = (
        df["uids"]
        # remove the set-literal punctuation: "{", "}" and "'"
        .str.replace(r"[{}']", "", regex=True)
        # a multi-character pattern is treated as a regular expression
        .str.split(r"\s*,\s*")
    )
    return (
        # assign() returns a copy, so the caller's frame is left untouched
        df.assign(uids=uid_lists)
        .explode("uids")
        .reset_index(drop=True)
        .rename(columns={"uids": "uid"})
    )

In [452]:
# Expand every multi-uid cluster row into one row per uid, then display it.
clusters = split_rows_with_multiple_uids(clusters)
clusters

Unnamed: 0,tracking_id,uid
0,2018-0103-p,51c9616b78b6d51ab0162b3c99c0a8f7
1,2018-0103-p,f9b87007b0659623ed3f4acaad079614
2,2019-0653-r,51c9616b78b6d51ab0162b3c99c0a8f7
3,2019-0653-r,f9b87007b0659623ed3f4acaad079614
4,2017-0459-p,51c9616b78b6d51ab0162b3c99c0a8f7
...,...,...
940,2020-0006-p,70df740e11507f5c8b1c17661022c20d
941,2020-0006-p,4965dd2847f92c5b1d87357714b87e7f
942,2020-0006-p,bcdcfe4fd70b4e576f2500573c026ffb
943,2020-0321-r,4965dd2847f92c5b1d87357714b87e7f


In [453]:
# Keep only the key columns of the source rows, attach each allegation to its
# (uid, tracking_id) cluster row, and keep one row per allegation_uid.
df = df.loc[:, ["allegation_uid", "tracking_id", "uid"]]
clusters = (
    clusters
    .merge(df, on=["uid", "tracking_id"])
    .drop_duplicates(subset=["allegation_uid"])
)
clusters

Unnamed: 0,tracking_id,uid,allegation_uid
0,2018-0103-p,51c9616b78b6d51ab0162b3c99c0a8f7,9c3001355b695b9256fe73a43009cb50
2,2018-0103-p,f9b87007b0659623ed3f4acaad079614,4e0e2eec17beae720b0a3c0a76c3e5ee
3,2018-0103-p,f9b87007b0659623ed3f4acaad079614,3c2de851e1b7387f1e003b85d9f99867
4,2018-0103-p,f9b87007b0659623ed3f4acaad079614,beeed090a1a95a40837bd9d64aeef5e5
8,2019-0653-r,51c9616b78b6d51ab0162b3c99c0a8f7,cdac1a74d4118b4c19f797a764775bee
...,...,...,...
1624,2020-0006-p,bcdcfe4fd70b4e576f2500573c026ffb,7f9224bf128d8d8c8f7cda4d4c184f52
1625,2020-0321-r,4965dd2847f92c5b1d87357714b87e7f,2745849a295984f47543242c09022257
1626,2020-0321-r,4965dd2847f92c5b1d87357714b87e7f,657913bbf519ffae91d983fbde248d53
1627,2020-0321-r,b9cf8d87df2873cfa7728c8283824ee8,a7628035098a7fb9b23d7fdf3f0d6c9b
