In [23]:
# import os
# os.chdir("../")

In [24]:
import pandas as pd
from datamatch import ThresholdMatcher, ColumnsIndex
from pandas_dedupe import dedupe_dataframe
import re
from lib.rows import duplicate_row
import deba

In [25]:
def read_data():
    df = pd.read_csv(deba.data("fuse/allegation.csv"))
    return df 

In [26]:
df = read_data()

  df = pd.read_csv(deba.data("fuse/allegation.csv"))


In [27]:
def create_clusters(df):
    # filter and group uids with tracking number
    df = df.loc[df.tracking_id.notna(), ['uid', 'tracking_id']].groupby("tracking_id").filter(lambda x: len(x) >= 2)
    df = pd.DataFrame(
        # we're storing set of uids under "uids" column because set operations
        # are very fast compared to string operations
        ((tn, set(sr.tolist())) for tn, sr in df.groupby(['tracking_id'])['uid']),
        columns=['tracking_id', 'uids']
    ).set_index('tracking_id', drop=True)
    # print(df.head(10))
    # our custom scoring function
    def scorer(a: pd.Series, b: pd.Series) -> float:
        # x_len is the number of shared uid between 2 tracking numbers
        x_len = len(a.uids & b.uids)
        # if they only share 1 uid then they are not a match
        if x_len < 2:
            return 0
        # returns the ratio of shared uids
        # if the uids are exactly the same then the ratio will be 1
        # if no uid are shared then the ratio will be 0
        # but of course, we have already discarded such pairs in the above condition
        return x_len * 2 / (len(a.uids) + len(b.uids))

    matcher = ThresholdMatcher(
        # Splitting the "uids" column into distinct uid and index with those
        # distinct uid because index_elements is set to True.
        # As always, only the tracking_number that share a uid will be matched
        index=ColumnsIndex('uids', index_elements=True),
        # use the above scoring function
        scorer=scorer,
        dfa=df,
        show_progress=True
    )
    decision = 0.1
    matcher.save_clusters_to_excel(deba.data("analysis/allegation.xlsx"), decision, decision
    )
    
    return matcher.get_clusters_within_threshold(decision)

In [28]:
match = create_clusters(df)
clusters = pd.read_excel(deba.data("analysis/allegation.xlsx"))

scoring pairs: 5972it [00:01, 5631.83it/s]


In [29]:
clusters = clusters.drop(columns=["cluster_idx", "pair_idx", "sim_score"])
clusters = clusters.rename(columns={"row_key": "tracking_id"})

In [30]:
def split_rows_with_multiple_uids(df):
    df.loc[:, "uids"] = df.uids.str.replace(r"({|}|\')", "", regex=True)
    i = 0
    for idx in df[df.uids.str.contains(",")].index:
        s = df.loc[idx + i, "uids"]
        parts = re.split(r"\s*(?:\,)\s*", s)
        df = duplicate_row(df, idx + i, len(parts))
        for j, name in enumerate(parts):
            df.loc[idx + i + j, "uids"] = name
        i += len(parts) - 1
    return df.rename(columns={"uids": "uid"})

In [31]:
clusters = clusters.pipe(split_rows_with_multiple_uids)

clusters

Unnamed: 0,tracking_id,uid
0,b1542c6c7de38acf49b2477fb3114501,29f98edd9908e30a3cdcc6cac36b41e1
1,b1542c6c7de38acf49b2477fb3114501,a27837a6c30cc0a5980ed88c8232b6b8
2,d1ce9b5f51063fc6f28c6ce947592444,29f98edd9908e30a3cdcc6cac36b41e1
3,d1ce9b5f51063fc6f28c6ce947592444,a27837a6c30cc0a5980ed88c8232b6b8
4,4d669f1115d3b76f3901b496f8fb6965,29f98edd9908e30a3cdcc6cac36b41e1
...,...,...
1529,d1356ddced7c750ae9fd9bd14c783fb1,3c15b982f4ac86eaf7db1e9f82361e0a
1530,d1356ddced7c750ae9fd9bd14c783fb1,cf4e6a30431461f685d0883d81cd2a3e
1531,d1356ddced7c750ae9fd9bd14c783fb1,371926c453baf777f00483f1d2699797
1532,d1356ddced7c750ae9fd9bd14c783fb1,774c9cb45d43b2511289ba9f2468189f


In [32]:
df = df[["allegation_uid", "tracking_id", "uid", "agency"]]
clusters = pd.merge(clusters, df, on=["uid", "tracking_id"])
clusters = clusters.drop_duplicates(subset=["allegation_uid"])
clusters

Unnamed: 0,tracking_id,uid,allegation_uid,agency
0,b1542c6c7de38acf49b2477fb3114501,29f98edd9908e30a3cdcc6cac36b41e1,806e386472693535d83dec13c2ef9116,new-orleans-pd
1,b1542c6c7de38acf49b2477fb3114501,a27837a6c30cc0a5980ed88c8232b6b8,5cf4e00bc12adac20e59681b36a26265,new-orleans-pd
2,d1ce9b5f51063fc6f28c6ce947592444,29f98edd9908e30a3cdcc6cac36b41e1,94aa83a117c3a9441ea5309c226d3d07,new-orleans-pd
3,d1ce9b5f51063fc6f28c6ce947592444,29f98edd9908e30a3cdcc6cac36b41e1,ac7330261fab81ea8a0b80fcf00bf1fc,new-orleans-pd
4,d1ce9b5f51063fc6f28c6ce947592444,a27837a6c30cc0a5980ed88c8232b6b8,7e51a6e7ebbfe038fb7b370a50e8f230,new-orleans-pd
...,...,...,...,...
2236,d1356ddced7c750ae9fd9bd14c783fb1,3c15b982f4ac86eaf7db1e9f82361e0a,ef968cc9e077daa447c4ec313f1d51fc,baton-rouge-pd
2237,d1356ddced7c750ae9fd9bd14c783fb1,cf4e6a30431461f685d0883d81cd2a3e,dab7ae3227af010a9b2eabc16a7a71d0,baton-rouge-pd
2238,d1356ddced7c750ae9fd9bd14c783fb1,371926c453baf777f00483f1d2699797,69d2da9226a965f09cdef135381efb73,baton-rouge-pd
2239,d1356ddced7c750ae9fd9bd14c783fb1,774c9cb45d43b2511289ba9f2468189f,464c025bce57aeed05eb601cece74e05,baton-rouge-pd


In [33]:
clusters.to_csv(deba.data("analysis/coaccusals_allegation.csv"), index=False)