In [11]:
# import os
# os.chdir("../")

In [12]:
import pandas as pd
from datamatch import ThresholdMatcher, ColumnsIndex
from pandas_dedupe import dedupe_dataframe
import re
from lib.rows import duplicate_row
import deba

In [13]:
def read_data():
    """Read ``fuse/allegation.csv`` (path resolved via ``deba.data``) into a DataFrame."""
    allegation_path = deba.data("fuse/allegation.csv")
    return pd.read_csv(allegation_path)

In [14]:
# Load the allegation table once; later cells reuse `df` for clustering and merging.
df = read_data()

  if await self.run_code(code, result, async_=asy):


In [15]:
def create_clusters(df, decision=0.1):
    """Cluster tracking ids according to the uids they have in common.

    Every tracking_id that appears on at least two rows is reduced to the set
    of uids recorded under it. Tracking ids are then compared pairwise and
    scored with ``2 * |A & B| / (|A| + |B|)``; pairs sharing fewer than two
    uids score 0.

    Side effect: the clusters are written to
    ``deba.data("analysis/allegation.xlsx")``.

    Args:
        df: frame with at least ``uid`` and ``tracking_id`` columns.
        decision: threshold passed to both the Excel export and
            ``get_clusters_within_threshold``. Defaults to 0.1, the value
            that used to be hard-coded.

    Returns:
        The result of ``matcher.get_clusters_within_threshold(decision)``.
    """
    # filter and group uids with tracking number
    tracked = df.loc[df.tracking_id.notna(), ['uid', 'tracking_id']]
    tracked = tracked.groupby("tracking_id").filter(lambda x: len(x) >= 2)
    uid_sets = pd.DataFrame(
        # we're storing set of uids under "uids" column because set operations
        # are very fast compared to string operations.
        # Group by the scalar column name, not a one-element list: with a list
        # grouper pandas >= 2.0 yields 1-tuple keys such as ('abc',), which
        # would corrupt the tracking_id index and everything derived from it.
        ((tn, set(sr.tolist())) for tn, sr in tracked.groupby('tracking_id')['uid']),
        columns=['tracking_id', 'uids']
    ).set_index('tracking_id', drop=True)

    # our custom scoring function
    def scorer(a: pd.Series, b: pd.Series) -> float:
        # x_len is the number of shared uid between 2 tracking numbers
        x_len = len(a.uids & b.uids)
        # if they only share 1 uid then they are not a match
        if x_len < 2:
            return 0
        # returns the ratio of shared uids
        # if the uids are exactly the same then the ratio will be 1
        # if no uid are shared then the ratio will be 0
        # but of course, we have already discarded such pairs in the above condition
        return x_len * 2 / (len(a.uids) + len(b.uids))

    matcher = ThresholdMatcher(
        # Splitting the "uids" column into distinct uid and index with those
        # distinct uid because index_elements is set to True.
        # As always, only the tracking_number that share a uid will be matched
        index=ColumnsIndex('uids', index_elements=True),
        # use the above scoring function
        scorer=scorer,
        dfa=uid_sets,
        show_progress=True
    )
    matcher.save_clusters_to_excel(
        deba.data("analysis/allegation.xlsx"), decision, decision
    )

    return matcher.get_clusters_within_threshold(decision)

In [16]:
# create_clusters writes the cluster workbook to analysis/allegation.xlsx as a
# side effect; the clusters used by the following cells are read back from it.
# NOTE(review): `match` is not referenced again in the visible cells.
match = create_clusters(df)
clusters = pd.read_excel(deba.data("analysis/allegation.xlsx"))

scoring pairs: 5894it [00:00, 6308.23it/s]


In [17]:
# Drop the matcher bookkeeping columns and expose the row key as tracking_id.
clusters = (
    clusters
    .drop(columns=["cluster_idx", "pair_idx", "sim_score"])
    .rename(columns={"row_key": "tracking_id"})
)

In [18]:
def split_rows_with_multiple_uids(df):
    """Expand rows whose "uids" cell lists several uids into one row per uid.

    The "uids" column is expected to hold the text form of a set, e.g.
    ``"{'a', 'b'}"``. Braces and single quotes are stripped, the remainder is
    split on commas (whitespace around a comma is discarded) and each uid gets
    its own copy of the row, kept in the original row order.

    The input frame is not modified. Missing values in "uids" are kept as
    missing instead of raising.

    Args:
        df: frame with a string column named "uids".

    Returns:
        A new frame with a fresh 0..n-1 index in which "uids" has been
        replaced by a single-valued "uid" column.
    """
    # Same characters as the alternation ({|}|') written as a character class.
    cleaned = df.uids.str.replace(r"[{}']", "", regex=True)
    exploded = (
        # A multi-character pattern is treated as a regular expression by
        # Series.str.split, so this matches re.split(r"\s*,\s*", ...).
        df.assign(uids=cleaned.str.split(r"\s*,\s*"))
        # One row per list element; a vectorised replacement for inserting
        # duplicated rows one at a time.
        .explode("uids")
        .reset_index(drop=True)
    )
    return exploded.rename(columns={"uids": "uid"})

In [19]:
# One row per (tracking_id, uid) pair.
clusters = split_rows_with_multiple_uids(clusters)
clusters

Unnamed: 0,tracking_id,uid
0,3c534259798acc6e420079d1e072f543,3e36baf29d012aaa856dd542f6c2417e
1,3c534259798acc6e420079d1e072f543,ecce1003ca706c7b3d893112cb7ccc75
2,9f3d3c3c3424e6baa52baed7b0cb10bc,3e36baf29d012aaa856dd542f6c2417e
3,9f3d3c3c3424e6baa52baed7b0cb10bc,ecce1003ca706c7b3d893112cb7ccc75
4,3c534259798acc6e420079d1e072f543,3e36baf29d012aaa856dd542f6c2417e
...,...,...
1522,d1356ddced7c750ae9fd9bd14c783fb1,8a1fe08e4fe721e387ccc964d51f230c
1523,d1356ddced7c750ae9fd9bd14c783fb1,8f6a8a487e0ff24d1173298e3d1aa0c4
1524,d1356ddced7c750ae9fd9bd14c783fb1,aea14597b0f40ebb0cd02a1e083d78c8
1525,d1356ddced7c750ae9fd9bd14c783fb1,7be36eb95e087a27e90e1f1ad780a682


In [20]:
# Keep only the keys needed to attach allegation_uid to each clustered pair.
df = df.loc[:, ["allegation_uid", "tracking_id", "uid"]]
# Inner-join on (uid, tracking_id), then keep the first row per allegation.
clusters = clusters.merge(df, on=["uid", "tracking_id"]).drop_duplicates(
    subset=["allegation_uid"]
)
clusters

Unnamed: 0,tracking_id,uid,allegation_uid
0,3c534259798acc6e420079d1e072f543,3e36baf29d012aaa856dd542f6c2417e,632c6c8ef88352585d1a04ae7cb1402c
2,3c534259798acc6e420079d1e072f543,ecce1003ca706c7b3d893112cb7ccc75,26b92e8986e593c134f6f8b9ac3000bb
4,9f3d3c3c3424e6baa52baed7b0cb10bc,3e36baf29d012aaa856dd542f6c2417e,ec315158e5d51cc07a3a50c521179933
6,9f3d3c3c3424e6baa52baed7b0cb10bc,ecce1003ca706c7b3d893112cb7ccc75,4a48f637117416787255a9fd1125c2d0
8,4ed577964e2144cdb040ffc04428e132,3e36baf29d012aaa856dd542f6c2417e,34be200a87fe1457d8a51dbd039aee89
...,...,...,...
2228,d1356ddced7c750ae9fd9bd14c783fb1,8a1fe08e4fe721e387ccc964d51f230c,b14ebbbf11296a6974b81fc30a445c2a
2229,d1356ddced7c750ae9fd9bd14c783fb1,8f6a8a487e0ff24d1173298e3d1aa0c4,0284e60936a434afd7324467d798538a
2230,d1356ddced7c750ae9fd9bd14c783fb1,aea14597b0f40ebb0cd02a1e083d78c8,f855dfc5ef3e22abe45cc2008d0b1a5e
2231,d1356ddced7c750ae9fd9bd14c783fb1,7be36eb95e087a27e90e1f1ad780a682,77c59384c47b7e5ae6fa38d98e99e316


In [21]:
# Write the final table without the index column.
coaccusals_path = deba.data("analysis/coaccusals_allegation.csv")
clusters.to_csv(coaccusals_path, index=False)