In [109]:
import pandas as pd
from datamatch import ThresholdMatcher, ColumnsIndex
from pandas_dedupe import dedupe_dataframe

In [110]:
def read_data(path="../data/clean/cprr_new_orleans_da_2016_2020.csv"):
    """Load the notebook's input CSV into a DataFrame.

    Args:
        path: Location of the CSV file to read. Defaults to the file this
            notebook was written against, so ``read_data()`` keeps working
            unchanged.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` with default
        parsing options.
    """
    return pd.read_csv(path)

In [111]:
# Load the input CSV once; `df` is the frame passed to create_clusters(df)
# in a later cell.
df = read_data()

In [112]:
def create_clusters(df, decision=0.1,
                    excel_path="analysis/nopd_allegation_clusters_kh.xlsx"):
    """Cluster tracking ids that have at least two uids in common.

    Steps:
      1. Reduce ``df`` to distinct (uid, tracking_id) pairs with a non-null
         tracking id, keeping only tracking ids that have two or more uids.
      2. Build a frame indexed by ``tracking_id`` whose single ``uids``
         column holds the set of uids seen under that tracking id.
      3. Score pairs of tracking ids with ``scorer`` through datamatch's
         ``ThresholdMatcher``, save the clusters to an Excel file, and
         return the clusters at or above ``decision``.

    Args:
        df: Frame with at least ``uid`` and ``tracking_id`` columns.
        decision: Score threshold, used both as the match threshold and as
            the lower bound when saving/returning clusters. The default is
            deliberately low (but above 0) so weakly overlapping pairs are
            still reported while pairs scored 0 are not.
        excel_path: Where the cluster spreadsheet is written.

    Returns:
        Whatever ``matcher.get_clusters_within_threshold(decision)`` returns
        (in the notebook's recorded run: a frame of clustered pairs with a
        ``uids`` column).

    Side effects:
        Prints the first 10 rows of the per-tracking-id uid sets and writes
        ``excel_path``.
    """
    # Distinct (uid, tracking_id) pairs, restricted to tracking ids that have
    # at least two rows after de-duplication.
    pairs = (
        df.loc[df.tracking_id.notna(), ['uid', 'tracking_id']]
        .drop_duplicates()
        .groupby('tracking_id').filter(lambda x: len(x) >= 2)
    )
    # Store the uids of each tracking id as a set: the scorer below relies on
    # set intersection, which is much cheaper than string comparison.
    uid_sets = pd.DataFrame(
        ((tn, set(sr.tolist())) for tn, sr in pairs.groupby('tracking_id')['uid']),
        columns=['tracking_id', 'uids']
    ).set_index('tracking_id', drop=True)
    # One row per tracking_id (the index), one "uids" column holding a set.
    print(uid_sets.head(10))

    def scorer(a: pd.Series, b: pd.Series) -> float:
        """Return the share of uids two tracking ids have in common.

        The score is ``2 * |shared| / (|a.uids| + |b.uids|)``: 1.0 when both
        sets are identical, and 0.0 whenever fewer than two uids are shared
        (a single shared uid is not treated as a match).
        """
        shared = len(a.uids & b.uids)
        if shared < 2:
            return 0.0
        return shared * 2 / (len(a.uids) + len(b.uids))

    matcher = ThresholdMatcher(
        # index_elements=True indexes on the individual elements of the
        # "uids" sets, so only tracking ids sharing at least one uid are
        # ever paired and scored.
        index=ColumnsIndex('uids', index_elements=True),
        scorer=scorer,
        dfa=uid_sets,
        show_progress=True
    )
    # The same threshold is used as match threshold and as lower bound.
    matcher.save_clusters_to_excel(excel_path, decision, decision)
    return matcher.get_clusters_within_threshold(decision)

# Build the clusters from the loaded frame and print them as plain text.
clusters = create_clusters(df)
print(clusters)
# In this cell's recorded output the printed frame is indexed by
# (cluster_idx, pair_idx, sim_score, row_key) and has a single "uids" column;
# row_key holds the tracking ids of the matched pairs and sim_score the value
# returned by the scorer. Refer to the live cell output for actual rows.

# Optional CSV export, currently disabled.
# NOTE(review): index=False would presumably omit the index levels listed
# above (cluster_idx, pair_idx, sim_score, row_key) from the file, leaving
# only the "uids" column; confirm before enabling.
# clusters.to_csv(('analysis/allegation_clusters_nopd_by_tracking_number_kh.csv'), index=False)

                                                          uids
tracking_id                                                   
2016-0002-p  {434b621243d937ef55512025f0d58db1, 0ab2aef4eca...
2016-0003-p  {e6fca608c23c46d70382a399583dd92f, 7dfafbf330d...
2016-0011-n  {1e627908dda6e4daf06d731626847c08, 3340f27a6e7...
2016-0012-p  {a105afa36afe06e9767ac37dd1855d4e, 6b77433bdd5...
2016-0017-p  {8d754fd559d5a5ee01c6dc7d2449ab74, 837ad8a9382...
2016-0021-p  {1e627908dda6e4daf06d731626847c08, 3340f27a6e7...
2016-0026-p  {e7cb1a1d0479532fe16e18d371404e1e, 3748477a4db...
2016-0031-p  {9679eb18d20c8deadce08ef195567b15, b5993f42a85...
2016-0032-r  {9d17a33032a02b75f229af641199097e, 2fd8d61bb52...
2016-0033-n  {32c5967f611b1944c848aa45edc8779b, 2ad879caff0...


scoring pairs: 2733it [00:00, 6568.08it/s]


                                                                                         uids
cluster_idx pair_idx sim_score row_key                                                       
0           0        1.000000  2019-0674-p  {3e36baf29d012aaa856dd542f6c2417e, ecce1003ca7...
                               2020-0165-p  {3e36baf29d012aaa856dd542f6c2417e, ecce1003ca7...
            1        1.000000  2019-0444-r  {3e36baf29d012aaa856dd542f6c2417e, ecce1003ca7...
                               2020-0165-p  {3e36baf29d012aaa856dd542f6c2417e, ecce1003ca7...
            2        1.000000  2019-0444-r  {3e36baf29d012aaa856dd542f6c2417e, ecce1003ca7...
...                                                                                       ...
63          0        0.444444  2020-0261-r  {3bdf1d0d12052ba2c3609c74ef41c35c, c8ac774d0c6...
64          0        0.444444  2016-0289-p  {f0f7b28597e5d775f621e7b0e0e70b01, 7e7b7ca7cb3...
                               2016-0464-p  {97236c599842473

In [114]:
# NOTE(review): `cluster` is not defined in any cell visible in this notebook
# (execution counts jump from 112 to 114). On a fresh kernel this cell would
# raise NameError unless the defining cell is restored; confirm.
# Reloads the CSV into a separate frame and rebinds `clusters`, replacing the
# result of create_clusters(df) from the earlier cell.
dfa = read_data()
clusters = cluster(dfa)
clusters

Importing data ...


  df[i] = df[i].str.replace('[^\w\s\.\-\(\)\,\:\/\\\\]','')
uids : b66c2280e5a4f2dfe84132bdff621053, 6c4464b737f36f8a7cb98d8583c6812b, 2ad879caff05957ef76a0ed30c8b56af, 13d5ef70954980f90f0d5ca75c1fee3a, a21ac925aff5528824450f8acbce145f

uids : b66c2280e5a4f2dfe84132bdff621053, 6c4464b737f36f8a7cb98d8583c6812b, 2ad879caff05957ef76a0ed30c8b56af, 13d5ef70954980f90f0d5ca75c1fee3a, a21ac925aff5528824450f8acbce145f, 6b6a7b9dd06135d4538513a444ba73d1

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


Starting active labeling...


In [None]:
# NOTE(review): bare name `yes` is not defined in any visible cell, so running
# this cell would raise NameError. It looks like an answer intended for the
# interactive labeling prompt shown in the previous cell's output; confirm
# and remove.
yes