In [1]:
!pip install jellyfish
!pip install faker



In [1]:
import pyspark
import jellyfish
import pandas as pd
import numpy as np
from typing import List
import os
import math
from itertools import combinations, product
import time
from concurrent.futures import ProcessPoolExecutor
from collections import defaultdict
from packages.generateDataSets import SyntheticMatcherDataset
from packages.calculateStatistics import DatasetEvaluator

In [2]:

# Data for df1
data2 = [
    ['ID00005', 'N039', 'E298', 'Q412', 'V409', 'R232'], #TP1
    ['ID00009', 'R822', 'W179', 'H017', 'P323', 'F298'], #TP2
    ['ID00007', 'R449', 'X716', 'M948', 'G667', 'S702'], #TP3
    ['ID00004', 'N002', 'E396', 'N843', 'I458', 'S719'], #TP4
    ['ID10004', 'N002', 'E396', 'N853', 'I623', 'S569'], #FN1
    ['NEW72378', 'J547', 'B222', 'G492', 'R551', 'S490'], #FP1
    ['ID00008', 'N322', 'K685', 'T442', 'C825', 'W967'], #FP2
    ['ID00000', 'W815', 'L281', 'R155', 'F768', 'B914'],
    ['ID00001', 'C172', 'B326', 'X400', 'M508', 'O776'],
    ['ID00002', 'V683', 'C265', 'J127', 'D589', 'F482'],
    ['ID00003', 'E851', 'P721', 'F745', 'D863', 'K229'],
    ['ID00016', 'T873', 'D670', 'U046', 'Z181', 'X621'],
    ['ID00017', 'F327', 'G856', 'E567', 'O929', 'Q721'],
    ['ID00010', 'O283', 'T723', 'Z034', 'V319', 'X338'],
]

# Data for df2
data1 = [
    ['ID00005', 'R746', 'E298', 'Q412', 'L291', 'R232'], #TP1
    ['ID00009', 'R822', 'W179', 'H017', 'P323', 'F298'], #TP2
    ['ID00007', 'Z011', 'X716', 'M948', 'W967', 'S702'], #TP3
    ['ID00004', 'N002', 'E396', 'N843', 'V935', 'S719'], #TP4
    ['ID10004', 'N002', 'E396', 'N553', 'I453', 'S459'], #FN1
    ['NEW80187', 'J547', 'B222', 'G492', 'W673', 'S490'], #FP1
    ['NEW30110', 'N322', 'K685', 'T432', 'C225', 'W967'], #FP2
    ['NEW72832', 'F875', 'Q768', 'H822', 'Z154', 'X678'], 
    ['NEW30110', 'R560', 'C434', 'M687', 'Q689', 'Q863'],
    ['NEW81243', 'R762', 'N687', 'A109', 'K476', 'R637'],
    ['NEW52689', 'A089', 'V733', 'W158', 'A640', 'H331'],
    ['NEW67368', 'Z079', 'J617', 'G878', 'W111', 'Q500'],
    ['NEW72348', 'J547', 'B222', 'G492', 'R551', 'S490'],
    ['NEW34469', 'Y990', 'H898', 'W673', 'L967', 'M829'],
]

# Create DataFrames
columns = ['id', 'col1', 'col2', 'col3', 'col4', 'col5']
df1 = pd.DataFrame(data1, columns=columns)
df2 = pd.DataFrame(data2, columns=columns)
expected = {'gt': 5, 'tp': 4, 'fp': 2, 'fn': 1}

evaluator = DatasetEvaluator(df1, df2, expected, threshold=3, match_column='id')
evaluator.evaluate()
evaluator.calculateStatistics()
evaluator.printResults()

{
    "N039E298Q412": {
        "V409R232": [
            "ID00005"
        ]
    },
    "R822W179H017": {
        "P323F298": [
            "ID00009"
        ]
    },
    "R449X716M948": {
        "G667S702": [
            "ID00007"
        ]
    },
    "N002E396N843": {
        "I458S719": [
            "ID00004"
        ]
    },
    "N002E396N853": {
        "I623S569": [
            "ID10004"
        ]
    },
    "J547B222G492": {
        "R551S490": [
            "NEW72378"
        ]
    },
    "N322K685T442": {
        "C825W967": [
            "ID00008"
        ]
    },
    "W815L281R155": {
        "F768B914": [
            "ID00000"
        ]
    },
    "C172B326X400": {
        "M508O776": [
            "ID00001"
        ]
    },
    "V683C265J127": {
        "D589F482": [
            "ID00002"
        ]
    },
    "E851P721F745": {
        "D863K229": [
            "ID00003"
        ]
    },
    "T873D670U046": {
        "Z181X621": [
            "ID00016"
        ]
    },
 

In [2]:
dataset = SyntheticMatcherDataset(size=1000, true_positive_ratio=0.70, threshold=3)
df1, df2 = dataset.df1, dataset.df2
expected = dataset.expected

evaluator = DatasetEvaluator(df1, df2, expected, threshold=3, match_column="id")
evaluator.evaluate()
evaluator.calculateStatistics()
evaluator.printResults()

{'ID00017', 'ID00680', 'ID00711', 'ID00157', 'ID00896', 'ID00814', 'ID00749', 'ID00625', 'ID00751', 'ID00807', 'ID00343', 'ID00822', 'ID00628', 'ID00845', 'ID00336', 'ID00138', 'ID00953', 'ID00371', 'ID00989', 'ID00003', 'ID00351', 'ID00708', 'ID00578', 'ID00842', 'ID00027', 'ID00700', 'ID00107', 'ID00230', 'ID00464', 'ID00154', 'ID00933', 'ID00178', 'ID00120', 'ID00798', 'ID00713', 'ID00106', 'ID00188', 'ID00053', 'ID00272', 'ID00780', 'ID00873', 'ID00346', 'ID00840', 'ID00070', 'ID00165', 'ID00056', 'ID00085', 'ID00615', 'ID00558', 'ID00656', 'ID00029', 'ID00961', 'ID00149', 'ID00954', 'ID00632', 'ID00905', 'ID00179', 'ID00446', 'ID00536', 'ID00795', 'ID00392', 'ID00356', 'ID00323', 'ID00206', 'ID00364', 'ID00957', 'ID00330', 'ID00241', 'ID00936', 'ID00212', 'ID00253', 'ID00887', 'ID00863', 'ID00641', 'ID00851', 'ID00139', 'ID00942', 'ID00201', 'ID00069', 'ID00005', 'ID00101', 'ID00480', 'ID00684', 'ID00499', 'ID00710', 'ID00686', 'ID00203', 'ID00963', 'ID00355', 'ID00059', 'ID00327'

In [3]:
evaluator.hash_buckets

defaultdict(list,
            {'Q016W341': [('ID00000', 'Q016W341N113K454V055')],
             'W876T339': [('ID00001', 'W876T339I480A016G749')],
             'O430T337': [('ID00002', 'O430T337L098L553M842')],
             'Z084Q390': [('ID00003', 'Z084Q390O841L497K401')],
             'U390I768': [('ID00004', 'U390I768R842Z942S692')],
             'T226V975': [('ID00005', 'T226V975C925I336H700')],
             'C873G746': [('ID00006', 'C873G746R836I556F118')],
             'T358H574': [('ID00007', 'T358H574O822G973J834')],
             'C556U456': [('ID00008', 'C556U456R789Z115D946')],
             'K473S123': [('ID00009', 'K473S123I540L594O662')],
             'L311A669': [('ID00010', 'L311A669D311U789Z660')],
             'S050G974': [('ID00011', 'S050G974F178F834T932')],
             'M517A245': [('ID00012', 'M517A245M082K791N933')],
             'Z507S883': [('ID00013', 'Z507S883K273W621U774')],
             'D011S591': [('ID00014', 'D011S591D885D634D106')],
             'V009L754

In [3]:
evaluator.hash_buckets


defaultdict(list,
            {'N039E298': [('ID00005', 'N039E298Q412V409R232')],
             'R822W179': [('ID00009', 'R822W179H017P323F298')],
             'R449X716': [('ID00007', 'R449X716M948G667S702')],
             'N002E396': [('ID00004', 'N002E396N843I458S719'),
              ('ID10004', 'N002E396N853I623S569')],
             'J547B222': [('NEW72378', 'J547B222G492R551S490')],
             'N322K685': [('ID00008', 'N322K685T442C825W967')],
             'W815L281': [('ID00000', 'W815L281R155F768B914')],
             'C172B326': [('ID00001', 'C172B326X400M508O776')],
             'V683C265': [('ID00002', 'V683C265J127D589F482')],
             'E851P721': [('ID00003', 'E851P721F745D863K229')],
             'T873D670': [('ID00016', 'T873D670U046Z181X621')],
             'F327G856': [('ID00017', 'F327G856E567O929Q721')],
             'O283T723': [('ID00010', 'O283T723Z034V319X338')]})

In [8]:
df1, df2

(          0     1     2     3     4     5
 0   ID00000  H057  O528  S989  P371  M235
 1   ID00001  P533  O138  L763  I140  W743
 2   ID00002  C093  J326  F722  X963  A862
 3   ID00003  O524  Z363  R491  E934  P710
 4   ID00004  B954  Z304  Q352  J046  D670
 5   ID00005  V030  A960  R416  P067  F628
 6   ID00006  Y329  L186  Y513  J243  Y319
 7   ID00007  U184  X267  D366  V941  D990
 8   ID00008  O257  T263  P446  Y511  X354
 9   ID00009  T311  K188  T131  L387  Y580
 10  ID00010  F948  C769  N747  D475  H421
 11  ID00011  P442  H406  L610  V542  O809
 12  ID00012  E234  K245  V663  J979  O744
 13  ID00013  H400  M273  I938  P226  K032
 14  ID00014  W445  S097  O298  K351  Z131
 15  ID00015  O069  X547  M369  A533  N813
 16  ID00016  S004  L227  N811  R618  N403
 17  ID00017  A283  H501  N315  K167  B217
 18  ID00018  X057  K592  N553  Z796  W436
 19  ID00019  D263  K011  M491  L281  U462,
             0     1     2     3     4     5
 0     ID00008  O257  T263  P446  B409  X354
 1    