In [1]:
!pip install jellyfish
!pip install faker



In [1]:
import pyspark
import jellyfish
import pandas as pd
import numpy as np
from typing import List
import os
import math
from itertools import combinations, product
import time
from concurrent.futures import ProcessPoolExecutor
from collections import defaultdict
from packages.generateDataSets import SyntheticMatcherDataset
from packages.calculateStatistics import DatasetEvaluator

In [2]:

# Data for df1
data1 = [
    ['ID00005', 'N039', 'E298', 'Q412', 'V409', 'R232'], #TP1
    ['ID00009', 'R822', 'W179', 'H017', 'P323', 'F298'], #TP2
    ['ID00007', 'R449', 'X716', 'M948', 'G667', 'S702'], #TP3
    ['ID00004', 'N002', 'E396', 'N843', 'I458', 'S719'], #TP4
    ['ID10004', 'N002', 'E396', 'N853', 'I623', 'S569'], #FN1
    ['NEW72378', 'J547', 'B222', 'G492', 'R551', 'S490'], #FP1
    ['ID00008', 'N322', 'K685', 'T442', 'C825', 'W967'], #FP2
    ['ID00000', 'W815', 'L281', 'R155', 'F768', 'B914'],
    ['ID00001', 'C172', 'B326', 'X400', 'M508', 'O776'],
    ['ID00002', 'V683', 'C265', 'J127', 'D589', 'F482'],
    ['ID00003', 'E851', 'P721', 'F745', 'D863', 'K229'],
    ['ID00016', 'T873', 'D670', 'U046', 'Z181', 'X621'],
    ['ID00017', 'F327', 'G856', 'E567', 'O929', 'Q721'],
    ['ID00010', 'O283', 'T723', 'Z034', 'V319', 'X338'],
]

# Data for df2
data2 = [
    ['ID00005', 'R746', 'E298', 'Q412', 'L291', 'R232'], #TP1
    ['ID00009', 'R822', 'W179', 'H017', 'P323', 'F298'], #TP2
    ['ID00007', 'Z011', 'X716', 'M948', 'W967', 'S702'], #TP3
    ['ID00004', 'N002', 'E396', 'N843', 'V935', 'S719'], #TP4
    ['ID10004', 'N002', 'E396', 'N553', 'I453', 'S459'], #FN1
    ['NEW80187', 'J547', 'B222', 'G492', 'W673', 'S490'], #FP1
    ['NEW30110', 'N322', 'K685', 'T432', 'C225', 'W967'], #FP2
    ['NEW72832', 'F875', 'Q768', 'H822', 'Z154', 'X678'], 
    ['NEW30110', 'R560', 'C434', 'M687', 'Q689', 'Q863'],
    ['NEW81243', 'R762', 'N687', 'A109', 'K476', 'R637'],
    ['NEW52689', 'A089', 'V733', 'W158', 'A640', 'H331'],
    ['NEW67368', 'Z079', 'J617', 'G878', 'W111', 'Q500'],
    ['NEW72348', 'J547', 'B222', 'G492', 'R551', 'S490'],
    ['NEW34469', 'Y990', 'H898', 'W673', 'L967', 'M829'],
]

# Create DataFrames
columns = ['id', 'col1', 'col2', 'col3', 'col4', 'col5']
df1 = pd.DataFrame(data1, columns=columns)
df2 = pd.DataFrame(data2, columns=columns)
expected = {'gt': 5, 'tp': 4, 'fp': 2, 'fn': 1}

evaluator = DatasetEvaluator(df1, df2, expected, threshold=3, match_column='id')
evaluator.evaluate()
# evaluator.calculateStatistics()
evaluator.printResults()

Expected: {'gt': 5, 'tp': 4, 'fp': 2, 'fn': 1}
Ground Truth Size: 5
True Positives: 4
False Positives: 0
False Negatives: 1
Precision: 1.0000
Recall: 0.8000
Elapsed Time: 0.00 seconds


In [4]:
dataset = SyntheticMatcherDataset(size=1250 , true_positive_ratio=0.70, threshold=3)
df1, df2 = dataset.df1, dataset.df2
expected = dataset.expected

evaluator = DatasetEvaluator(df1, df2, expected, threshold=3, match_column="id")
evaluator.evaluate()
evaluator.printResults()

Expected: {'gt': 312, 'tp': 218, 'fp': 94, 'fn': 94}
Ground Truth Size: 312
True Positives: 218
False Positives: 94
False Negatives: 94
Precision: 0.6987
Recall: 0.6987
Elapsed Time: 1.78 seconds


In [None]:
1250 Elapsed Time: 6.29 seconds
2500 Elapsed Time: 26 seconds 
5000 Elapsed Time: 108.56 seconds
10000 Elapsed Time: 578.75 seconds

In [2]:
import timeit

def pipeline1():
    dataset = SyntheticMatcherDataset(size=1000 , true_positive_ratio=0.70, threshold=3)
    df1, df2 = dataset.df1, dataset.df2
    expected = dataset.expected
    
    unique_tokens = pd.unique(pd.concat([df1.iloc[:, 1:6], df2.iloc[:, 1:6]], axis=0).stack())
    token_map = {token: idx for idx, token in enumerate(unique_tokens)}
    
    def map_row(row):
        return [token_map[val] for val in row[1:6]]
        
    # Keep column 0 as-is
    df1_ids = df1.iloc[:, [0]]
    
    # Apply mapping only on columns 1 to 5
    df1_mapped = df1.iloc[:, 1:6].apply(map_row, axis=1, result_type='expand')
    
    # Concatenate back with column 0
    df1_final = pd.concat([df1_ids, df1_mapped], axis=1)
    
    # Repeat for df2
    df2_ids = df2.iloc[:, [0]]
    df2_mapped = df2.iloc[:, 1:6].apply(map_row, axis=1, result_type='expand')
    df2_final = pd.concat([df2_ids, df2_mapped], axis=1)
        
    evaluator = DatasetEvaluator(df1, df2, expected, threshold=3, match_column="id")
    evaluator.evaluate()
    # evaluator.printResults()

def pipeline2():
    dataset = SyntheticMatcherDataset(size=1000 , true_positive_ratio=0.70, threshold=3)
    df1, df2 = dataset.df1, dataset.df2
    expected = dataset.expected
    
    evaluator = DatasetEvaluator(df1, df2, expected, threshold=3, match_column="id")
    evaluator.evaluate()
    # evaluator.printResults()

time1 = timeit.timeit(pipeline1, number=10)
time2 = timeit.timeit(pipeline2, number=10)

print(f"Pipeline 1 avg time: {time1 / 10:.4f} sec")
print(f"Pipeline 2 avg time: {time2 / 10:.4f} sec")

if time1 < time2:
    print(f"{1 - time1/time2:.2f}% increase time ---> not improvement")
else:
    print(f"-{1 - time2/time1:.2f}% reduce time ---> improvement")
    

Pipeline 1 avg time: 6.8511 sec
Pipeline 2 avg time: 5.4194 sec
-0.21% reduce time ---> improvement


In [7]:
Pipeline 1 avg time: 6.2479 sec
Pipeline 2 avg time: 6.0408 sec

Expected: {'gt': 125, 'tp': 87, 'fp': 38, 'fn': 38}
Ground Truth Size: 125
True Positives: 87
False Positives: 38
False Negatives: 38
Precision: 0.6960
Recall: 0.6960
Elapsed Time: 1.26 seconds

1746562494.410388

In [38]:
if time1 < time2:
    print(f"{1 - time1/time2:.2f}% increase time ---> not improvement")
else:
    print(f"-{1 - time2/time1:.2f}% reduce time ---> improvement")
    

-0.03% reduce time ---> improvement


0.9668528625618208