In [None]:
import pandas as pd

from packages.utils.generate_data_set import SyntheticMatcherDataset
from packages.pandas.pandas_pipeline import DatasetEvaluator


In [None]:

# Data for df1
# Hand-built fixture rows: one identifier followed by five attribute codes.
# The trailing tags pair a row with the row carrying the same tag in data2;
# the tag counts (4 TP, 1 FN, 2 FP) agree with the `expected` dict defined
# further down in this cell.
# NOTE(review): presumably TP/FN/FP stand for true positive / false negative /
# false positive as judged by DatasetEvaluator with threshold=3 — confirm.
data1 = [
    ['ID00005', 'N039', 'E298', 'Q412', 'V409', 'R232'], #TP1
    ['ID00009', 'R822', 'W179', 'H017', 'P323', 'F298'], #TP2
    ['ID00007', 'R449', 'X716', 'M948', 'G667', 'S702'], #TP3
    ['ID00004', 'N002', 'E396', 'N843', 'I458', 'S719'], #TP4
    ['ID10004', 'N002', 'E396', 'N853', 'I623', 'S569'], #FN1
    ['NEW72378', 'J547', 'B222', 'G492', 'R551', 'S490'], #FP1
    ['ID00008', 'N322', 'K685', 'T442', 'C825', 'W967'], #FP2
    ['ID00000', 'W815', 'L281', 'R155', 'F768', 'B914'],
    ['ID00001', 'C172', 'B326', 'X400', 'M508', 'O776'],
    ['ID00002', 'V683', 'C265', 'J127', 'D589', 'F482'],
    ['ID00003', 'E851', 'P721', 'F745', 'D863', 'K229'],
    ['ID00016', 'T873', 'D670', 'U046', 'Z181', 'X621'],
    ['ID00017', 'F327', 'G856', 'E567', 'O929', 'Q721'],
    ['ID00010', 'O283', 'T723', 'Z034', 'V319', 'X338'],
]

# Data for df2
# Same row layout as data1. The five identifiers tagged TP1-TP4 and FN1 are the
# only ones that also appear in data1, which matches 'gt': 5 in `expected`.
# NOTE(review): 'NEW30110' is used as the identifier of two different rows
# below (the FP2 row and an untagged row) — confirm this is intentional.
data2 = [
    ['ID00005', 'R746', 'E298', 'Q412', 'L291', 'R232'], #TP1
    ['ID00009', 'R822', 'W179', 'H017', 'P323', 'F298'], #TP2
    ['ID00007', 'Z011', 'X716', 'M948', 'W967', 'S702'], #TP3
    ['ID00004', 'N002', 'E396', 'N843', 'V935', 'S719'], #TP4
    ['ID10004', 'N002', 'E396', 'N553', 'I453', 'S459'], #FN1
    ['NEW80187', 'J547', 'B222', 'G492', 'W673', 'S490'], #FP1
    ['NEW30110', 'N322', 'K685', 'T432', 'C225', 'W967'], #FP2
    ['NEW72832', 'F875', 'Q768', 'H822', 'Z154', 'X678'], 
    ['NEW30110', 'R560', 'C434', 'M687', 'Q689', 'Q863'],
    ['NEW81243', 'R762', 'N687', 'A109', 'K476', 'R637'],
    ['NEW52689', 'A089', 'V733', 'W158', 'A640', 'H331'],
    ['NEW67368', 'Z079', 'J617', 'G878', 'W111', 'Q500'],
    ['NEW72348', 'J547', 'B222', 'G492', 'R551', 'S490'],
    ['NEW34469', 'Y990', 'H898', 'W673', 'L967', 'M829'],
]

# Wrap both fixtures in DataFrames that share the integer column labels 0..5.
columns = list(range(6))
df1, df2 = (pd.DataFrame(rows, columns=columns) for rows in (data1, data2))

# Reference counts handed to the evaluator for this hand-built fixture.
expected = dict(gt=5, tp=4, fp=2, fn=1)

# Run the pipeline one stage at a time, then print the report.
evaluator = DatasetEvaluator(
    df1,
    df2,
    expected=expected,
    trim=0,
    threshold=3,
)
evaluator.preprocess()
evaluator.evaluate()
evaluator.calculate_statistics()
evaluator.printResults()

preprocess took 0.0022 seconds
evaluate took 0.0008 seconds
calculateStatistics took 0.0000 seconds
Expected: {'gt': 5, 'tp': 4, 'fp': 2, 'fn': 1}
Ground Truth Size: 5
True Positives: 4
False Positives: 2
False Negatives: 1
Precision: 0.6667
Recall: 0.8000


In [11]:
def describe_time_change(baseline: float, candidate: float) -> str:
    """Summarise how `candidate` run time compares with `baseline` run time.

    Both percentages are expressed relative to `baseline`, so a slowdown from
    100 s to 200 s is reported as a 100% increase and a speed-up from 200 s to
    100 s as a 50% reduction.

    Args:
        baseline: Average time of the reference pipeline, in seconds.
        candidate: Average time of the pipeline being compared, in seconds.

    Returns:
        A one-line, human-readable verdict.

    Raises:
        ValueError: If either time is not strictly positive (a ratio against
            a zero or negative duration is meaningless).
    """
    if baseline <= 0 or candidate <= 0:
        raise ValueError("average times must be positive")
    if candidate > baseline:
        # Increase is measured against the baseline, not against the slower run.
        return f"{(candidate / baseline - 1) * 100:.2f}% increase time ---> not improvement"
    if candidate == baseline:
        return "0.00% change ---> no improvement"
    return f"-{(1 - candidate / baseline) * 100:.2f}% reduce time ---> improvement"


time1 = 578.75
time2 = 114.03

print(f"Pipeline 1 avg time: {time1} sec")
print(f"Pipeline 2 avg time: {time2} sec")

# Pipeline 1 is the baseline; Pipeline 2 is the candidate being judged.
print(describe_time_change(time1, time2))

Pipeline 1 avg time: 578.75 sec
Pipeline 2 avg time: 114.03 sec
-80.30% reduce time ---> improvement


In [None]:
Pipeline 2 — elapsed time per dataset size:
1250 Elapsed Time: 1.72 seconds
2500 Elapsed Time: 7.02 seconds
5000 Elapsed Time: 28.09 seconds
10000 Elapsed Time: 114.03 seconds


Pipeline 1 — elapsed time per dataset size:
1250 Elapsed Time: 6.29 seconds
2500 Elapsed Time: 26 seconds
5000 Elapsed Time: 108.56 seconds
10000 Elapsed Time: 578.75 seconds

In [None]:
# Generate a synthetic pair of datasets together with its expected counts.
dataset = SyntheticMatcherDataset(
    size=500,
    ground_truth_ratio=0.25,
    datasets_ratio=(1, 2),
    true_positive_ratio=0.75,
    threshold=3,
)
df1 = dataset.df1
df2 = dataset.df2
expected = dataset.expected

# Same stage-by-stage run as for the hand-built fixture, matching on "id".
evaluator = DatasetEvaluator(
    df1,
    df2,
    expected,
    threshold=3,
    trim=0,
    match_column="id",
)
evaluator.preprocess()
evaluator.evaluate()
evaluator.calculate_statistics()
evaluator.printResults()

preprocess took 0.1427 seconds
evaluate took 0.7788 seconds
calculateStatistics took 0.0002 seconds
Expected: {'gt': 125, 'tp': 93, 'fp': 69, 'fn': 32}
Ground Truth Size: 125
True Positives: 93
False Positives: 69
False Negatives: 32
Precision: 0.5741
Recall: 0.7440


: 

In [45]:
from itertools import combinations

columns = ['col1', 'col2', 'col3', 'col4', 'col5']

# Store result as (column_triplet, unique_count)
unique_counts = []

for cols in combinations(columns, 3):
    # Count distinct value tuples directly. Concatenating the values with ''
    # would merge different tuples that happen to spell the same string
    # (e.g. 'ab'+'c' vs 'a'+'bc'), fails on non-string values, and walks the
    # frame row by row in Python.
    count = len(df2.drop_duplicates(subset=list(cols)))
    unique_counts.append((cols, count))

# Find the combination with the minimum unique count
# (on ties, min() keeps the first triplet in combinations() order).
best_combination = min(unique_counts, key=lambda x: x[1])

print("Best column triplet (least unique values):", best_combination[0])
print("Number of unique combinations:", best_combination[1])

Best column triplet (least unique values): ('col1', 'col2', 'col3')
Number of unique combinations: 10000
