In [1]:
!pip install jellyfish
!pip install faker



In [103]:
import pyspark
import jellyfish
import pandas as pd
import numpy as np
from typing import List
import os
import math
import time
from collections import defaultdict
from packages.generateDataSets import SyntheticMatcherDataset
from packages.calculateStatistics import DatasetEvaluator
from itertools import combinations

In [2]:

# Data for df1
data1 = [
    ['ID00005', 'N039', 'E298', 'Q412', 'V409', 'R232'], #TP1
    ['ID00009', 'R822', 'W179', 'H017', 'P323', 'F298'], #TP2
    ['ID00007', 'R449', 'X716', 'M948', 'G667', 'S702'], #TP3
    ['ID00004', 'N002', 'E396', 'N843', 'I458', 'S719'], #TP4
    ['ID10004', 'N002', 'E396', 'N853', 'I623', 'S569'], #FN1
    ['NEW72378', 'J547', 'B222', 'G492', 'R551', 'S490'], #FP1
    ['ID00008', 'N322', 'K685', 'T442', 'C825', 'W967'], #FP2
    ['ID00000', 'W815', 'L281', 'R155', 'F768', 'B914'],
    ['ID00001', 'C172', 'B326', 'X400', 'M508', 'O776'],
    ['ID00002', 'V683', 'C265', 'J127', 'D589', 'F482'],
    ['ID00003', 'E851', 'P721', 'F745', 'D863', 'K229'],
    ['ID00016', 'T873', 'D670', 'U046', 'Z181', 'X621'],
    ['ID00017', 'F327', 'G856', 'E567', 'O929', 'Q721'],
    ['ID00010', 'O283', 'T723', 'Z034', 'V319', 'X338'],
]

# Data for df2
data2 = [
    ['ID00005', 'R746', 'E298', 'Q412', 'L291', 'R232'], #TP1
    ['ID00009', 'R822', 'W179', 'H017', 'P323', 'F298'], #TP2
    ['ID00007', 'Z011', 'X716', 'M948', 'W967', 'S702'], #TP3
    ['ID00004', 'N002', 'E396', 'N843', 'V935', 'S719'], #TP4
    ['ID10004', 'N002', 'E396', 'N553', 'I453', 'S459'], #FN1
    ['NEW80187', 'J547', 'B222', 'G492', 'W673', 'S490'], #FP1
    ['NEW30110', 'N322', 'K685', 'T432', 'C225', 'W967'], #FP2
    ['NEW72832', 'F875', 'Q768', 'H822', 'Z154', 'X678'], 
    ['NEW30110', 'R560', 'C434', 'M687', 'Q689', 'Q863'],
    ['NEW81243', 'R762', 'N687', 'A109', 'K476', 'R637'],
    ['NEW52689', 'A089', 'V733', 'W158', 'A640', 'H331'],
    ['NEW67368', 'Z079', 'J617', 'G878', 'W111', 'Q500'],
    ['NEW72348', 'J547', 'B222', 'G492', 'R551', 'S490'],
    ['NEW34469', 'Y990', 'H898', 'W673', 'L967', 'M829'],
]

# Create DataFrames
columns = ['id', 'col1', 'col2', 'col3', 'col4', 'col5']
df1 = pd.DataFrame(data1, columns=columns)
df2 = pd.DataFrame(data2, columns=columns)
expected = {'gt': 5, 'tp': 4, 'fp': 2, 'fn': 1}

evaluator = DatasetEvaluator(df1, df2, expected, threshold=3, match_column='id')
evaluator.evaluate()
# evaluator.calculateStatistics()
evaluator.printResults()

Expected: {'gt': 5, 'tp': 4, 'fp': 2, 'fn': 1}
Ground Truth Size: 5
True Positives: 0
False Positives: 0
False Negatives: 5
Precision: 0.0000
Recall: 0.0000
Elapsed Time: 0.00 seconds


In [11]:
time1 =  578.75
time2 = 114.03

print(f"Pipeline 1 avg time: {time1} sec")
print(f"Pipeline 2 avg time: { time2} sec")

if time1 < time2:
    print(f"{(1 - time1/time2)*100:.2f}% increase time ---> not improvement")
else:
    print(f"-{(1 - time2/time1)*100:.2f}% reduce time ---> improvement")

Pipeline 1 avg time: 578.75 sec
Pipeline 2 avg time: 114.03 sec
-80.30% reduce time ---> improvement


In [None]:
1250 Elapsed Time: 1.72 seconds
2500 Elapsed Time: 7.02 seconds 
5000 Elapsed Time: 28.09 seconds
10000 Elapsed Time: 114.03 seconds


1250 Elapsed Time: 6.29 seconds
2500 Elapsed Time: 26 seconds 
5000 Elapsed Time: 108.56 seconds
10000 Elapsed Time: 578.75 seconds

In [101]:
dataset = SyntheticMatcherDataset(size=500 ,  ground_truth_ratio=0.25, datasets_ratio = (1, 1), true_positive_ratio=0.75, threshold=3)
df1, df2 = dataset.df1, dataset.df2
expected = dataset.expected

evaluator = DatasetEvaluator(df1, df2, expected, threshold=3, match_column="id")
evaluator.evaluate()
evaluator.printResults()

Expected: {'gt': 125, 'tp': 93, 'fp': 69, 'fn': 32}
Ground Truth Size: 125
True Positives: 93
False Positives: 69
False Negatives: 32
Precision: 0.5741
Recall: 0.7440
Elapsed Time: 0.33 seconds


In [104]:
def fast_chunk(s):
    bs = s.encode('utf-8')
    chunk_len = len(bs) // 4 * 4  # ensure multiple of 4
    return np.frombuffer(bs[:chunk_len], dtype='S4')
    
# Step 1: Find best 3-column combination (least unique combinations)
columns = ['col1', 'col2', 'col3', 'col4', 'col5']
unique_counts = []

for cols in combinations(columns, 3):
    count = df2[list(cols)].astype(str).agg(''.join, axis=1).nunique()
    unique_counts.append((cols, count))

best_combination = min(unique_counts, key=lambda x: x[1])
best_cols = list(best_combination[0])

# Step 2: Build index based on best 3 columns
df2['index_key'] = df2[best_cols].astype(str).agg(''.join, axis=1)
df2_proc = df2.apply(lambda x: (x['id'], ''.join(map(str, x[1:6])), x['index_key']), axis=1).to_numpy()

df1['index_key'] = df1[best_cols].astype(str).agg(''.join, axis=1)
df1_proc = df1.apply(lambda x: (x['id'], ''.join(map(str, x[1:6])), x['index_key']), axis=1).to_numpy()


# Step 3: Build hashed buckets from df2 (by index_key)
hashed_bucket = defaultdict(lambda: defaultdict(list))

for id_val, full_str, index_key in df2_proc:
    hashed_bucket[index_key][full_str].append(id_val)

# Precompute chunked df1 strings once
chunked_df1 = [(fast_chunk(hash), fast_chunk(combined), match_id) for match_id, combined, hash in df1_proc]

start = time.time()

for index_key, df1_chunks, match_id in chunked_df1:
    if key in hashed_bucket:
        if np.count_nonzero(index_key == fast_chunk(key)) == 0:
            continue

        for full_str in hashed_bucket[key]:
            df2_chunks = fast_chunk(full_str)
            if len(df1_chunks) != len(df2_chunks):
                continue

            match_count = np.count_nonzero(df1_chunks == df2_chunks)
            if match_count >= 3:
                hashed_bucket[key][full_str].append(match_id)
                break

print(time.time() - start)

flat = []
for subdict in hashed_bucket.values():
    for ids in subdict.values():
        flat.append(ids)

fp, tp, fn = 0, 0, 0

for bucket in flat:
    if len(bucket) > 1:
        ids_to_check = np.array(bucket[1:])
        if (any(np.isin(ids_to_check, ground_truth_ids_np))):
            tp += 1
        else:
            fp += 1

fn = len(ground_truth_ids_np) - tp
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0

tp, fn, fp, precision, recall

0.657210111618042


(93, 32, 69, 0.5740740740740741, 0.744)

In [102]:
from itertools import combinations
from collections import defaultdict
import numpy as np
import time

def fast_chunk(s):
    bs = s.encode('utf-8')
    chunk_len = len(bs) // 4 * 4  # ensure multiple of 4
    return np.frombuffer(bs[:chunk_len], dtype='S4')

ground_truth_ids_np = np.intersect1d(df1['id'], df2['id'])
# Step 1: Find best 3-column combination (least unique combinations)
columns = ['col1', 'col2', 'col3', 'col4', 'col5']
unique_counts = []

for cols in combinations(columns, 3):
    count = df2[list(cols)].astype(str).agg(''.join, axis=1).nunique()
    unique_counts.append((cols, count))

best_combination = min(unique_counts, key=lambda x: x[1])
best_cols = list(best_combination[0])

# Step 2: Build index based on best 3 columns
df2['index_key'] = df2[best_cols].astype(str).agg(''.join, axis=1)
df2_proc = df2.apply(lambda x: (x['id'], ''.join(map(str, x[1:6])), x['index_key']), axis=1).to_numpy()

df1['index_key'] = df1[best_cols].astype(str).agg(''.join, axis=1)
df1_proc = df1.apply(lambda x: (x['id'], ''.join(map(str, x[1:6])), x['index_key']), axis=1).to_numpy()



# Step 3: Build hashed buckets from df2 (by index_key)
hashed_bucket = defaultdict(lambda: defaultdict(list))

for id_val, full_str, index_key in df2_proc:
    hashed_bucket[index_key][full_str].append(id_val)

# Precompute chunked df1 strings once
chunked_df1 = [(hash, fast_chunk(combined), match_id) for match_id, combined, hash in df1_proc]

fp, fn, tp = 0, 0 , 0
start = time.time()

for index_key, df1_chunks, match_id in chunked_df1:
    for key in list(hashed_bucket.keys()):
        fast_key1 = fast_chunk(index_key)
        if np.count_nonzero(fast_key1 == fast_chunk(key)) == 0:
            continue

        for full_str in hashed_bucket[key]:
            df2_chunks = fast_chunk(full_str)
            if len(df1_chunks) != len(df2_chunks):
                continue

            match_count = np.count_nonzero(df1_chunks == df2_chunks)
            if match_count >= 3:
                hashed_bucket[key][full_str].append(match_id)
                break

print(time.time() - start)

flat = []
for subdict in hashed_bucket.values():
    for ids in subdict.values():
        flat.append(ids)

for bucket in flat:
    if len(bucket) > 1:
        ids_to_check = np.array(bucket[1:])
        mask = np.isin(ids_to_check, ground_truth_ids_np)
        matched |= set(ids_to_check[mask])
        fp += mask.size - np.count_nonzero(mask)

tp = len(matched)
fn = len(ground_truth_ids_np) - tp
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0

tp, fn, fp, precision, recall

0.9082980155944824


(2327, -2202, 69, 0.9712020033388982, 18.616)

In [2]:
import timeit

def pipeline1():
    dataset = SyntheticMatcherDataset(size=1000 , true_positive_ratio=0.70, threshold=3)
    df1, df2 = dataset.df1, dataset.df2
    expected = dataset.expected
    
    unique_tokens = pd.unique(pd.concat([df1.iloc[:, 1:6], df2.iloc[:, 1:6]], axis=0).stack())
    token_map = {token: idx for idx, token in enumerate(unique_tokens)}
    
    def map_row(row):
        return [token_map[val] for val in row[1:6]]
        
    # Keep column 0 as-is
    df1_ids = df1.iloc[:, [0]]
    
    # Apply mapping only on columns 1 to 5
    df1_mapped = df1.iloc[:, 1:6].apply(map_row, axis=1, result_type='expand')
    
    # Concatenate back with column 0
    df1_final = pd.concat([df1_ids, df1_mapped], axis=1)
    
    # Repeat for df2
    df2_ids = df2.iloc[:, [0]]
    df2_mapped = df2.iloc[:, 1:6].apply(map_row, axis=1, result_type='expand')
    df2_final = pd.concat([df2_ids, df2_mapped], axis=1)
        
    evaluator = DatasetEvaluator(df1, df2, expected, threshold=3, match_column="id")
    evaluator.evaluate()
    # evaluator.printResults()

def pipeline2():
    dataset = SyntheticMatcherDataset(size=1000 , true_positive_ratio=0.70, threshold=3)
    df1, df2 = dataset.df1, dataset.df2
    expected = dataset.expected
    
    evaluator = DatasetEvaluator(df1, df2, expected, threshold=3, match_column="id")
    evaluator.evaluate()
    # evaluator.printResults()

time1 = timeit.timeit(pipeline1, number=10)
time2 = timeit.timeit(pipeline2, number=10)

print(f"Pipeline 1 avg time: {time1 / 10:.4f} sec")
print(f"Pipeline 2 avg time: {time2 / 10:.4f} sec")

if time1 < time2:
    print(f"{1 - time1/time2:.2f}% increase time ---> not improvement")
else:
    print(f"-{1 - time2/time1:.2f}% reduce time ---> improvement")
    

Pipeline 1 avg time: 6.8511 sec
Pipeline 2 avg time: 5.4194 sec
-0.21% reduce time ---> improvement


In [7]:
Pipeline 1 avg time: 6.2479 sec
Pipeline 2 avg time: 6.0408 sec

Expected: {'gt': 125, 'tp': 87, 'fp': 38, 'fn': 38}
Ground Truth Size: 125
True Positives: 87
False Positives: 38
False Negatives: 38
Precision: 0.6960
Recall: 0.6960
Elapsed Time: 1.26 seconds

1746562494.410388

In [38]:
if time1 < time2:
    print(f"{1 - time1/time2:.2f}% increase time ---> not improvement")
else:
    print(f"-{1 - time2/time1:.2f}% reduce time ---> improvement")
    

-0.03% reduce time ---> improvement


0.9668528625618208

In [3]:
{'R746E298Q412L291R232': ['ID00005'], 'R822W179H017P323F298': ['ID00009'], 'Z011X716M948W967S702': ['ID00007'], 'N002E396N843V935S719': ['ID00004'], 'N002E396N553I453S459': ['ID10004']}

{'R746E298Q412L291R232': ['ID00005'],
 'R822W179H017P323F298': ['ID00009'],
 'Z011X716M948W967S702': ['ID00007'],
 'N002E396N843V935S719': ['ID00004'],
 'N002E396N553I453S459': ['ID10004']}

In [45]:
from itertools import combinations

columns = ['col1', 'col2', 'col3', 'col4', 'col5']

# Store result as (column_triplet, unique_count)
unique_counts = []

for cols in combinations(columns, 3):
    count = df2[list(cols)].agg(''.join, axis=1).nunique()
    unique_counts.append((cols, count))

# Find the combination with the minimum unique count
best_combination = min(unique_counts, key=lambda x: x[1])

print("Best column triplet (least unique values):", best_combination[0])
print("Number of unique combinations:", best_combination[1])

Best column triplet (least unique values): ('col1', 'col2', 'col3')
Number of unique combinations: 10000


In [7]:
df1_proc = df1.apply(lambda x: (x[0], ''.join(map(str, x[1:]))), axis=1).to_numpy()
df2_proc = df2.apply(lambda x: (x[0], ''.join(map(str, x[1:]))), axis=1).to_numpy()

In [41]:
asd1 = df2[['col1', 'col2', 'col3']].agg(''.join, axis=1).nunique(), ['col1', 'col2', 'col3']
asd2 = df2[['col2', 'col3', 'col4']].agg(''.join, axis=1).nunique(), ['col2', 'col3', 'col4']
asd3 = df2[['col3', 'col4', 'col5']].agg(''.join, axis=1).nunique(), ['col3', 'col4', 'col5']
min([asd1,asd2,asd3])

(100, ['col1', 'col2', 'col3'])

In [3]:
def fast_chunk(s):
    bs = s.encode('utf-8')
    chunk_len = len(bs) // 4 * 4  # ensure multiple of 4
    return np.frombuffer(bs[:chunk_len], dtype='S4')

array([b'F553', b'F553', b'F553', b'F553', b'F553'], dtype='|S4')

In [46]:
unique_counts

[(('col1', 'col2', 'col3'), 10000),
 (('col1', 'col2', 'col4'), 10000),
 (('col1', 'col2', 'col5'), 10000),
 (('col1', 'col3', 'col4'), 10000),
 (('col1', 'col3', 'col5'), 10000),
 (('col1', 'col4', 'col5'), 10000),
 (('col2', 'col3', 'col4'), 10000),
 (('col2', 'col3', 'col5'), 10000),
 (('col2', 'col4', 'col5'), 10000),
 (('col3', 'col4', 'col5'), 10000)]

In [None]:
hashed_bucket

False

(4, 2, 3, 0.5714285714285714, 0.6666666666666666)

In [81]:
# Evaluate precision/recall
matched = set()
fp = 0
ground_truth_ids_np = np.intersect1d(df1['id'], df2['id'])

for bucket in hashed_bucket.values():
    if len(bucket) > 1:
        ids_to_check = np.array(bucket[1:])  # exclude the original
        mask = np.isin(ids_to_check, ground_truth_ids_np)
        matched |= set(ids_to_check[mask])
        fp += mask.size - np.count_nonzero(mask)

tp = len(matched)
fn = len(ground_truth_ids_np) - tp
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0

tp, fn, fp, precision, recall

(0, 6, 0, 0, 0.0)

In [80]:
np.array(list(np.intersect1d(df1['id'], df2['id'])))

array(['ID00001', 'ID00009', 'ID00011', 'ID00013', 'ID00014', 'ID00016'],
      dtype='<U7')

In [66]:
np.array(list(np.intersect1d(df1['id'], df2['id']))).size

6

In [73]:
ids_to_check

array(['ID00017'], dtype='<U7')