In [1]:
!pip install jellyfish
!pip install faker



In [1]:
import pyspark
import jellyfish
import pandas as pd
import numpy as np
from typing import List
import os
import math
from itertools import combinations, product
import time
from concurrent.futures import ProcessPoolExecutor
from collections import defaultdict


In [2]:
class MyClass:
    def __init__(self, df1: pd.DataFrame, 
                 df2: pd.DataFrame, 
                 matchColumn: str, 
                 on: List = [],
                 method: str = 'column', 
                 threshold: float  = 0.6):
        self.df1 = df1
        self.df2 = df2
        self.on = on
        self.threshold = threshold

        if method not in ["concat", "column"]:
            raise ValueError(f"Method '{method}' is not correct.")
        self.method = method

    
        if matchColumn not in self.df1.columns or matchColumn not in self.df2.columns:
            raise ValueError(f"Column '{matchColumn}' is not found in both DataFrames.")
        self.matchColumn = matchColumn
        
        self.groundTruth = None
        self.totalMatches = None        
    
    def setGroundTruth(self):
        """Sets the ground truth based on matching 'id' columns."""
        self.groundTruth = np.intersect1d(self.df1[self.matchColumn], self.df2[self.matchColumn])

    def soundexDfs(self):
        """Apply soundex transformation to non-id columns."""
        for df in [self.df1, self.df2]:
            for col_name in df.columns:
                if col_name != self.matchColumn:
                    df[col_name] = df[col_name].apply(lambda x: jellyfish.soundex(str(x)))

            if self.method == 'concat':
                non_match_columns = [col for col in df.columns if col != self.matchColumn]
                df['concatenated'] = df[non_match_columns].apply(lambda row: ''.join(row.astype(str)), axis=1)
                df.drop(columns=non_match_columns, inplace=True)

    def setTotalMatches(self):
        """Sets the total matches based on merged DataFrames."""
        
        # if self.method == 'concat':
        #     self.totalMatches = self.df1.merge(self.df2, how="inner", on=['concatenated']).to_numpy()
        # else:   
        #     self.totalMatches = self.df1.merge(self.df2, how="outer", on=self.on + [self.matchColumn]).to_numpy()

        self.totalMatches =  self.df1.merge(pd.concat([self.df1, self.df2]), how='outer', on=self.on)[["0_y"] + self.on]
        self.totalMatches.rename(columns={'0_y': self.matchColumn}, inplace=True)
        
    def printStatistics(self):
        """Print statistics (True Positives, False Positives, Precision)."""
        myStatistics = self.Statistics(groundTruth=self.groundTruth, 
                                       totalMatches=self.totalMatches, 
                                       threshold=self.threshold, 
                                       on=self.on, 
                                       matchColumn=self.matchColumn)
        myStatistics.calculate()

    # Inner class Statistics
    class Statistics:
        def __init__(self,
                     groundTruth: pd.DataFrame, 
                     totalMatches: pd.DataFrame, 
                     threshold : float = 0.8,
                     matchColumn: str | int = 1,
                     on: List =[]):
            self.groundTruth = pd.DataFrame(groundTruth)
            self.totalMatches = pd.DataFrame(totalMatches)
            self.threshold = threshold
            self.matchColumn = matchColumn
            self.on = on

            self._setThresholdValues()
            
        def calculate(self):
            # self.result = self.totalMatches.groupby(self.matchColumn)\
            #         .filter(lambda x : len(x) >=2)\
            #         .groupby(self.matchColumn)\
            #         .apply(lambda x: x.iloc[:, 1:].apply(lambda x: x.nunique() == 1)).sum(axis=1)

            duplicates = self.totalMatches[self.totalMatches[[1,2,3,4,5]].duplicated(keep=False)].sort_values(by=[1,2,3,4,5])
        
            # Function to check if two rows match at least 3/5 columns
            def is_duplicate(row1, row2):
                return sum(row1 == row2) >=  self.matchingRows  # At least 3 matches out of 5

            # print(duplicates)
            
            duplicate_pairs = []
            # Find duplicates
            for i in range(len(duplicates)):
                for j in range(i + 1, len(duplicates)):  # Compare only unique pairs
                    if is_duplicate(duplicates.iloc[i, 1:], duplicates.iloc[j, 1:]):
                        duplicate_pairs.append((i, j, duplicates.iloc[i, 0] == duplicates.iloc[j, 0]))  # Store (index1, index2, same_id)

            # Count same ID and different ID duplicates
            tp = sum(1 for _, _, same_id in duplicate_pairs if same_id)
            fp = len(duplicate_pairs) - same_id_count
            fn = self.groundTruth.size - tp
            
            precision = tp / (tp + fp) if tp + fp != 0 else 0 
            recall = tp / (tp + fn)  if tp + fn != 0 else 0
            f1_score = (2 * precision * recall) / (precision + recall)
            
            print("Total Possible Mathces:", self.groundTruth.size)
            print("True Positives (TP):", tp)
            print("False Positives (FP):", fp)
            print("False Negatives (FN):", fn)
            print("Precision:", f"{precision:.4f}")
            print("Recall:", f"{recall:.4f}")
            print("F1-score:", f"{f1_score:.4f}")

        def _matchingAlgorithm(self, group):
            return group.nunique() == 1
            
        def _setThresholdValues(self) -> List:
            size = len(self.totalMatches.columns) - 1
            limit = math.floor(self.threshold * size)
            
            print(f"We accept at least {limit}/{size} as matches!") 
            self.matchingRows = limit
            # return [i for i in range(size, limit , -1)]
            

In [68]:
# Create two datasets with slight variations
# Data have 3 matches and one 
data1 = {
    0: [101, 102, 103, 104, 105, 106, 107, 108, 109, 110],
    1: ["Kostas", "Maria", "John", "Sophia", "George", "Eleni", "Michael", "Anna", "Chris", "Dimitris"],
    2: ["Razgkelis", "Papadopoulos", "Smith", "Johnson", "Pavlou", "Nikolaou", "Brown", "Miller", "Taylor", "Andreas"],
    3: ["Orestiada", "Thessaloniki", "Grevena", "Athina", "Aleksandroupoli", "Giannena", "Larissa", "Komotini", "Trikala", "Kozani"],
    4: ['Jennifer Lights', 'Brandon Lakes', 'Aguilar Stravenue', 'Richardson Ferry', 'Freeman Way', 
        'Gabrielle Underpass', 'Burns Summit', 'Heather Village', 'Jamie Common', 'Greg Lock'],
    5:  ['Cooper and Sons', 'Pope LLC', 'Fowler-Smith', 'Torres PLC', 'Jones LLC', 'White, Duncan and Robinson', 'Hayden Inc', 
         'Wilson and Sons', 'Peterson, Smith and Robinson','Hudson, Phelps and Day'],
    
}

data2 = {
    0: [101, 202, 203, 204, 205, 206, 207, 208, 209, 110],
    1: ["Kistas", "Maria", "John", "Sophasdia", "Giorge", "Elendsi", "Micheal", "Ana", "Khris", "Dimtris"],
    2: ["Rozgkliiis", "Papadopoulos", "Smith", "Johnson", "Pavlodvu", "Nikolaou", "Batrrroun", "antMiler", "Tttayloor", "Andres"],
    3: ["Orestiada", "Thessaloniki", "Grevena", "Athina", "Aleksandrouasdpoli", "Gianasdna", "Larasdissa", "Komasdini", "Trsadala", "Koxani"],
    4: ["Jnnfer Lights", 'Brandon Lakes', 'Aguilar Stravenue', 'RichardasasdaFerry', 'Freasdeman Way', 'Gabrielle Underpass', 'Burasdas mmit', 'Heatasasdllage', 'JamasdCommon', 'Grg Lck'],
    5:  ['Cpeeer and ons', 'Pope LLC', 'Fowler-Smith', 'Torvasd PLC', 'Jonasda LLC', 'Whitasddvuncan and Robinson', 'Hayasdasv Inc', 
         'Wasdvand Sons', 'Petersosdvaith and Robinson','Htsn, Phelps and Day'],
}




In [114]:

# Convert to DataFrame
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# Run pipeline and see statistics
pipeline = MyClass(df1, df2, matchColumn=0, on=[1, 2, 3, 4, 5], threshold=0.4)
pipeline.setGroundTruth()
pipeline.soundexDfs()
pipeline.setTotalMatches()
# pipeline.printStatistics()

0    101
Name: 0, dtype: int64
0    110
Name: 1, dtype: int64


In [244]:
PATH  =  "data/"

df1 = pd.read_csv(os.path.join(PATH, 'df1.csv'), header=None)[[0,1,2,3,4,5]]
df2 = pd.read_csv(os.path.join(PATH, 'df5.csv'), header=None)[[0,1,2,3,4,5]]

# Run pipeline and see statistics
pipeline = MyClass(df1, df2, matchColumn=0, on=[1,2,3,4,5], method="column", threshold = 0.6) #  --> this means at least 3/5 of the fields must match 
pipeline.setGroundTruth()
pipeline.soundexDfs()
# pipeline.setTotalMatches()
# pipeline.printStatistics()

In [53]:
def is_similar(row1, row2, threshold=4):
    return np.sum(row1[1:] == row2[1:]) >= threshold  # Compare columns 1-5


# Function to process chunks
def process_chunk(chunk_row_pairs, df1, df2, ground_truth):
    totalMatches = []
    for id1, id2 in chunk_row_pairs:
        if is_similar(df1[df1[:, 0] == id1][0], df2[df2[:, 0] == id2][0], 3):
            totalMatches.append((id1, id2))

    # Calculate tp, fp, fn for this chunk
    tp = sum(1 for x, y in totalMatches if x == y and x in ground_truth)
    fp = len(totalMatches) - tp
    fn = len(ground_truth) - tp
    
    return tp, fp, fn



start_time = time.time()

size = 200

df1 = pd.concat([pipeline.df1.iloc[:size], pipeline.df1.iloc[25_000:25000 + size * 3]]).to_numpy()
df2 = pd.concat([pipeline.df2.iloc[:size], pipeline.df2.iloc[25_000:25000 + size * 3]]).to_numpy()

ground_truth = np.intersect1d(df1[:, 0], df2[:, 0])

# Split row pairs into chunks
chunk_size = 1000  # Adjust chunk size based on memory and performance

# Process in chunks
total_tp, total_fp, total_fn = 0, 0, 0
for i in range(0, len(df1), chunk_size):
    # Generate row pairs for the current chunk
    chunk_row_pairs = list(product(df1[i:i + chunk_size, 0], df2[:, 0]))
    tp, fp, fn = process_chunk(chunk_row_pairs, df1, df2, ground_truth)
    
    total_tp += tp
    total_fp += fp
    total_fn += fn

# Calculate precision and recall
precision = total_tp / (total_tp + total_fp) if total_tp + total_fp > 0 else 0
recall = total_tp / (total_tp + total_fn) if total_tp + total_fn > 0 else 0

elapsed_time = time.time() - start_time

print(f"True Positives: {total_tp}")
print(f"False Positives: {total_fp}")
print(f"False Negatives: {total_fn}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Elapsed Time: {elapsed_time:.2f} seconds")

True Positives: 133
False Positives: 5
False Negatives: 67
Precision: 0.9638
Recall: 0.6650
Elapsed Time: 52.30 seconds


In [21]:
pd.concat([pipeline.df1.iloc[:250], pipeline.df1.iloc[25_000:25000 + 750]]).to_numpy().size

6000

In [40]:
sum([7.85, 8.35, 9.89, 8.78, 8.87])

1745433922.1039279

In [245]:
def is_similar(row1, row2, threshold=3):
    return np.sum(row1[1:] == row2[1:]) >= threshold

def process_chunk(chunk_row_pairs, df1_dict, df2_dict, ground_truth, threshold=3):
    total_matches = []
    for id1, id2 in chunk_row_pairs:
        row1 = df1_dict.get(id1)
        row2 = df2_dict.get(id2)
        if row1 is not None and row2 is not None:
            if is_similar(row1, row2, threshold):
                total_matches.append((id1, id2))

    tp = sum(1 for id1, id2 in total_matches if id1 == id2 and id1 in ground_truth)
    fp = len(total_matches) - tp
    
    return tp, fp
    
# Preprocess
size = 500
df1 = pd.concat([pipeline.df1.iloc[:size], pipeline.df1.iloc[25000:25000 + size * 3]]).to_numpy()
df2 = pd.concat([pipeline.df2.iloc[:size], pipeline.df2.iloc[25000:25000 + size * 3]]).to_numpy()

df1_dict = {row[0]: row for row in df1}
df2_dict = {row[0]: row for row in df2}
ground_truth = set(np.intersect1d(df1[:, 0], df2[:, 0]))

chunk_size = 500
total_tp = total_fp = total_fn = 0

start_time = time.time()

for i in range(0, len(df1), chunk_size):
    chunk_ids = df1[i:i + chunk_size, 0]
    chunk_row_pairs = list(product(chunk_ids, df2[:, 0]))
    tp, fp = process_chunk(chunk_row_pairs, df1_dict, df2_dict, ground_truth)
    total_tp += tp
    total_fp += fp

total_fn = len(ground_truth) - total_tp

precision = total_tp / (total_tp + total_fp) if total_tp + total_fp > 0 else 0
recall = total_tp / (total_tp + total_fn) if total_tp + total_fn > 0 else 0

elapsed_time = time.time() - start_time


print(f"True Positives: {total_tp}")
print(f"False Positives: {total_fp}")
print(f"False Negatives: {total_fn}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Elapsed Time: {elapsed_time:.2f} seconds")

True Positives: 330
False Positives: 50
False Negatives: 170
Precision: 0.8684
Recall: 0.6600
Elapsed Time: 26.34 seconds


In [None]:
148.94

In [62]:
len(ground_truth)

2000

In [None]:
n = 5000

True Positives: 3437
False Positives: 1697
False Negatives: 1563
Precision: 0.6695
Recall: 0.6874
Elapsed Time: 2599.92 seconds

n = 15000

True Positives: 9903
False Positives: 8710
False Negatives: 5097
Precision: 0.5320
Recall: 0.6602
Elapsed Time: 26709.04 seconds

In [333]:
# Index df2 by a hash on a few features to reduce comparisons
# For example, use a subset of columns to pre-group similar rows
# Here, we use the first column (ID) as a rough bucket
from collections import defaultdict
import time
import numpy as np
import pandas as pd

def is_similar(row1, row2, threshold=2):
    return np.sum(row1[1:] == row2[1:]) >= threshold

# Preprocess
size = 500
df1 = pd.concat([pipeline.df1.iloc[:size], pipeline.df1.iloc[25000:25000 + size * 3]]).to_numpy()
df2 = pd.concat([pipeline.df2.iloc[:size], pipeline.df2.iloc[25000:25000 + size * 3]]).to_numpy()

ground_truth_ids = set(np.intersect1d(df1[:,0], df2[:,0]))

df2_buckets = defaultdict(list)
for row in df2:
    df2_buckets[row[0]].append(row)  # You can change the key to a composite or substring

start_time = time.time()

matched = set()

for row1 in df1:
    candidates = df2_buckets.get(row1[0], [])
    for row2 in candidates:
        if is_similar(row1, row2, threshold=3):
            matched.add((row1[0], row2[0]))  # Track ID pairs
            break

total_tp = sum(1 for id1, id2 in matched if id1 == id2 and id1 in ground_truth_ids)
total_fp = sum(1 for id1, id2 in matched if id1 != id2 or id1 not in ground_truth_ids)
total_fn = len(ground_truth_ids) - total_tp 

precision = total_tp / (total_tp + total_fp) if total_tp + total_fp > 0 else 0
recall = total_tp / (total_tp + total_fn) if total_tp + total_fn > 0 else 0

elapsed_time = time.time() - start_time

print(f"True Positives: {total_tp}")
print(f"False Positives: {total_fp}")
print(f"False Negatives: {total_fn}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Elapsed Time: {elapsed_time:.2f} seconds")

InvalidIndexError: (slice(None, None, None), 0)

In [None]:
matched

In [19]:
df1[:,0]

array(['AA100000', 'AA100004', 'AA100006', ..., 'AB16959', 'AB16960',
       'AB16962'], dtype=object)

In [362]:
def is_similar(row1, row2, threshold=3):
    return np.sum(np.array(row1) == np.array(row2)) >= threshold

# PATH  =  "data/"

# df1 = pd.read_csv(os.path.join(PATH, 'df1.csv'), header=None)[[0,1,2,3,4,5]]
# df2 = pd.read_csv(os.path.join(PATH, 'df5.csv'), header=None)[[0,1,2,3,4,5]]

# # Run pipeline and see statistics
# pipeline = MyClass(df1, df2, matchColumn=0, on=[1,2,3,4,5], method="column", threshold = 0.6) #  --> this means at least 3/5 of the fields must match 
# pipeline.setGroundTruth()
# pipeline.soundexDfs()


# size = 1000

# df1 = pd.concat([df1[:size], df1[25000:25000 + size * 3]])
# df2 = pd.concat([df2[:size], df2[25000:25000 + size * 3]])

df1 = df1.apply(lambda x: (x[0] ,''.join(map(str, x[1:]))), axis=1).to_numpy()
df2 = df2.apply(lambda x: (x[0] ,''.join(map(str, x[1:]))), axis=1).to_numpy()

id_df1 = np.array([row[0] for row in df1])
id_df2 = np.array([row[0] for row in df2])

ground_truth_ids = np.intersect1d(id_df1, id_df2)

df2_buckets = defaultdict(list)
for row in df2:
    df2_buckets[row[1]].append(row[0])

start_time = time.time()

for value, key in df1:
    row1 = tuple(key[i:i+4] for i in range(0, len(key), 4))

    for data in df2_buckets:
        row2 = tuple(data[i:i+4] for i in range(0, len(data), 4))        
        if is_similar(row1, row2, 3):
            df2_buckets[data].append(value)  
            break
            
elapsed_time = time.time() - start_time


matched = set()
fp = 0

for row in df2_buckets:
    bucket = df2_buckets.get(row, [])

    if len(bucket) > 1:
        for id_ in set(bucket):
            if id_ in ground_truth_ids:
                matched.add(id_)
            else:
                fp += 1

tp = sum(1 for x in matched if x in ground_truth_ids)
fn = len(ground_truth_ids) - tp

precision = tp / (tp + fp) if tp + fp > 0 else 0
recall = tp / (tp + fn) if tp + fn > 0 else 0

print(f"True Positives: {tp}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Elapsed Time: {elapsed_time:.2f} seconds")

KeyboardInterrupt: 

In [None]:
True Positives: 330
False Positives: 50
False Negatives: 170
Precision: 0.8684
Recall: 0.6600
Elapsed Time: 26.34 seconds

In [330]:
df2

Unnamed: 0,id,col1,col2,col3,col4,col5
0,NEW69170,Z631,O057,E811,N890,V745
1,NEW42857,E710,E567,W738,P812,P543
2,NEW45364,G339,I476,R716,Q734,I953
3,NEW41107,T308,U581,E472,I891,R545
4,ID00900,F666,S903,W195,C685,N842
...,...,...,...,...,...,...
995,ID00579,A833,I881,X216,R850,P441
996,NEW13679,F611,Z437,W343,X130,A533
997,NEW16936,F577,R193,K928,O417,S138
998,NEW93404,O081,X493,C222,V901,C981


In [361]:
def is_similar(row1, row2, threshold=3):
    return np.sum(row1[1:] == row2[1:]) >= threshold

def process_chunk(chunk_row_pairs, df1_dict, df2_dict, ground_truth, threshold=3):
    total_matches = []
    for id1, id2 in chunk_row_pairs:
        row1 = df1_dict.get(id1)
        row2 = df2_dict.get(id2)
        if row1 is not None and row2 is not None:
            if is_similar(row1, row2, threshold):
                total_matches.append((id1, id2))

    tp = sum(1 for id1, id2 in total_matches if id1 == id2 and id1 in ground_truth)
    fp = len(total_matches) - tp
    
    return tp, fp
    
# Preprocess
size = 100
# df1 = pd.concat([pipeline.df1.iloc[:size], pipeline.df1.iloc[25000:25000 + size * 3]]).to_numpy()
# df2 = pd.concat([pipeline.df2.iloc[:size], pipeline.df2.iloc[25000:25000 + size * 3]]).to_numpy()

df1_dict = {row[0]: row for row in df1}
df2_dict = {row[0]: row for row in df2}
ground_truth = set(np.intersect1d(df1[:, 0], df2[:, 0]))

chunk_size = 500
total_tp = total_fp = total_fn = 0

start_time = time.time()

for i in range(0, len(df1), chunk_size):
    chunk_ids = df1[i:i + chunk_size, 0]
    chunk_row_pairs = list(product(chunk_ids, df2[:, 0]))
    tp, fp = process_chunk(chunk_row_pairs, df1_dict, df2_dict, ground_truth)
    total_tp += tp
    total_fp += fp

total_fn = len(ground_truth) - total_tp

precision = total_tp / (total_tp + total_fp) if total_tp + total_fp > 0 else 0
recall = total_tp / (total_tp + total_fn) if total_tp + total_fn > 0 else 0

elapsed_time = time.time() - start_time


print(f"True Positives: {total_tp}")
print(f"False Positives: {total_fp}")
print(f"False Negatives: {total_fn}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Elapsed Time: {elapsed_time:.2f} seconds")

InvalidIndexError: (slice(None, None, None), 0)

Expected True Matches  : 2
Expected False Negatives: 1
Actual Matching IDs     : 3 / 10 (30.00%)
df1 shape: (10, 6), df2 shape: (10, 6)
        id  col1  col2  col3  col4  col5
0  ID00000  W737  O083  L781  U807  N750
1  ID00001  M203  N237  K327  H388  E033
2  ID00002  D293  X070  F503  N259  Z907
3  ID00003  Q838  H798  L724  D046  V178
4  ID00004  C905  P332  I506  R882  X144
5  ID00005  I101  S497  X475  J147  E626
6  ID00006  O656  F322  Q652  W307  P146
7  ID00007  L865  S662  Q602  C806  M518
8  ID00008  Y277  C751  G014  O671  F017
9  ID00009  S498  C664  R896  F864  N665
         id  col1  col2  col3  col4  col5
0  NEW88641  L092  P218  O812  X312  G985
1  NEW43709  R050  M302  I676  K509  B771
2  NEW46619  W314  D085  D943  U371  U807
3  NEW14239  R772  F547  O593  E888  H875
4  NEW44149  B484  N080  S323  L901  Q864
5  NEW18412  J462  P631  F149  C803  B834
6  NEW73466  M203  N237  B551  H388  M671
7   ID00009  S498  L278  R896  A974  N665
8   ID00005  M226  L650  Y159  U660

In [2]:
def is_similar(row1, row2, threshold=3):
    return np.sum(row1[1:] == row2[1:]) >= threshold

from packages.generateDataSets import SyntheticMatcherDataset

dataset = SyntheticMatcherDataset(size=100, match_ratio=0.25, false_positive_ratio=0.15, false_negative_ratio=0.10, threshold=3)

# Access data
df1, df2 = dataset.get_dataframes()
ground_truth = dataset.get_ground_truth()
false_negatives = dataset.get_false_negatives()

# Show stats
dataset.stats()

# See samples
print(df1)
print(df2)

df1 = df1.apply(lambda x: (x[0] ,''.join(map(str, x[1:]))), axis=1).to_numpy()
df2 = df2.apply(lambda x: (x[0] ,''.join(map(str, x[1:]))), axis=1).to_numpy()

id_df1 = np.array([row[0] for row in df1])
id_df2 = np.array([row[0] for row in df2])

ground_truth_ids = np.intersect1d(id_df1, id_df2)

df2_buckets = defaultdict(list)
for row in df2:
    df2_buckets[row[1]].append(row[0])

start_time = time.time()

for value, key in df1:
    row1 = tuple(key[i:i + 4] for i in range(0, len(key), 4))

    for data in df2_buckets:
        row2 = tuple(data[i:i+4] for i in range(0, len(data), 4))        
        if is_similar(row1, row2, 3):
            df2_buckets[data].append(value)  
            break
            
elapsed_time = time.time() - start_time


matched = set()
fp = 0

for row in df2_buckets:
    bucket = df2_buckets.get(row, [])

    if len(bucket) > 1:
        for id_ in set(bucket):
            if id_ in ground_truth_ids:
                matched.add(id_)
            else:
                fp += 1

tp = sum(1 for x in matched if x in ground_truth_ids)
fn = len(ground_truth_ids) - tp

precision = tp / (tp + fp) if tp + fp > 0 else 0
recall = tp / (tp + fn) if tp + fn > 0 else 0

print(f"True Positives: {tp}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Elapsed Time: {elapsed_time:.2f} seconds")

0.15 15 10 25
Expected True Matches  : 25


AttributeError: 'SyntheticMatcherDataset' object has no attribute 'false_positives'

In [13]:
df1

array([('ID00000', 'Z133G643A830G186O673'),
       ('ID00001', 'G502L756G011A005B975'),
       ('ID00002', 'R918K658T053L029T391'),
       ('ID00003', 'K087J532O236K885X553'),
       ('ID00004', 'W468L599S556R260Q984'),
       ('ID00005', 'H759T398I774Y355J200'),
       ('ID00006', 'F234G707J068Z108R795'),
       ('ID00007', 'P511U416E038U038O839'),
       ('ID00008', 'U430U878O634Z158H676'),
       ('ID00009', 'X922J947T824H971O196'),
       ('ID00010', 'A983Z087B822T631U879'),
       ('ID00011', 'W911Y522J464B848C439'),
       ('ID00012', 'J110L844K601H338B439'),
       ('ID00013', 'O754B943R158C714M019'),
       ('ID00014', 'U926R329F415L384I766'),
       ('ID00015', 'E177L593O595B274T521'),
       ('ID00016', 'Q711R242D421A134V409'),
       ('ID00017', 'T832U842O425G080O585'),
       ('ID00018', 'Q575A014E340A908Y478'),
       ('ID00019', 'B845S177B740I018G926'),
       ('ID00020', 'Q895W534K072U530P074'),
       ('ID00021', 'R954Z921U060P396I317'),
       ('ID00022', 'U560Q119Z861