In [1]:
!pip install jellyfish
!pip install faker



In [1]:
import pyspark
import jellyfish
import pandas as pd
import numpy as np
from typing import List
import os
import math
from itertools import combinations, product
import time
from concurrent.futures import ProcessPoolExecutor
from collections import defaultdict
from packages.generateDataSets import SyntheticMatcherDataset
from packages.calculateStatistics import DatasetEvaluator

In [2]:
class MyClass:
    def __init__(self, df1: pd.DataFrame, 
                 df2: pd.DataFrame, 
                 matchColumn: str, 
                 on: List = [],
                 method: str = 'column', 
                 threshold: float  = 0.6):
        self.df1 = df1
        self.df2 = df2
        self.on = on
        self.threshold = threshold

        if method not in ["concat", "column"]:
            raise ValueError(f"Method '{method}' is not correct.")
        self.method = method

    
        if matchColumn not in self.df1.columns or matchColumn not in self.df2.columns:
            raise ValueError(f"Column '{matchColumn}' is not found in both DataFrames.")
        self.matchColumn = matchColumn
        
        self.groundTruth = None
        self.totalMatches = None        
    
    def setGroundTruth(self):
        """Sets the ground truth based on matching 'id' columns."""
        self.groundTruth = np.intersect1d(self.df1[self.matchColumn], self.df2[self.matchColumn])

    def soundexDfs(self):
        """Apply soundex transformation to non-id columns."""
        for df in [self.df1, self.df2]:
            for col_name in df.columns:
                if col_name != self.matchColumn:
                    df[col_name] = df[col_name].apply(lambda x: jellyfish.soundex(str(x)))

            if self.method == 'concat':
                non_match_columns = [col for col in df.columns if col != self.matchColumn]
                df['concatenated'] = df[non_match_columns].apply(lambda row: ''.join(row.astype(str)), axis=1)
                df.drop(columns=non_match_columns, inplace=True)

    def setTotalMatches(self):
        """Sets the total matches based on merged DataFrames."""
        
        # if self.method == 'concat':
        #     self.totalMatches = self.df1.merge(self.df2, how="inner", on=['concatenated']).to_numpy()
        # else:   
        #     self.totalMatches = self.df1.merge(self.df2, how="outer", on=self.on + [self.matchColumn]).to_numpy()

        self.totalMatches =  self.df1.merge(pd.concat([self.df1, self.df2]), how='outer', on=self.on)[["0_y"] + self.on]
        self.totalMatches.rename(columns={'0_y': self.matchColumn}, inplace=True)
        
    def printStatistics(self):
        """Print statistics (True Positives, False Positives, Precision)."""
        myStatistics = self.Statistics(groundTruth=self.groundTruth, 
                                       totalMatches=self.totalMatches, 
                                       threshold=self.threshold, 
                                       on=self.on, 
                                       matchColumn=self.matchColumn)
        myStatistics.calculate()

    # Inner class Statistics
    class Statistics:
        def __init__(self,
                     groundTruth: pd.DataFrame, 
                     totalMatches: pd.DataFrame, 
                     threshold : float = 0.8,
                     matchColumn: str | int = 1,
                     on: List =[]):
            self.groundTruth = pd.DataFrame(groundTruth)
            self.totalMatches = pd.DataFrame(totalMatches)
            self.threshold = threshold
            self.matchColumn = matchColumn
            self.on = on

            self._setThresholdValues()
            
        def calculate(self):
            # self.result = self.totalMatches.groupby(self.matchColumn)\
            #         .filter(lambda x : len(x) >=2)\
            #         .groupby(self.matchColumn)\
            #         .apply(lambda x: x.iloc[:, 1:].apply(lambda x: x.nunique() == 1)).sum(axis=1)

            duplicates = self.totalMatches[self.totalMatches[[1,2,3,4,5]].duplicated(keep=False)].sort_values(by=[1,2,3,4,5])
        
            # Function to check if two rows match at least 3/5 columns
            def is_duplicate(row1, row2):
                return sum(row1 == row2) >=  self.matchingRows  # At least 3 matches out of 5

            # print(duplicates)
            
            duplicate_pairs = []
            # Find duplicates
            for i in range(len(duplicates)):
                for j in range(i + 1, len(duplicates)):  # Compare only unique pairs
                    if is_duplicate(duplicates.iloc[i, 1:], duplicates.iloc[j, 1:]):
                        duplicate_pairs.append((i, j, duplicates.iloc[i, 0] == duplicates.iloc[j, 0]))  # Store (index1, index2, same_id)

            # Count same ID and different ID duplicates
            tp = sum(1 for _, _, same_id in duplicate_pairs if same_id)
            fp = len(duplicate_pairs) - same_id_count
            fn = self.groundTruth.size - tp
            
            precision = tp / (tp + fp) if tp + fp != 0 else 0 
            recall = tp / (tp + fn)  if tp + fn != 0 else 0
            f1_score = (2 * precision * recall) / (precision + recall)
            
            print("Total Possible Mathces:", self.groundTruth.size)
            print("True Positives (TP):", tp)
            print("False Positives (FP):", fp)
            print("False Negatives (FN):", fn)
            print("Precision:", f"{precision:.4f}")
            print("Recall:", f"{recall:.4f}")
            print("F1-score:", f"{f1_score:.4f}")

        def _matchingAlgorithm(self, group):
            return group.nunique() == 1
            
        def _setThresholdValues(self) -> List:
            size = len(self.totalMatches.columns) - 1
            limit = math.floor(self.threshold * size)
            
            print(f"We accept at least {limit}/{size} as matches!") 
            self.matchingRows = limit
            # return [i for i in range(size, limit , -1)]
            

In [3]:
PATH  =  "data/"

df3 = pd.read_csv(os.path.join(PATH, 'df1.csv'), header=None)[[0,1,2,3,4,5]]
df4 = pd.read_csv(os.path.join(PATH, 'df2.csv'), header=None)[[0,1,2,3,4,5]]

# Run pipeline and see statistics
pipeline = MyClass(df3, df4, matchColumn=0, on=[1,2,3,4,5], method="column", threshold = 0.6) #  --> this means at least 3/5 of the fields must match 
pipeline.setGroundTruth()
pipeline.soundexDfs()

In [4]:
evaluator = DatasetEvaluator(pipeline.df1, pipeline.df2, threshold=3)
evaluator.evaluate()
evaluator.printResults()

Expected: {}
Ground Truth Size: 25000
True Positives: 19036
False Positives: 17830
False Negatives: 5964
Precision: 0.5164
Recall: 0.7614
Elapsed Time: 33682.16 seconds


In [8]:
1 - 33682.16/81137.75

0.5848768298356806

In [None]:



# df1 -> df2
thresh = 3
Expected: {}
Ground Truth Size: 25000
True Positives: 19036
False Positives: 17830
False Negatives: 5964
Precision: 0.5164
Recall: 0.7614
Elapsed Time: 81137.75 seconds


# df1 -> df5
thresh = 2
Expected: {}
Ground Truth Size: 25000
True Positives: 24977
False Positives: 74055
False Negatives: 23
Precision: 0.2522
Recall: 0.9991
Elapsed Time: 4861.54 seconds

# df1 -> df5
thresh = 3
Expected: {}
Ground Truth Size: 25000
True Positives: 19125
False Positives: 16323
False Negatives: 5875
Precision: 0.5395
Recall: 0.7650
Elapsed Time: 80256.47 seconds

In [1]:
import pyspark
import jellyfish
import pandas as pd
import numpy as np
from typing import List
import os
import math
from itertools import combinations, product
import time
from concurrent.futures import ProcessPoolExecutor
from collections import defaultdict
from packages.generateDataSets import SyntheticMatcherDataset
from packages.calculateStatistics import DatasetEvaluator

In [68]:
# Create two datasets with slight variations
# Data have 3 matches and one 
data1 = {
    0: [101, 102, 103, 104, 105, 106, 107, 108, 109, 110],
    1: ["Kostas", "Maria", "John", "Sophia", "George", "Eleni", "Michael", "Anna", "Chris", "Dimitris"],
    2: ["Razgkelis", "Papadopoulos", "Smith", "Johnson", "Pavlou", "Nikolaou", "Brown", "Miller", "Taylor", "Andreas"],
    3: ["Orestiada", "Thessaloniki", "Grevena", "Athina", "Aleksandroupoli", "Giannena", "Larissa", "Komotini", "Trikala", "Kozani"],
    4: ['Jennifer Lights', 'Brandon Lakes', 'Aguilar Stravenue', 'Richardson Ferry', 'Freeman Way', 
        'Gabrielle Underpass', 'Burns Summit', 'Heather Village', 'Jamie Common', 'Greg Lock'],
    5:  ['Cooper and Sons', 'Pope LLC', 'Fowler-Smith', 'Torres PLC', 'Jones LLC', 'White, Duncan and Robinson', 'Hayden Inc', 
         'Wilson and Sons', 'Peterson, Smith and Robinson','Hudson, Phelps and Day'],
    
}

data2 = {
    0: [101, 202, 203, 204, 205, 206, 207, 208, 209, 110],
    1: ["Kistas", "Maria", "John", "Sophasdia", "Giorge", "Elendsi", "Micheal", "Ana", "Khris", "Dimtris"],
    2: ["Rozgkliiis", "Papadopoulos", "Smith", "Johnson", "Pavlodvu", "Nikolaou", "Batrrroun", "antMiler", "Tttayloor", "Andres"],
    3: ["Orestiada", "Thessaloniki", "Grevena", "Athina", "Aleksandrouasdpoli", "Gianasdna", "Larasdissa", "Komasdini", "Trsadala", "Koxani"],
    4: ["Jnnfer Lights", 'Brandon Lakes', 'Aguilar Stravenue', 'RichardasasdaFerry', 'Freasdeman Way', 'Gabrielle Underpass', 'Burasdas mmit', 'Heatasasdllage', 'JamasdCommon', 'Grg Lck'],
    5:  ['Cpeeer and ons', 'Pope LLC', 'Fowler-Smith', 'Torvasd PLC', 'Jonasda LLC', 'Whitasddvuncan and Robinson', 'Hayasdasv Inc', 
         'Wasdvand Sons', 'Petersosdvaith and Robinson','Htsn, Phelps and Day'],
}




In [114]:

# Convert to DataFrame
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# Run pipeline and see statistics
pipeline = MyClass(df1, df2, matchColumn=0, on=[1, 2, 3, 4, 5], threshold=0.4)
pipeline.setGroundTruth()
pipeline.soundexDfs()
pipeline.setTotalMatches()
# pipeline.printStatistics()

In [2]:
PATH  =  "data/"

df1 = pd.read_csv(os.path.join(PATH, 'df1.csv'), header=None)[[0,1,2,3,4,5]]
df2 = pd.read_csv(os.path.join(PATH, 'df2.csv'), header=None)[[0,1,2,3,4,5]]

# Run pipeline and see statistics
pipeline = MyClass(df1, df2, matchColumn=0, on=[1,2,3,4,5], method="column", threshold = 0.6) #  --> this means at least 3/5 of the fields must match 
pipeline.setGroundTruth()
pipeline.soundexDfs()

evaluator = DatasetEvaluator(pipeline.df1, pipeline.df2, expected, threshold=3)
evaluator.evaluate()
evaluator.printResults()

NameError: name 'pd' is not defined

NameError: name 'expected' is not defined

In [6]:
80256.47/60/60

22.293463888888887

In [19]:
df3, df4 = pipeline.df1, pipeline.df2

In [20]:
dasd = pd.concat([df1,df2, df3]).drop_duplicates().reset_index(drop=True)

In [21]:
dasd

Unnamed: 0,0,1,2,3,4,5
0,AA100000,B323,J520,A620,3523,G650
1,AA100004,B620,M240,E421,2251,B645
2,AA100006,B620,D520,W460,3345,H616
3,AA100007,K260,K600,L500,1225,G650
4,AA100008,K536,J600,F652,4235,B645
...,...,...,...,...,...,...
295962,AK132571,B260,C530,A500,7214,O252
295963,AK132572,K520,L200,R352,4215,S316
295964,AK132573,K400,J520,K400,4562,S316
295965,AK132574,H200,J200,M300,6563,A200


In [None]:
df1_proc = df3.apply(lambda x: (x[self.match_column], ''.join(map(str, x[1:]))), axis=1).to_numpy()
df2_proc = df4.apply(lambda x: (x[self.match_column], ''.join(map(str, x[1:]))), axis=1).to_numpy()

id_df1 = np.array([row[0] for row in df1_proc])
id_df2 = np.array([row[0] for row in df2_proc])

# Build bucket of df2 rows grouped by values
df2_buckets = defaultdict(list)
for row in df2_proc:
    df2_buckets[row[1]].append(row[0])


In [26]:

start_time = time.time()

df1_proc = test = df3.apply(lambda x: (x[0], ''.join(map(str, x[1:]))), axis=1).to_numpy()
print(time.time() - start_time)

start_time = time.time()
id_df1 = np.array([row[0] for row in df1_proc])
print(time.time() - start_time)

5.222691774368286
0.019913434982299805


In [25]:
test

array([('AA100000', 'B325J120A6203523G650'),
       ('AA100004', 'B620M240E4232251B645'),
       ('AA100006', 'B632D500W4603345H616'), ...,
       ('AK132573', 'K400J520K4004562S316'),
       ('AK132574', 'H200J200M3006563A200'),
       ('AK132575', 'C615M600M6262452S140')], dtype=object)

In [28]:
df2_buckets = defaultdict(list)
for row in df1_proc:
    df2_buckets[row[1]].append(row[0])

In [29]:
df2_buckets

defaultdict(list,
            {'B325J120A6203523G650': ['AA100000'],
             'B620M240E4232251B645': ['AA100004'],
             'B632D500W4603345H616': ['AA100006'],
             'K226K600L5201256G650': ['AA100007'],
             'K536J656F6244235B645': ['AA100008'],
             'B621J100F6522221G650': ['AA100009'],
             'K413R540L5005252B645': ['AA100013'],
             'L521K623L5002525B645': ['AA100014'],
             'B420B640J5003435E450': ['AA100015'],
             'W300K623K5205241B642': ['AA100016'],
             'K620T520C4233416B645': ['AA100017'],
             'K520D500K5308521G650': ['AA100019'],
             'L125R543J5206262G125': ['AA100027'],
             'L500W425C1003141H160': ['AA100029'],
             'L510F600L0002253B645': ['AA100032'],
             'M550J520H6001134B264': ['AA100034'],
             'M216P362N2005220G610': ['AA100038'],
             'M256J220E3262124H616': ['AA100039'],
             'M255B140N0003632G650': ['AA100043'],
             