<div align="center"> 
    <hr>
  <font size="6"><b>Dev | Clean-Clean Entity Resolution Notebook</b> </font>
   <hr>
</div>

In [3]:
!python --version

Python 3.9.16


In [4]:
import os
import sys
import pandas as pd
import networkx
from networkx import draw, Graph
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
from pyjedai.utils import (
    text_cleaning_method,
    print_clusters,
    print_blocks,
    print_candidate_pairs
)
from pyjedai.evaluation import Evaluation, write
from pyjedai.datamodel import Data
d1 = pd.read_csv("../data/ccer/D2/abt.csv", sep='|', engine='python', na_filter=False).astype(str)
d2 = pd.read_csv("../data/ccer/D2/buy.csv", sep='|', engine='python', na_filter=False).astype(str)
gt = pd.read_csv("../data/ccer/D2/gt.csv", sep='|', engine='python')

data = Data(
    dataset_1=d2,
    attributes_1=['id','name','description'],
    id_column_name_1='id',
    dataset_2=d1,
    attributes_2=['id','name','description'],
    id_column_name_2='id',
    ground_truth=gt,
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Block Building

In [5]:
from pyjedai.block_building import (
    StandardBlocking,
    QGramsBlocking,
    ExtendedQGramsBlocking,
    SuffixArraysBlocking,
    ExtendedSuffixArraysBlocking,
)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
qgb = QGramsBlocking(qgrams=6)
blocks = qgb.build_blocks(data, attributes_1=['name'], attributes_2=['name'])

Q-Grams Blocking: 100%|██████████| 2152/2152 [00:00<00:00, 26082.82it/s]


In [7]:
qgb.evaluate(blocks, with_classification_report=True)

***************************************************************************************************************************
                                         Μethod:  Q-Grams Blocking
***************************************************************************************************************************
Method name: Q-Grams Blocking
Parameters: 
	Q-Gramms: 6
Runtime: 0.0835 seconds
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Performance:
	Precision:      0.05% 
	Recall:        15.89%
	F1-score:       0.10%
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Classification report:
	True positives: 171
	False positives: 338647
	True negatives: 818224
	False negatives: 905
	Total comparisons: 338818
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


{'Precision %': 0.05046957363540308,
 'Recall %': 15.892193308550187,
 'F1 %': 0.10061960493565641,
 'True Positives': 171,
 'False Positives': 338647,
 'True Negatives': 818224,
 'False Negatives': 905}

# Block Cleaning

In [8]:
from pyjedai.block_cleaning import BlockFiltering

In [9]:
bf = BlockFiltering(ratio=0.8)
filtered_blocks = bf.process(blocks, data, tqdm_disable=False)

Block Filtering: 100%|██████████| 3/3 [00:00<00:00, 139.01it/s]


# Comparison Cleaning


In [10]:
from pyjedai.block_cleaning import BlockPurging

In [11]:
cbbp = BlockPurging()
cleaned_blocks = cbbp.process(filtered_blocks, data, tqdm_disable=False)

Block Purging: 100%|██████████| 4938/4938 [00:00<00:00, 709953.49it/s]


In [12]:
cbbp.evaluate(cleaned_blocks, with_classification_report=True)

***************************************************************************************************************************
                                         Μethod:  Block Purging
***************************************************************************************************************************
Method name: Block Purging
Parameters: 
	Smoothing factor: 1.025
	Max Comparisons per Block: 4104.0
Runtime: 0.0081 seconds
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Performance:
	Precision:      0.06% 
	Recall:         6.60%
	F1-score:       0.11%
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Classification report:
	True positives: 71
	False positives: 128734
	True negatives: 1028037
	False negatives: 1005
	Total comparisons: 128805
──────────────────────────────────────────────────────────────────────────────────────────

{'Precision %': 0.055122083770039985,
 'Recall %': 6.598513011152416,
 'F1 %': 0.10933084900793803,
 'True Positives': 71,
 'False Positives': 128734,
 'True Negatives': 1028037,
 'False Negatives': 1005}

# Progressive Entity Matching

Applies Entity Matching to a subset of candidate pairs based on prioritization scheme.

In [44]:
from pyjedai.prioritization import (
    GlobalTopPM,
    LocalTopPM,
    EmbeddingsNNBPM,
    GlobalPSNM,
    LocalPSNM,
    RandomPM,
    PESM,
    WhooshPM
)

In [55]:
%%time
inorder_ltpm = LocalTopPM(
    weighting_scheme = 'X2',
    similarity_function='dice',
    similarity_threshold=0.1,
    attributes = ['description', 'name']
)

inorder_ltpm_pairs = inorder_ltpm.predict(data, cleaned_blocks, budget = 5000, algorithm='HB', indexing='inorder', tqdm_disable=True, emit_all_tps_stop=False)

CPU times: user 1.41 s, sys: 3.99 ms, total: 1.42 s
Wall time: 1.41 s


In [56]:
%%time
reverse_ltpm = LocalTopPM(
    weighting_scheme = 'X2',
    similarity_function='dice',
    similarity_threshold=0.1,
    attributes = ['description', 'name']
)

reverse_ltpm_pairs = reverse_ltpm.predict(data, cleaned_blocks, budget = 5000, algorithm='HB', indexing='reverse', tqdm_disable=True, emit_all_tps_stop=False)

CPU times: user 1.44 s, sys: 271 µs, total: 1.44 s
Wall time: 1.43 s


In [57]:
%%time
bilateral_ltpm = LocalTopPM(
    weighting_scheme = 'X2',
    similarity_function='dice',
    similarity_threshold=0.1,
    attributes = ['description', 'name']
)

bilateral_ltpm_pairs = bilateral_ltpm.predict(data, cleaned_blocks, budget = 5000, algorithm='HB', indexing='bilateral', tqdm_disable=True, emit_all_tps_stop=False)

CPU times: user 2.84 s, sys: 15.8 ms, total: 2.85 s
Wall time: 2.84 s


In [47]:
%%time
gtpm = GlobalTopPM(
    weighting_scheme = 'X2',
    similarity_function='dice',
    similarity_threshold=0.1,
    attributes = ['description', 'name']
)

gtpm_pairs_graph = gtpm.predict(data, cleaned_blocks, budget = 5000, algorithm='HB', tqdm_disable=True, emit_all_tps_stop=False)

TypeError: __init__() got an unexpected keyword argument 'w_scheme'

In [15]:
%%time
gpsnm = GlobalPSNM(
    weighting_scheme = 'ID',
    similarity_function='dice',
    similarity_threshold=0.1,
    attributes = ['description', 'name']
)

gpsnm_pairs_graph = gpsnm.predict(data, cleaned_blocks, budget = 5000, algorithm='HB', tqdm_disable=True, emit_all_tps_stop=False)

NameError: name 'GlobalPSNM' is not defined

In [16]:
%%time
lpsnm = LocalPSNM(
    weighting_scheme = 'ID',
    similarity_function='dice',
    similarity_threshold=0.1,
    attributes = ['description', 'name']
)

lpsnm_pairs_graph = lpsnm.predict(data, cleaned_blocks, budget = 5000, algorithm='HB', tqdm_disable=True, emit_all_tps_stop=False)

NameError: name 'LocalPSNM' is not defined

In [17]:
%%time
ennbpm = EmbeddingsNNBPM(
    language model = 'sminilm',
    similarity_search = 'faiss',
    similarity_function = 'dice',
    tokenizer = 'white_space_tokenizer',
    similarity_threshold = 0.1
)
ennbpm_pairs_graph = ennbpm.predict(data, cleaned_blocks, budget = 5000, algorithm='HB', tqdm_disable=True, emit_all_tps_stop=False)

NameError: name 'EmbeddingsNNBPM' is not defined

In [18]:
%%time
rpm = RandomPM(
    similarity_function='dice',
    similarity_threshold=0.1,
    attributes = ['description', 'name']
)

rpm_pairs_graph = rpm.predict(data, cleaned_blocks, budget = 5000, algorithm='HB', tqdm_disable=True, emit_all_tps_stop=False)

NameError: name 'RandomPM' is not defined

In [221]:
%%time
pesm = PESM(
    weighting_scheme = 'X2',
    similarity_function='dice',
    similarity_threshold=0.1,
    attributes = ['description', 'name']
)

pesm_pairs_graph = pesm.predict(data, cleaned_blocks, budget = 5000, algorithm='HB', tqdm_disable=True, emit_all_tps_stop=False)

CPU times: user 5.28 s, sys: 7.93 ms, total: 5.29 s
Wall time: 5.29 s


In [225]:
%%time
whpm = WhooshPM(
    similarity_function='TF-IDF',
    similarity_threshold=0.1,
    attributes = ['description', 'name']
)

whpm_graph = whpm.predict(data, cleaned_blocks, budget = 5000, algorithm='HB', tqdm_disable=True, emit_all_tps_stop=False)

Applying TF-IDF Similarity Function
CPU times: user 7.47 s, sys: 35.9 ms, total: 7.51 s
Wall time: 7.55 s


In [58]:
matchers_data = []

# matchers_data.append(("Random", rpm))
# matchers_data.append(("Local Top", ltpm))
# matchers_data.append(("Global Top", gtpm))
# matchers_data.append(("GSN", gpsnm))
# matchers_data.append(("ES-BFS", pesm))
# matchers_data.append(("ENNBPM-HB", ennbpm))
# matchers_data.append(("WHOOSH-BFS", whpm))
matchers_data.append(("Inorder LTPM", inorder_ltpm))
matchers_data.append(("Reverse LTPM", reverse_ltpm))
matchers_data.append(("Bilateral LTPM", bilateral_ltpm))


In [62]:
%%time
progressive_matchers_evaluator = Evaluation(data)
progressive_matchers_evaluator.evaluate_auc_roc(matchers_data = matchers_data, proportional = False)

2475.0000000000005
<class 'str'>
<class 'str'>


TypeError: '<' not supported between instances of 'str' and 'int'

In [15]:
# After supplying the progressive matchers to AUC ROC evaluation
# You can easily retrieve information about the method's performance using corresponding functions
# You can get the total number of emissions, the final cumulative recall and the normalized AUC

# Example for Entity Scheduling with BFS Method:
print(f'Total Emissions: {pesm.get_total_emissions()}')
print(f'Cumulative Recall: {pesm.get_cumulative_recall()}')
print(f'Normalized AUC: {pesm.get_normalized_auc()}')

NameError: name 'pesm' is not defined

In [18]:
print(f'Total Emissions: {ennbpm.get_total_emissions()}')
print(f'Cumulative Recall: {ennbpm.get_cumulative_recall()}')
print(f'Normalized AUC: {ennbpm.get_normalized_auc()}')

Total Emissions: 106443
Cumulative Recall: 1.0
Normalized AUC: 0.8425875763167529


In [172]:
print(f'Total Emissions: {ennbpm.get_total_emissions()}')
print(f'Cumulative Recall: {ennbpm.get_cumulative_recall()}')
print(f'Normalized AUC: {ennbpm.get_normalized_auc()}')

Total Emissions: 106117
Cumulative Recall: 0.9990706319702602
Normalized AUC: 0.8208217202276312


In [6]:
from pyjedai.workflow import ProgressiveWorkFlow

SyntaxError: invalid syntax (workflow.py, line 362)