## Imports & Data Module Definition

Here you can define:


*   Dataset Paths
*   Ground Truth Path
*   Attributes to keep from each dataset
*   ID Column names

In [2]:
import os
import sys
import pandas as pd
import networkx
from networkx import draw, Graph
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
from pyjedai.utils import (
    text_cleaning_method,
    print_clusters,
    print_blocks,
    print_candidate_pairs
)
from pyjedai.evaluation import Evaluation, write
from pyjedai.datamodel import Data

D1_PATH = "data/test/ccer/abt_100.csv"
D2_PATH = "data/test/ccer/buy_100.csv"
GT_PATH = "data/test/ccer/gt_100.csv"

D1_ATTRS = ['id','name','description'] 
D1_ID_COL = 'id'
D2_ATTRS = ['id','name','description'] 
D2_ID_COL = 'id'

d1 = pd.read_csv(D1_PATH, sep='|', engine='python', na_filter=False).astype(str)
d2 = pd.read_csv(D2_PATH, sep='|', engine='python', na_filter=False).astype(str)
gt = pd.read_csv(GT_PATH, sep='|', engine='python')

data = Data(
    dataset_1=d1,
    attributes_1=D1_ATTRS,
    id_column_name_1=D1_ID_COL,
    dataset_2=d2,
    attributes_2=D2_ATTRS,
    id_column_name_2=D2_ID_COL,
    ground_truth=gt,
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## BLOCK CONSTRUCTION

In [3]:
from pyjedai.block_building import (
    StandardBlocking,
    QGramsBlocking,
    ExtendedQGramsBlocking,
    SuffixArraysBlocking,
    ExtendedSuffixArraysBlocking,
)

qgb = QGramsBlocking(qgrams=6)
blocks = qgb.build_blocks(data, attributes_1=['name'], attributes_2=['name'])
qgb.evaluate(blocks, with_classification_report=True)

Q-Grams Blocking:   0%|          | 0/200 [00:00<?, ?it/s]

***************************************************************************************************************************
                                         Μethod:  Q-Grams Blocking
***************************************************************************************************************************
Method name: Q-Grams Blocking
Parameters: 
	Q-Gramms: 6
Runtime: 0.0338 seconds
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Performance:
	Precision:      1.78% 
	Recall:       100.00%
	F1-score:       3.51%
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Classification report:
	True positives: 49
	False positives: 2697
	True negatives: 7303
	False negatives: 0
	Total comparisons: 2746
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


{'Precision %': 1.7844136926438456,
 'Recall %': 100.0,
 'F1 %': 3.5062611806797856,
 'True Positives': 49,
 'False Positives': 2697,
 'True Negatives': 7303,
 'False Negatives': 0}

## RUN THE BELOW IF YOU WANT TO LOWER EXECUTION TIME
Following methods may:
* Lower the amount of blocks
* Lower the cardinality of blocks

This will result in smaller search space, but will lower the number of true positives that will finally be emitted!

In [4]:
from pyjedai.block_cleaning import BlockFiltering

bf = BlockFiltering(ratio=0.8)
filtered_blocks = bf.process(blocks, data, tqdm_disable=False)

Block Filtering:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
from pyjedai.block_cleaning import BlockPurging

cbbp = BlockPurging()
cleaned_blocks = cbbp.process(filtered_blocks, data, tqdm_disable=False)
cbbp.evaluate(cleaned_blocks, with_classification_report=True)

Block Purging:   0%|          | 0/358 [00:00<?, ?it/s]

***************************************************************************************************************************
                                         Μethod:  Block Purging
***************************************************************************************************************************
Method name: Block Purging
Parameters: 
	Smoothing factor: 1.025
	Max Comparisons per Block: 22.0
Runtime: 0.0087 seconds
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Performance:
	Precision:      7.33% 
	Recall:        85.71%
	F1-score:      13.50%
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Classification report:
	True positives: 42
	False positives: 531
	True negatives: 9462
	False negatives: 7
	Total comparisons: 573
────────────────────────────────────────────────────────────────────────────────────────────────────────

{'Precision %': 7.329842931937172,
 'Recall %': 85.71428571428571,
 'F1 %': 13.504823151125404,
 'True Positives': 42,
 'False Positives': 531,
 'True Negatives': 9462,
 'False Negatives': 7}

## ADJUST THE BELOW GLOBAL VARIABLES ACCORDINGLY:

In [6]:
# the total number of pairs that will be emitted
# Final amount may be lower or higher (duplicate emissions have been purged / upper bound for neighborhood budget is taken)
BUDGET = 5000
# When this option is enabled, the context of budget changes
# In most algorithms, budget is ignored
# BUT IN THE CASE OF VECTOR BASED PER - If Below True -> Budget = Size of neighborhood per entity
STOP_EMISSION_WHEN_ALL_TRUE_POSITIVES_FOUND = False
vectorizers = ['word2vec', 'fasttext', 'doc2vec', 'glove', 'bert', 'distilbert', 'roberta', 'xlnet', 'albert', 'smpnet', 'st5', 'sent_glove', 'sdistilroberta', 'sminilm']

## Hybrid Approach

For each vectorizer, we produce the corresponding embeddings.
For each entity, we produce candidate pairs using faiss.
We emit the candidate pairs following the hybrid approach.
Specifically, we sort the entities in descending order of their neighborhood average weight and we emit the top comparison for each entity.
Subsequently, we iterate over all the remaining candidates for current pair in descending order of their similarity score. We move to the next entity, till all have been examined. 

In [7]:
METHOD = 'HB'

hb_matchers = {}

for vectorizer in vectorizers:
  hb_matchers[vectorizer] = EmbeddingsNNBPM(
      budget = BUDGET,
      vectorizer = vectorizer,
      similarity_search = 'faiss'
  )

  print(f"Vectorizer: {vectorizer}")
  hb_matchers[vectorizer].predict(cleaned_blocks, data, tqdm_disable=True, method=METHOD, emit_all_tps_stop=STOP_EMISSION_WHEN_ALL_TRUE_POSITIVES_FOUND)


# Matchers and their vectorizer names are being stored (Vectorizer name will be used as matcher's name in the ROC graph)
hb_matchers_data = []

for vectorizer, matcher in hb_matchers.items():
  hb_matchers_data.append((matcher, vectorizer))

evaluator = Evaluation(data)
# evaluator calculates cumulative recall for each emission, final normalized AUC and displays the ROC graph
evaluator.evaluate_auc_roc(matchers_data = hb_matchers_data, proportional = False)

NameError: name 'EmbeddingsNNBPM' is not defined

## DFS Approach

For each vectorizer, we produce the corresponding embeddings.
For each entity, we produce candidate pairs using faiss.
We emit the candidate pairs following the DFS approach.
Specifically, we sort the entities in descending order of their neighborhood average weight. We iterate over the sorted entities and for each one we emit all its candidates in descending order of their similarity score, before moving to the next entity's neighborhood.

In [None]:
METHOD = 'DFS'

dfs_matchers = {}

for vectorizer in vectorizers:
  dfs_matchers[vectorizer] = EmbeddingsNNBPM(
      budget = BUDGET,
      vectorizer = vectorizer,
      similarity_search = 'faiss'
  )

  print(f"Vectorizer: {vectorizer}")
  dfs_matchers[vectorizer].predict(cleaned_blocks, data, tqdm_disable=True, method=METHOD, emit_all_tps_stop=STOP_EMISSION_WHEN_ALL_TRUE_POSITIVES_FOUND)


# Matchers and their vectorizer names are being stored (Vectorizer name will be used as matcher's name in the ROC graph)
dfs_matchers_data = []

for vectorizer, matcher in dfs_matchers.items():
  dfs_matchers_data.append((matcher, vectorizer))

evaluator = Evaluation(data)
# evaluator calculates cumulative recall for each emission, final normalized AUC and displays the ROC graph
evaluator.evaluate_auc_roc(matchers_data = dfs_matchers_data, proportional = False)

## BFS Approach

For each vectorizer, we produce the corresponding embeddings.
For each entity, we produce candidate pairs using faiss.
We emit the candidate pairs following the DFS approach.
Specifically, we sort the entities in descending order of their neighborhood average weight. We iterate over the sorted entities and for each one we emit its currently top candidate (it won't be considered again). We move to the next entity and we keep iterating over them, till there are no neighbors for any entity.

In [None]:
METHOD = 'BFS'

bfs_matchers = {}

for vectorizer in vectorizers:
  bfs_matchers[vectorizer] = EmbeddingsNNBPM(
      budget = BUDGET,
      vectorizer = vectorizer,
      similarity_search = 'faiss'
  )

  print(f"Vectorizer: {vectorizer}")
  bfs_matchers[vectorizer].predict(cleaned_blocks, data, tqdm_disable=True, method=METHOD, emit_all_tps_stop=STOP_EMISSION_WHEN_ALL_TRUE_POSITIVES_FOUND)


# Matchers and their vectorizer names are being stored (Vectorizer name will be used as matcher's name in the ROC graph)
bfs_matchers_data = []

for vectorizer, matcher in bfs_matchers.items():
  bfs_matchers_data.append((matcher, vectorizer))

evaluator = Evaluation(data)
# evaluator calculates cumulative recall for each emission, final normalized AUC and displays the ROC graph
evaluator.evaluate_auc_roc(matchers_data = bfs_matchers_data, proportional = False)