<div align="center"> 
    <hr>
  <font size="6"><b>Dev | Supervised with DL and pyTorch Embeddings ER Notebook</b> </font>
   <hr>
</div>

In [None]:
import os
import sys
import pandas as pd
import networkx
from networkx import draw, Graph
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
from pyjedai.utils import (
    text_cleaning_method,
    print_clusters,
    print_blocks,
    print_candidate_pairs
)
from pyjedai.evaluation import Evaluation, write
from pyjedai.datamodel import Data
d1 = pd.read_csv("./data/ccer/D2/abt.csv", sep='|', engine='python', na_filter=False).astype(str)
d2 = pd.read_csv("./data/ccer/D2/buy.csv", sep='|', engine='python', na_filter=False).astype(str)
gt = pd.read_csv("./data/ccer/D2/gt.csv", sep='|', engine='python')

data = Data(
    dataset_1=d1,
    attributes_1=['id','name','description'],
    id_column_name_1='id',
    dataset_2=d2,
    attributes_2=['id','name','description'],
    id_column_name_2='id',
    ground_truth=gt,
)

# Block Building

In [9]:
from pyjedai.block_building import (
    StandardBlocking,
    QGramsBlocking,
    ExtendedQGramsBlocking,
    SuffixArraysBlocking,
    ExtendedSuffixArraysBlocking,
)

from pyjedai.vector_based_blocking import EmbeddingsNNBlockBuilding

  from tqdm.autonotebook import tqdm


In [10]:
qgb = SuffixArraysBlocking()
blocks = qgb.build_blocks(data, attributes_1=['name'], attributes_2=['name'])

Suffix Arrays Blocking:   0%|          | 0/2152 [00:00<?, ?it/s]

In [5]:
qgb.report()

Method name: Suffix Arrays Blocking
Method info: Creates one block for every suffix that appears in the attribute value tokens of at least two entities.
Parameters: 
	Suffix length: 6
	Maximum Block Size: 53
Attributes from D1:
	name
Attributes from D2:
	name
Runtime: 0.1585 seconds


In [6]:
e = Evaluation(data)
e.report(blocks, qgb.method_configuration())

# Suffix Arrays Blocking Evaluation 
---
Method name: Suffix Arrays Blocking
Parameters: 
	Suffix length: 6
	Maximum Block Size: 53
Runtime: 0.1585 seconds
Scores:
	Precision:      1.41% 
	Recall:        97.03%
	F1-score:       2.78%
Classification report:
	True positives: 1044
	False positives: 73021
	True negatives: 1084723
	False negatives: 32
	Total comparisons: 74065
---


# Comparison Cleaning


In [10]:
from pyjedai.block_cleaning import BlockPurging

In [11]:
cbbp = BlockPurging()
cleaned_blocks = cbbp.process(filtered_blocks, data, tqdm_disable=False)

Block Purging:   0%|          | 0/5820 [00:00<?, ?it/s]

In [12]:
cbbp.report()

Method name: Block Purging
Method info: Discards the blocks exceeding a certain number of comparisons.
Parameters: 
	Smoothing factor: 1.025
	Max Comparisons per Block: 420.0
Runtime: 0.0492 seconds


In [13]:
e = Evaluation(data)
e.report(cleaned_blocks, cbbp.method_configuration())

# Block Purging Evaluation 
---
Method name: Block Purging
Parameters: 
	Smoothing factor: 1.025
	Max Comparisons per Block: 420.0
Runtime: 0.0492 seconds
Scores:
	Precision:      2.80% 
	Recall:        94.52%
	F1-score:       5.45%
---


## Meta Blocking

In [14]:
from pyjedai.comparison_cleaning import (
    WeightedEdgePruning,
    WeightedNodePruning,
    CardinalityEdgePruning,
    CardinalityNodePruning,
    BLAST,
    ReciprocalCardinalityNodePruning,
    ReciprocalWeightedNodePruning,
    ComparisonPropagation
)

In [15]:
wep = CardinalityEdgePruning(weighting_scheme='X2')
candidate_pairs_blocks = wep.process(filtered_blocks, data, tqdm_disable=True)

In [16]:
Evaluation(data).report(candidate_pairs_blocks, wep.method_configuration())

# Cardinality Edge Pruning Evaluation 
---
Method name: Cardinality Edge Pruning
Parameters: 
	Node centric: False
	Weighting scheme: X2
Runtime: 1.1039 seconds
Scores:
	Precision:     13.43% 
	Recall:        89.13%
	F1-score:      23.35%
---


# Supervised DL method for matching based on the candidate pairs

## Data preparation for train, test, validation

## Pytorch using BERT-like models and pre-trained word and sentence embeddings