<div align="center"> 
    <hr>
  <font size="6"><b>Dev | Clean-Clean Entity Resolution Notebook</b> </font>
   <hr>
</div>

In [1]:
!python --version

Python 3.9.16


In [2]:
import os
import sys
import pandas as pd
import networkx
from networkx import draw, Graph
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
from pyjedai.utils import (
    text_cleaning_method,
    print_clusters,
    print_blocks,
    print_candidate_pairs
)
from pyjedai.evaluation import Evaluation
from pyjedai.datamodel import Data
d1 = pd.read_csv("../data/ccer/D2/abt.csv", sep='|', engine='python', na_filter=False).astype(str)
d2 = pd.read_csv("../data/ccer/D2/buy.csv", sep='|', engine='python', na_filter=False).astype(str)

data = Data(
    dataset_1=d1,
    attributes_1=['id','name','description'],
    id_column_name_1='id',
    dataset_2=d2,
    attributes_2=['id','name','description'],
    id_column_name_2='id',
)

[nltk_data] Downloading package stopwords to /home/jm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Block Building

In [3]:
from pyjedai.block_building import (
    StandardBlocking,
    QGramsBlocking,
    ExtendedQGramsBlocking,
    SuffixArraysBlocking,
    ExtendedSuffixArraysBlocking,
)

from pyjedai.vector_based_blocking import EmbeddingsNNBlockBuilding

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
qgb = SuffixArraysBlocking()
blocks = qgb.build_blocks(data, attributes_1=['name'], attributes_2=['name'])

Suffix Arrays Blocking: 100%|██████████| 2152/2152 [00:00<00:00, 15218.71it/s]


In [5]:
qgb.report()

Method name: Suffix Arrays Blocking
Method info: Creates one block for every suffix that appears in the attribute value tokens of at least two entities.
Parameters: 
	Suffix length: 6
	Maximum Block Size: 53
Attributes from D1:
	name
Attributes from D2:
	name
Runtime: 0.1423 seconds


#### EmbeddingsNNBlockBuilding

In [6]:
emb = EmbeddingsNNBlockBuilding(
    vectorizer='bert',
    similarity_search='faiss'
)
blocks = emb.build_blocks(data)

Building blocks via Embeddings-NN Block Building [bert, faiss]


Embeddings-NN Block Building [bert, faiss]:   0%|          | 0/2152 [00:00<?, ?it/s]

Device selected:  cuda


Embeddings-NN Block Building [bert, faiss]:  51%|█████     | 1088/2152 [00:10<00:09, 114.36it/s]

Vector size:  (1076, 768)


Embeddings-NN Block Building [bert, faiss]: 100%|██████████| 2152/2152 [00:20<00:00, 106.21it/s]

Saving embeddings...
Saving file:  .embeddings/bert_d1.npy
Saving file:  .embeddings/bert_d2.npy





In [7]:
Evaluation(data).report(blocks, emb.method_configuration())

AttributeError: Can not proceed to evaluation without a ground-truth file. Data object has not been initialized with the ground-truth file

# Block Cleaning

In [None]:
from pyjedai.block_cleaning import BlockFiltering

In [None]:
bf = BlockFiltering(ratio=0.8)
filtered_blocks = bf.process(blocks, data, tqdm_disable=False)

# Comparison Cleaning


In [None]:
from pyjedai.block_cleaning import BlockPurging

In [None]:
cbbp = BlockPurging()
cleaned_blocks = cbbp.process(filtered_blocks, data, tqdm_disable=False)

In [None]:
cbbp.report()

## Meta Blocking

In [None]:
from pyjedai.comparison_cleaning import (
    WeightedEdgePruning,
    WeightedNodePruning,
    CardinalityEdgePruning,
    CardinalityNodePruning,
    BLAST,
    ReciprocalCardinalityNodePruning,
    ReciprocalWeightedNodePruning,
    ComparisonPropagation
)

In [None]:
wep = CardinalityEdgePruning(weighting_scheme='X2')
candidate_pairs_blocks = wep.process(filtered_blocks, data, tqdm_disable=True)

# Entity Matching

It compares pairs of entity profiles, associating every pair with a similarity in [0,1]. Its output comprises the similarity graph, i.e., an undirected, weighted graph where the nodes correspond to entities and the edges connect pairs of compared entities.

In [None]:
from pyjedai.matching import EntityMatching

In [None]:
EM = EntityMatching(
    metric='dice',
    similarity_threshold=0.4,
    attributes = ['description', 'name']
)

pairs_graph = EM.predict(candidate_pairs_blocks, data, tqdm_disable=True)

In [None]:
draw(pairs_graph)

# Entity Clustering

It takes as input the similarity graph produced by Entity Matching and partitions it into a set of equivalence clusters, with every cluster corresponding to a distinct real-world object.

In [None]:
from pyjedai.clustering import ConnectedComponentsClustering, UniqueMappingClustering

In [None]:
ccc = ConnectedComponentsClustering()
clusters = ccc.process(pairs_graph)

In [None]:
ccc.report()

In [None]:
from pyjedai.clustering import ConnectedComponentsClustering, UniqueMappingClustering
umc = UniqueMappingClustering()
clusters = umc.process(pairs_graph)