<div align="center"> 
    <hr>
  <font size="6"><b>Dev | Dirty Entity Resolution Notebook</b> </font>
   <hr>
</div>

In [1]:
import os
import sys
import pandas as pd
import networkx
from networkx import (
    draw,
    DiGraph,
    Graph,
)

%load_ext autoreload
%autoreload 2
%reload_ext autoreload
from pyjedai.utils import print_clusters, print_blocks, print_candidate_pairs
from pyjedai.evaluation import Evaluation
from pyjedai.datamodel import Data

# Read the Cora records and their ground truth. Both files are pipe-separated;
# the ground-truth file is read without a header row.
d1 = pd.read_csv("./data/cora/cora.csv", sep='|')
gt = pd.read_csv("./data/cora/cora_gt.csv", sep='|', header=None)

# Only dataset_1 is supplied; `attr` is the column list passed as attributes_1.
attr = ['Entity Id', 'author', 'title']
data = Data(dataset_1=d1, id_column_name_1='Entity Id', ground_truth=gt, attributes_1=attr)

data.process()

### Block Building

In [6]:
from pyjedai.block_building import (
    StandardBlocking,
    QGramsBlocking,
    SuffixArraysBlocking,
    ExtendedSuffixArraysBlocking,
    ExtendedQGramsBlocking
)

In [7]:
# Build blocks with StandardBlocking, constructed without any arguments.
blocks = StandardBlocking().build_blocks(data)

Standard Blocking:   0%|          | 0/1295 [00:00<?, ?it/s]

In [8]:
# Rebinds `blocks` with the Q-grams blocking result (qgrams=2).
blocks = QGramsBlocking(qgrams=2).build_blocks(data)

Q-Grams Blocking:   0%|          | 0/1295 [00:00<?, ?it/s]

In [9]:
# Rebinds `blocks` with the suffix-arrays blocking result (suffix_length=2).
blocks = SuffixArraysBlocking(suffix_length=2).build_blocks(data)

Suffix Arrays Blocking:   0%|          | 0/1295 [00:00<?, ?it/s]

In [10]:
# Rebinds `blocks` with the extended suffix-arrays blocking result (suffix_length=2).
blocks = ExtendedSuffixArraysBlocking(suffix_length=2).build_blocks(data)

Extended Suffix Arrays Blocking:   0%|          | 0/1295 [00:00<?, ?it/s]

In [11]:
Evaluation(data).report(blocks)

#  Evaluation 
---
Scores:
	Precision:      1.75% 
	Recall:        48.14%
	F1-score:       3.39%
Classification report:
	True positives: 8273
	False positives: 463295
	True negatives: 365659
	False negatives: 8911
	Total comparisons: 471568
---


#### EmbeddingsNNBlockBuilding

In [12]:
from pyjedai.vector_based_blocking import EmbeddingsNNBlockBuilding

# Embeddings-NN block building with the 'bert' vectorizer and 'faiss' similarity search.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so `blocks`
# may still hold the previous block-building result — confirm before relying on it.
emb = EmbeddingsNNBlockBuilding(vectorizer='bert', similarity_search='faiss')
blocks = emb.build_blocks(data)

Embeddings-NN Block Building:   0%|          | 0/1295 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
Evaluation(data).report(blocks, emb.method_configuration())

### Block Cleaning

In [12]:
from pyjedai.block_cleaning import (
    BlockFiltering
)

In [13]:
# Apply Block Filtering (ratio=0.9) to `blocks`; the result is kept under a separate name.
filtered_blocks = BlockFiltering(ratio=0.9).process(blocks, data)

Block Filtering:   0%|          | 0/3 [00:00<?, ?it/s]

In [14]:
Evaluation(data).report(filtered_blocks)

#  Evaluation 
---
Scores:
	Precision:      2.09% 
	Recall:        46.45%
	F1-score:       4.00%
Classification report:
	True positives: 7982
	False positives: 374206
	True negatives: 454457
	False negatives: 9202
	Total comparisons: 382188
---


### Comparison Cleaning - Meta Blocking

In [15]:
from pyjedai.block_cleaning import (
    BlockPurging
)

In [16]:
# Apply Block Purging (smoothing_factor=0.008). Note the input is `blocks`,
# not `filtered_blocks`.
cleaned_blocks = BlockPurging(smoothing_factor=0.008).process(blocks, data)

Block Purging:   0%|          | 0/8088 [00:00<?, ?it/s]

In [17]:
Evaluation(data).report(cleaned_blocks)

#  Evaluation 
---
Scores:
	Precision:      2.65% 
	Recall:         0.52%
	F1-score:       0.87%
Classification report:
	True positives: 89
	False positives: 3272
	True negatives: 817498
	False negatives: 17095
	Total comparisons: 3361
---


In [18]:
from pyjedai.comparison_cleaning import (
    WeightedEdgePruning,
    WeightedNodePruning,
    CardinalityEdgePruning,
    CardinalityNodePruning,
    BLAST,
    ReciprocalCardinalityNodePruning,
    # ReciprocalCardinalityWeightPruning,
    ComparisonPropagation
)

In [19]:
# Weighted Edge Pruning over the filtered blocks, using the 'CBS' weighting scheme.
candidate_pairs_blocks = WeightedEdgePruning(weighting_scheme='CBS').process(filtered_blocks, data)

Weighted Edge Pruning:   0%|          | 0/1295 [00:00<?, ?it/s]

In [20]:
# Weighted Node Pruning over the filtered blocks, using the 'CBS' weighting scheme.
# Rebinds `candidate_pairs_blocks`.
candidate_pairs_blocks = WeightedNodePruning(weighting_scheme='CBS').process(filtered_blocks, data)

# TODO: in one case the set of valid entities is empty and the run crashed; decide
# how to handle this case (the Java implementation doesn't handle it).

Weighted Node Pruning:   0%|          | 0/1295 [00:00<?, ?it/s]

In [21]:
# Cardinality Edge Pruning over the filtered blocks, using the 'CBS' weighting scheme.
# Rebinds `candidate_pairs_blocks`.
candidate_pairs_blocks = CardinalityEdgePruning(weighting_scheme='CBS').process(filtered_blocks, data)

# TODO: in one case the set of valid entities is empty and the run crashed; decide
# how to handle this case (the Java implementation doesn't handle it).

Cardinality Edge Pruning:   0%|          | 0/1295 [00:00<?, ?it/s]

In [22]:
# Cardinality Node Pruning over the filtered blocks, using the 'JS' weighting scheme.
# Rebinds `candidate_pairs_blocks`.
candidate_pairs_blocks = CardinalityNodePruning(weighting_scheme='JS').process(filtered_blocks, data)

# TODO: in one case the set of valid entities is empty and the run crashed; decide
# how to handle this case (the Java implementation doesn't handle it).

Cardinality Node Pruning:   0%|          | 0/1295 [00:00<?, ?it/s]

In [23]:
# BLAST over the filtered blocks, using the 'JS' weighting scheme.
# Rebinds `candidate_pairs_blocks`.
candidate_pairs_blocks = BLAST(weighting_scheme='JS').process(filtered_blocks, data)

# TODO: in one case the set of valid entities is empty and the run crashed; decide
# how to handle this case (the Java implementation doesn't handle it).

BLAST:   0%|          | 0/1295 [00:00<?, ?it/s]

In [24]:
# Reciprocal Cardinality Node Pruning over the filtered blocks, using the 'JS'
# weighting scheme. Rebinds `candidate_pairs_blocks`.
candidate_pairs_blocks = ReciprocalCardinalityNodePruning(weighting_scheme='JS').process(filtered_blocks, data)

# TODO: in one case the set of valid entities is empty and the run crashed; decide
# how to handle this case (the Java implementation doesn't handle it).

Reciprocal Cardinality Node Pruning:   0%|          | 0/1295 [00:00<?, ?it/s]

In [25]:
# Comparison Propagation, constructed without arguments. Note the input is the
# unfiltered `blocks`, not `filtered_blocks`. Rebinds `candidate_pairs_blocks`.
comparison_propagation = ComparisonPropagation()
candidate_pairs_blocks = comparison_propagation.process(blocks, data)

# TODO: in one case the set of valid entities is empty and the run crashed; decide
# how to handle this case (the Java implementation doesn't handle it).

Comparison Propagation:   0%|          | 0/1295 [00:00<?, ?it/s]

In [26]:
Evaluation(data).report(candidate_pairs_blocks)

#  Evaluation 
---
Scores:
	Precision:     14.80% 
	Recall:        48.14%
	F1-score:      22.64%
Classification report:
	True positives: 8273
	False positives: 47632
	True negatives: 781322
	False negatives: 8911
	Total comparisons: 55905
---


### Entity Matching

In [27]:
from pyjedai.matching import EntityMatching

In [28]:
# Attributes for matching can be given as a plain list of columns,
#     attr = ['author', 'title']
# or with weights, as a dict keyed by column (used below).
# `attr` only takes effect if `attributes=attr` is uncommented in the call.
attr = {
    'author': 0.6,
    'title': 0.4,
}

EM = EntityMatching(
    metric='jaccard',
    # Trailing comma so the optional arguments below can be uncommented as-is.
    similarity_threshold=0.5,
    # embedings=None, # gensim
    # attributes=attr,
    # qgram=2 # for ngram metric or jaccard
)

# pairs_graph = EM.predict(blocks, data)

In [None]:
pairs_graph = EM.predict(filtered_blocks, data)

Entity Matching (jaccard):   0%|          | 0/3986 [00:00<?, ?it/s]

In [None]:
# Weighted attribute dict; only consumed if `attributes=attr` is uncommented below.
attr = {'author': 0.6, 'title': 0.4}

# Fresh matcher with metric='jaccard' and similarity_threshold=0.5.
EM = EntityMatching(
    metric='jaccard',
    similarity_threshold=0.5
    # embedings=None, # gensim
    # attributes=attr,
    # qgram=2 # for ngram metric or jaccard
)

# Predict over the comparison-cleaning output; rebinds `pairs_graph`.
pairs_graph = EM.predict(candidate_pairs_blocks, data)

In [None]:
draw(pairs_graph)

In [None]:
e = Evaluation(data)
e.report(pairs_graph)

### Entity Clustering

In [None]:
from pyjedai.clustering import ConnectedComponentsClustering

In [None]:
clusters = ConnectedComponentsClustering().process(pairs_graph)

In [None]:
# Evaluate the clustering output. `pairs_graph` was already reported before
# clustering, so this cell reports `clusters` instead.
e = Evaluation(data)
e.report(clusters)

In [None]:
e.confusion_matrix()

# Workflow 2

### Similarity Joins

In [None]:
from pyjedai.joins import SchemaAgnosticΕJoin, TopKSchemaAgnosticJoin

In [None]:
# Schema-agnostic join with threshold 0.5: jaccard metric over 2-gram multisets.
g = SchemaAgnosticΕJoin(
    threshold=0.5,
    metric='jaccard',
    tokenization='qgrams_multiset',
    qgrams=2,
).fit(data)

In [None]:
e = Evaluation(data)
e.report(g)

In [None]:
# Top-K schema-agnostic join (K=20): jaccard metric over 3-grams. Rebinds `g`.
g = TopKSchemaAgnosticJoin(
    K=20,
    metric='jaccard',
    tokenization='qgrams',
    qgrams=3,
).fit(data)

In [None]:
draw(g)

In [None]:
e = Evaluation(data)
e.report(g)

### Entity Clustering

In [None]:
from pyjedai.clustering import ConnectedComponentsClustering

In [None]:
clusters = ConnectedComponentsClustering().process(g)

In [None]:
e = Evaluation(data)
e.report(clusters)

In [None]:
e.confusion_matrix()