<div align="center"> 
    <hr>
  <font size="6"><b>Dev | Supervised with DL and pyTorch Embeddings ER Notebook</b> </font>
   <hr>
</div>

In [21]:
import os
import sys
import pandas as pd
import networkx
from networkx import draw, Graph
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
from pyjedai.utils import (
    text_cleaning_method,
    print_clusters,
    print_blocks,
    print_candidate_pairs
)
from pyjedai.evaluation import Evaluation, write
from pyjedai.datamodel import Data
# Abt / Buy product tables (pipe-separated). na_filter=False keeps empty cells
# as empty strings instead of NaN, and every column is cast to str.
d1 = pd.read_csv(
    "./data/ccer/D2/abt.csv",
    sep='|',
    engine='python',
    na_filter=False,
).astype(str)
d2 = pd.read_csv(
    "./data/ccer/D2/buy.csv",
    sep='|',
    engine='python',
    na_filter=False,
).astype(str)

# The ground-truth file is read with pandas' default NA handling and dtypes.
gt = pd.read_csv(
    "./data/ccer/D2/gt.csv",
    sep='|',
    engine='python',
)

# Bundle both sources and the ground truth; 'id' is the id column in each source.
data = Data(
    dataset_1=d1,
    id_column_name_1='id',
    dataset_2=d2,
    id_column_name_2='id',
    ground_truth=gt,
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
# Inspect the entities table exposed by the Data object; judging by the rendered
# output it lists rows from both sources (index 0-2150, with `id` restarting).
data.entities

Unnamed: 0,id,name,description,price
0,0,Sony Turntable - PSLX350H,Sony Turntable - PSLX350H/ Belt Drive System/ ...,
1,1,Bose Acoustimass 5 Series III Speaker System -...,Bose Acoustimass 5 Series III Speaker System -...,399
2,2,Sony Switcher - SBV40S,Sony Switcher - SBV40S/ Eliminates Disconnecti...,49
3,3,Sony 5 Disc CD Player - CDPCE375,Sony 5 Disc CD Player- CDPCE375/ 5 Disc Change...,
4,4,Bose 27028 161 Bookshelf Pair Speakers In Whit...,Bose 161 Bookshelf Speakers In White - 161WH/ ...,158
...,...,...,...,...
2147,1071,Sony VAIO FW378J/B Notebook - VGNFW378J/B,Intel Centrino 2 Core 2 Duo P8600 2.4GHz - 16....,
2148,1072,Sennheiser CX380 Sennheiser CX 380 Sport II Gr...,,
2149,1073,IWORK 09 RETAIL-INT DVD - MB942Z/A,,
2150,1074,IWORK 09 FAMILY PACK-INT DVD - MB943Z/A,,


# Block Building

In [23]:
from pyjedai.block_building import (
    StandardBlocking,
    QGramsBlocking,
    ExtendedQGramsBlocking,
    SuffixArraysBlocking,
    ExtendedSuffixArraysBlocking,
)

from pyjedai.vector_based_blocking import EmbeddingsNNBlockBuilding

In [24]:
# Build the initial blocks with Standard Blocking.
# NOTE(review): the name `qgb` suggests Q-Grams blocking, but the object is a
# StandardBlocking instance; the name is kept because later cells reference it.
qgb = StandardBlocking()
blocks = qgb.build_blocks(data)

Standard Blocking:   0%|          | 0/2152 [00:00<?, ?it/s]

In [25]:
# Print the blocking method's summary (name, info, parameters, attributes, runtime).
qgb.report()

Method name: Standard Blocking
Method info: Creates one block for every token in the attribute values of at least two entities.
Parameters: Parameter-Free method
Attributes from D1:
	id, name, description, price
Attributes from D2:
	id, name, description, price
Runtime: 0.3812 seconds


In [26]:
# Score the raw blocks with pyjedai's Evaluation; the report prints
# precision, recall and F1-score for the blocking step.
e = Evaluation(data)
e.report(blocks, qgb.method_configuration())

 Standard Blocking Evaluation 
---
Method name: Standard Blocking
Parameters: 
Runtime: 0.3812 seconds
Scores:
	Precision:      0.12% 
	Recall:        99.81%
	F1-score:       0.24%
---


# Comparison Cleaning


In [27]:
from pyjedai.block_cleaning import BlockPurging

In [28]:
# Apply Block Purging to the blocks built above.
# tqdm_disable=False keeps the progress bar visible while processing.
cbbp = BlockPurging()
cleaned_blocks = cbbp.process(blocks, data, tqdm_disable=False)

Block Purging:   0%|          | 0/4264 [00:00<?, ?it/s]

In [29]:
# Print the Block Purging summary (method info, parameters, runtime).
cbbp.report()

Method name: Block Purging
Method info: Discards the blocks exceeding a certain number of comparisons.
Parameters: 
	Smoothing factor: 1.025
	Max Comparisons per Block: 13920.0
Runtime: 0.0991 seconds


In [30]:
# Score the purged blocks; the report prints precision, recall and F1-score.
e = Evaluation(data)
e.report(cleaned_blocks, cbbp.method_configuration())

 Block Purging Evaluation 
---
Method name: Block Purging
Parameters: 
	Smoothing factor: 1.025
	Max Comparisons per Block: 13920.0
Runtime: 0.0991 seconds
Scores:
	Precision:      0.12% 
	Recall:        99.81%
	F1-score:       0.24%
---


## Meta Blocking

In [31]:
from pyjedai.comparison_cleaning import (
    WeightedEdgePruning,
    WeightedNodePruning,
    CardinalityEdgePruning,
    CardinalityNodePruning,
    BLAST,
    ReciprocalCardinalityNodePruning,
    ReciprocalWeightedNodePruning,
    ComparisonPropagation
)

In [32]:
# Meta-blocking step: Cardinality Edge Pruning with the 'X2' weighting scheme,
# run on the purged blocks with the progress bar disabled.
# NOTE(review): `wep` reads like WeightedEdgePruning, but this is a
# CardinalityEdgePruning instance; the name is kept because the next cell uses it.
wep = CardinalityEdgePruning(weighting_scheme='X2')
candidate_pairs_blocks = wep.process(cleaned_blocks, data, tqdm_disable=True)

In [33]:
# Score the candidate pairs produced by the pruning step.
Evaluation(data).report(candidate_pairs_blocks, wep.method_configuration())

 Cardinality Edge Pruning Evaluation 
---
Method name: Cardinality Edge Pruning
Parameters: 
	Node centric: False
	Weighting scheme: X2
Runtime: 62.4234 seconds
Scores:
	Precision:      5.05% 
	Recall:        96.28%
	F1-score:       9.60%
---


In [34]:
# print(candidate_pairs_blocks)

# Supervised DL method for matching based on the candidate pairs

Candidate pairs are in form (dictionary): 

> entity_id_1 -> set(entity_id_x, entity_id_y, entity_id_z)

- __The split process:__ Given a ratio (0.6 + 0.2 + 0.2 = 1.0), all candidates will be split into train, test and validation sets accordingly, after concatenating the attributes of each entity into a lowercase string. For example, 60% of the comparisons will be used for training, 20% for testing and 20% for validation, using PyTorch.
- After the concatenation each string entity will be transformed into an embedding using pre-trained word and sentence embeddings.
- The next step will be to create the vocabulary and afterwards train the model. The model will produce a vector of similarities (**TODO:** confirm what the model actually outputs), and this vector will be used to determine whether a candidate pair is a match or not.

## Data preparation for train, test, validation

In [35]:
!pip install tensorboardX



In [36]:
from pyjedai.supervised_techniques import PretrainedSupervisedER

## Pytorch using BERT-like models and pre-trained word and sentence embeddings

In [37]:
# Instantiate the supervised matcher with a RoBERTa model ('roberta-base').
pser = PretrainedSupervisedER(model_type='roberta', model_name='roberta-base')

In [None]:
# Train the matcher. This is slow in the recorded run: the initial evaluation
# took about 8 minutes and each training iteration roughly 80-90 seconds.
# NOTE(review): `blocks` is the raw Standard Blocking output, while the section
# text describes training on candidate pairs; `candidate_pairs_blocks` may have
# been intended here — confirm against PretrainedSupervisedER.fit.
pser.fit(blocks, data)

Training with 2 labels: ['0', '1']
1074
1074
4621422
Ratio of true positives to true negatives:  0.00046479200557750407
Ratio of true positives to all pairs:  1.0
#pairs_count:  1074 
#diff_pairs:  1074
0  ->  644
0  ->  644
644  ->  858
644  ->  858
859  ->  1074
859  ->  1074
Total:  2146
Initialized roberta-model
Loaded 1288 training examples
Built optimizer: AdamW (
Parameter Group 0
    betas: (0.9, 0.999)
    correct_bias: True
    eps: 1e-08
    initial_lr: 2e-05
    lr: 2e-05
    weight_decay: 0.0

Parameter Group 1
    betas: (0.9, 0.999)
    correct_bias: True
    eps: 1e-08
    initial_lr: 2e-05
    lr: 2e-05
    weight_decay: 0.0
)
Loaded and initialized evaluation examples 430


Evaluating: 100%|████████████████████████████████████████████████████| 27/27 [07:50<00:00, 17.43s/it]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


***** Eval results after epoch -1 *****
eval_loss: 0.6967910793092515
f1_score: 0.6666666666666666
simple_accuracy: 0.5


Epoch:   0%|                                                                  | 0/15 [00:00<?, ?it/s]

Iteration:   1%|▋                                                   | 1/81 [01:22<1:50:29, 82.87s/it][A
Iteration:   2%|█▎                                                  | 2/81 [03:00<2:00:34, 91.57s/it][A
Iteration:   4%|█▉                                                  | 3/81 [04:19<1:51:21, 85.66s/it][A

In [None]:
# print_blocks is already imported in the first cell; this repeated import is
# redundant but harmless.
from pyjedai.utils import print_blocks

# Dump the raw blocks for inspection.
# NOTE(review): the meaning of the second positional argument (False) is not
# visible here — check print_blocks' signature.
print_blocks(blocks, False)

In [None]:
# Inspect the first dataset as stored on the Data object.
data.dataset_1

In [None]:
# Show the ground-truth pairs ordered by their D1 column.
data.ground_truth.sort_values(by='D1')

In [None]:
# Debug peek at a private attribute (leading underscore) of Data; presumably
# the id mapping for dataset 1 — confirm before relying on it.
data._ids_mapping_1