<div align="center"> 
    <hr>
  <font size="6"><b>Dev | Supervised with DL and pyTorch Embeddings ER Notebook</b> </font>
   <hr>
</div>

In [8]:
import os
import sys
import pandas as pd
import networkx
from networkx import draw, Graph
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
from pyjedai.utils import (
    text_cleaning_method,
    print_clusters,
    print_blocks,
    print_candidate_pairs
)
from pyjedai.evaluation import Evaluation, write
from pyjedai.datamodel import Data
# Directory holding abt.csv, buy.csv and gt.csv. Change this single constant to
# point the notebook at another copy of the dataset.
DATA_DIR = "./data/ccer/D2"
# Reader options shared by all three pipe-separated files.
CSV_OPTIONS = dict(sep='|', engine='python')

# Entity collections: na_filter=False turns off pandas' missing-value detection,
# and astype(str) casts every column to string.
d1 = pd.read_csv(os.path.join(DATA_DIR, "abt.csv"), na_filter=False, **CSV_OPTIONS).astype(str)
d2 = pd.read_csv(os.path.join(DATA_DIR, "buy.csv"), na_filter=False, **CSV_OPTIONS).astype(str)
# The ground truth is read with pandas' default NA handling and inferred dtypes.
gt = pd.read_csv(os.path.join(DATA_DIR, "gt.csv"), **CSV_OPTIONS)

# Bundle both datasets and the ground truth into the pyJedAI data model; the same
# attribute list and id column are used for each side.
data = Data(
    dataset_1=d1,
    attributes_1=['id','name','description'],
    id_column_name_1='id',
    dataset_2=d2,
    attributes_2=['id','name','description'],
    id_column_name_2='id',
    ground_truth=gt,
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# Show the `entities` attribute of `data` as this cell's output.
data.entities

Unnamed: 0,id,name,description,price
0,0,Sony Turntable - PSLX350H,Sony Turntable - PSLX350H/ Belt Drive System/ ...,
1,1,Bose Acoustimass 5 Series III Speaker System -...,Bose Acoustimass 5 Series III Speaker System -...,399
2,2,Sony Switcher - SBV40S,Sony Switcher - SBV40S/ Eliminates Disconnecti...,49
3,3,Sony 5 Disc CD Player - CDPCE375,Sony 5 Disc CD Player- CDPCE375/ 5 Disc Change...,
4,4,Bose 27028 161 Bookshelf Pair Speakers In Whit...,Bose 161 Bookshelf Speakers In White - 161WH/ ...,158
...,...,...,...,...
2147,1071,Sony VAIO FW378J/B Notebook - VGNFW378J/B,Intel Centrino 2 Core 2 Duo P8600 2.4GHz - 16....,
2148,1072,Sennheiser CX380 Sennheiser CX 380 Sport II Gr...,,
2149,1073,IWORK 09 RETAIL-INT DVD - MB942Z/A,,
2150,1074,IWORK 09 FAMILY PACK-INT DVD - MB943Z/A,,


# Block Building

In [12]:
from pyjedai.block_building import (
    StandardBlocking,
    QGramsBlocking,
    ExtendedQGramsBlocking,
    SuffixArraysBlocking,
    ExtendedSuffixArraysBlocking,
)

from pyjedai.vector_based_blocking import EmbeddingsNNBlockBuilding

  from tqdm.autonotebook import tqdm


In [13]:
# Standard Blocking restricted to the `name` attribute of both datasets.
# The variable is called `qgb` although it holds a StandardBlocking instance;
# the name is kept because later cells refer to it.
qgb = StandardBlocking()
blocks = qgb.build_blocks(
    data,
    attributes_1=['name'],
    attributes_2=['name'],
)

Standard Blocking:   0%|          | 0/2152 [00:00<?, ?it/s]

In [14]:
# Print the blocking method's summary: name, info, parameters, attributes, runtime.
qgb.report()

Method name: Standard Blocking
Method info: Creates one block for every token in the attribute values of at least two entities.
Parameters: Parameter-Free method
Attributes from D1:
	name
Attributes from D2:
	name
Runtime: 0.2429 seconds


In [5]:
# Score the blocks built above; the report lists precision, recall and F1.
# NOTE(review): the saved output of this cell is titled "Suffix Arrays Blocking" and
# its execution count (In [5]) is out of order, while `qgb` is a StandardBlocking
# instance. The output looks stale; refresh it with Restart Kernel -> Run All.
e = Evaluation(data)
e.report(blocks, qgb.method_configuration())

 Suffix Arrays Blocking Evaluation 
---
Method name: Suffix Arrays Blocking
Parameters: 
	Suffix length: 6
	Maximum Block Size: 53
Runtime: 0.2636 seconds
Scores:
	Precision:      1.41% 
	Recall:        97.03%
	F1-score:       2.78%
---


# Comparison Cleaning


In [6]:
from pyjedai.block_cleaning import BlockPurging

In [8]:
# Block Purging with its default settings, applied to the blocks from the
# block-building step. The progress bar is left enabled.
cbbp = BlockPurging()
cleaned_blocks = cbbp.process(
    blocks,
    data,
    tqdm_disable=False,
)

Block Purging:   0%|          | 0/5908 [00:00<?, ?it/s]

In [9]:
# Print the Block Purging summary: method info, parameters and runtime.
cbbp.report()

Method name: Block Purging
Method info: Discards the blocks exceeding a certain number of comparisons.
Parameters: 
	Smoothing factor: 1.025
	Max Comparisons per Block: 702.0
Runtime: 0.0715 seconds


In [10]:
# Score the purged blocks; the report lists precision, recall and F1.
e = Evaluation(data)
e.report(cleaned_blocks, cbbp.method_configuration())

 Block Purging Evaluation 
---
Method name: Block Purging
Parameters: 
	Smoothing factor: 1.025
	Max Comparisons per Block: 702.0
Runtime: 0.0715 seconds
Scores:
	Precision:      1.41% 
	Recall:        97.03%
	F1-score:       2.78%
---


## Meta Blocking

In [11]:
from pyjedai.comparison_cleaning import (
    WeightedEdgePruning,
    WeightedNodePruning,
    CardinalityEdgePruning,
    CardinalityNodePruning,
    BLAST,
    ReciprocalCardinalityNodePruning,
    ReciprocalWeightedNodePruning,
    ComparisonPropagation
)

In [13]:
# Meta-blocking step: Cardinality Edge Pruning with the 'X2' weighting scheme,
# run on the purged blocks with the progress bar turned off.
# The variable is called `wep` although it holds a CardinalityEdgePruning
# instance; the name is kept because the next cell refers to it.
wep = CardinalityEdgePruning(weighting_scheme='X2')
candidate_pairs_blocks = wep.process(
    cleaned_blocks,
    data,
    tqdm_disable=True,
)

In [14]:
# Score the candidate pairs kept by the pruning step; the report lists precision, recall and F1.
Evaluation(data).report(candidate_pairs_blocks, wep.method_configuration())

 Cardinality Edge Pruning Evaluation 
---
Method name: Cardinality Edge Pruning
Parameters: 
	Node centric: False
	Weighting scheme: X2
Runtime: 2.1133 seconds
Scores:
	Precision:     10.93% 
	Recall:        91.45%
	F1-score:      19.53%
---


In [17]:
# Debug aid: uncomment to inspect the candidate pairs.
# print(candidate_pairs_blocks)

# Supervised DL method for matching based on the candidate pairs

Candidate pairs are stored as a dictionary of the form:

> entity_id_1 -> set(entity_id_x, entity_id_y, entity_id_z)

- __The split process:__ Given a ratio (0.6 + 0.2 + 0.2 = 1.0), all candidate pairs are split into train, test, and validation sets accordingly. For example, 60% of the comparisons are used for training, 20% for testing, and 20% for validation, using PyTorch. Each entity is represented by concatenating its attributes into a single lowercase string.
- After the concatenation, each entity string is transformed into an embedding using pre-trained word and sentence embeddings.
- The next step is to create the vocabulary and afterwards train the model. The model produces a vector of similarities (TODO: confirm), and this vector is used to determine whether each candidate pair is a match or not.

## Data preparation for train, test, validation

In [21]:
!pip install tensorboardX

Collecting tensorboardX
  Downloading tensorboardX-2.5.1-py2.py3-none-any.whl (125 kB)
     ------------------------------------ 125.4/125.4 kB 816.4 kB/s eta 0:00:00
Collecting protobuf<=3.20.1,>=3.8.0
  Downloading protobuf-3.20.1-cp39-cp39-win_amd64.whl (904 kB)
     -------------------------------------- 904.1/904.1 kB 1.5 MB/s eta 0:00:00
Installing collected packages: protobuf, tensorboardX
Successfully installed protobuf-3.20.1 tensorboardX-2.5.1


In [24]:
from pyjedai.supervised_techniques import PretrainedSupervisedER

## PyTorch using BERT-like models and pre-trained word and sentence embeddings

In [33]:
# Create the supervised matcher, configured with model_type='roberta' and
# model_name='roberta-base'.
pser = PretrainedSupervisedER(model_type='roberta', model_name='roberta-base')

In [49]:
# NOTE(review): in the saved run this call fails with
# "TypeError: expected str, bytes or os.PathLike object, not NoneType" right after
# the model is initialized. Presumably a path-like setting is still None; confirm
# against the PretrainedSupervisedER API.
# NOTE(review): the section text says matching is based on the candidate pairs, yet
# `blocks` (the block-building output) is passed here rather than
# `candidate_pairs_blocks`. Verify which input fit() expects.
pser.fit(blocks, data)

Training with 2 labels: ['0', '1']
Initialized roberta-model


TypeError: expected str, bytes or os.PathLike object, not NoneType

In [39]:
# NOTE(review): leftover debugging probe. The saved run raises AttributeError because
# the object has no `entities_d1` attribute; remove this cell or guard it once
# fit() completes successfully.
pser.entities_d1

AttributeError: 'PretrainedSupervisedER' object has no attribute 'entities_d1'