<div align="center"> 
    <hr>
  <font size="6"><b>Dev | Supervised with DL and pyTorch Embeddings ER Notebook</b> </font>
   <hr>
</div>

In [24]:
import os
import sys
import pandas as pd
import networkx
from networkx import draw, Graph
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
from pyjedai.utils import (
    text_cleaning_method,
    print_clusters,
    print_blocks,
    print_candidate_pairs
)
from pyjedai.evaluation import Evaluation, write
from pyjedai.datamodel import Data
# Parsing options shared by every CSV of this dataset: pipe-delimited files
# read with pandas' Python parsing engine.
_csv_options = dict(sep='|', engine='python')

# The two entity collections are read with NA detection switched off
# (na_filter=False) and then cast to str column-wise.
d1 = pd.read_csv("./data/ccer/D2/abt.csv", na_filter=False, **_csv_options).astype(str)
d2 = pd.read_csv("./data/ccer/D2/buy.csv", na_filter=False, **_csv_options).astype(str)

# The ground-truth file keeps pandas' default NA handling and dtypes.
gt = pd.read_csv("./data/ccer/D2/gt.csv", **_csv_options)

# Bundle both collections and the ground truth; 'id' is the identifier
# column in each collection.
data = Data(dataset_1=d1, id_column_name_1='id', dataset_2=d2, id_column_name_2='id', ground_truth=gt)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Display the entity table held by `data`; the rendered output lists the
# id, name, description and price columns.
data.entities

Unnamed: 0,id,name,description,price
0,0,Sony Turntable - PSLX350H,Sony Turntable - PSLX350H/ Belt Drive System/ ...,
1,1,Bose Acoustimass 5 Series III Speaker System -...,Bose Acoustimass 5 Series III Speaker System -...,399
2,2,Sony Switcher - SBV40S,Sony Switcher - SBV40S/ Eliminates Disconnecti...,49
3,3,Sony 5 Disc CD Player - CDPCE375,Sony 5 Disc CD Player- CDPCE375/ 5 Disc Change...,
4,4,Bose 27028 161 Bookshelf Pair Speakers In Whit...,Bose 161 Bookshelf Speakers In White - 161WH/ ...,158
...,...,...,...,...
2147,1071,Sony VAIO FW378J/B Notebook - VGNFW378J/B,Intel Centrino 2 Core 2 Duo P8600 2.4GHz - 16....,
2148,1072,Sennheiser CX380 Sennheiser CX 380 Sport II Gr...,,
2149,1073,IWORK 09 RETAIL-INT DVD - MB942Z/A,,
2150,1074,IWORK 09 FAMILY PACK-INT DVD - MB943Z/A,,


# Block Building

In [3]:
from pyjedai.block_building import (
    StandardBlocking,
    QGramsBlocking,
    ExtendedQGramsBlocking,
    SuffixArraysBlocking,
    ExtendedSuffixArraysBlocking,
)

from pyjedai.vector_based_blocking import EmbeddingsNNBlockBuilding

  from tqdm.autonotebook import tqdm


In [4]:
# Build the initial blocks with Standard Blocking.
# NOTE(review): the variable is named `qgb` although it holds a StandardBlocking
# instance (not QGramsBlocking); renaming it would also require updating the
# report/evaluate cells that reference it.
qgb = StandardBlocking()
blocks = qgb.build_blocks(data)

Standard Blocking:   0%|          | 0/2152 [00:00<?, ?it/s]

In [5]:
# Print the blocker's summary: method name, description, parameters,
# attributes used from each dataset, and runtime.
qgb.report()

Method name: Standard Blocking
Method info: Creates one block for every token in the attribute values of at least two entities.
Parameters: Parameter-Free method
Attributes from D1:
	id, name, description, price
Attributes from D2:
	id, name, description, price
Runtime: 0.3155 seconds


In [8]:
# Evaluate the blocks: prints precision, recall and F1 and returns the
# metrics (including TP/FP/TN/FN counts) as a dict.
qgb.evaluate(blocks)

***************************************************************************************************************************
                                         Μethod:  Standard Blocking
***************************************************************************************************************************
Method name: Standard Blocking
Parameters: 
Runtime: 0.3155 seconds
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Performance:
	Precision:      0.12% 
	Recall:        99.81%
	F1-score:       0.24%
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


{'Precision %': 0.11989350241963842,
 'Recall %': 99.81412639405205,
 'F1 %': 0.2394993259900253,
 'True Positives': 1074,
 'False Positives': 894721,
 'True Negatives': 263053,
 'False Negatives': 2}

# Comparison Cleaning


In [9]:
from pyjedai.block_cleaning import BlockPurging

In [10]:
# Block Purging: per the method's own report, discards the blocks exceeding a
# certain number of comparisons. tqdm_disable=False keeps the progress bar visible.
cbbp = BlockPurging()
cleaned_blocks = cbbp.process(blocks, data, tqdm_disable=False)

Block Purging:   0%|          | 0/4264 [00:00<?, ?it/s]

In [11]:
# Print the purging summary, including the smoothing factor and the
# max-comparisons-per-block threshold that were applied.
cbbp.report()

Method name: Block Purging
Method info: Discards the blocks exceeding a certain number of comparisons.
Parameters: 
	Smoothing factor: 1.025
	Max Comparisons per Block: 13920.0
Runtime: 0.0473 seconds


In [13]:
# Evaluate the purged blocks: prints precision, recall and F1 and returns
# the metrics as a dict.
cbbp.evaluate(cleaned_blocks)

***************************************************************************************************************************
                                         Μethod:  Block Purging
***************************************************************************************************************************
Method name: Block Purging
Parameters: 
	Smoothing factor: 1.025
	Max Comparisons per Block: 13920.0
Runtime: 0.0473 seconds
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Performance:
	Precision:      0.24% 
	Recall:        99.81%
	F1-score:       0.48%
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


{'Precision %': 0.24257043350995353,
 'Recall %': 99.81412639405205,
 'F1 %': 0.4839647255505437,
 'True Positives': 1074,
 'False Positives': 441684,
 'True Negatives': 716090,
 'False Negatives': 2}

## Meta Blocking

In [14]:
from pyjedai.comparison_cleaning import (
    WeightedEdgePruning,
    WeightedNodePruning,
    CardinalityEdgePruning,
    CardinalityNodePruning,
    BLAST,
    ReciprocalCardinalityNodePruning,
    ReciprocalWeightedNodePruning,
    ComparisonPropagation
)

In [15]:
# Meta-blocking step: Cardinality Edge Pruning with the 'X2' weighting scheme.
# tqdm_disable=True suppresses the progress bar for this step.
# NOTE(review): the variable is named `wep` although it holds a
# CardinalityEdgePruning instance (not WeightedEdgePruning); renaming it would
# also require updating the evaluate cell that references it.
wep = CardinalityEdgePruning(weighting_scheme='X2')
candidate_pairs_blocks = wep.process(cleaned_blocks, data, tqdm_disable=True)

In [16]:
# Evaluate the retained candidate pairs; the rendered output shows the metrics
# as a one-row table. The method configuration is passed as the second
# argument, presumably so the report can list the parameters -- TODO confirm.
wep.evaluate(candidate_pairs_blocks, wep.method_configuration())

***************************************************************************************************************************
                                         Μethod:  Cardinality Edge Pruning
***************************************************************************************************************************
Method name: Cardinality Edge Pruning
Parameters: 
	Node centric: False
	Weighting scheme: X2
Runtime: 13.6502 seconds
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Performance:
	Precision:      5.81% 
	Recall:        96.19%
	F1-score:      10.96%
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


Unnamed: 0,Precision %,Recall %,F1 %,True Positives,False Positives,True Negatives,False Negatives
0,5.81,96.19,10.96,1035.0,16782.0,1140000.0,41.0


In [34]:
# print(candidate_pairs_blocks)

# Supervised DL method for matching based on the candidate pairs

Candidate pairs are stored as a dictionary of the form:

> entity_id_1 -> set(entity_id_x, entity_id_y, entity_id_z)

- __The split process:__ Given a ratio (0.6 + 0.2 + 0.2 = 1.0), all candidates will be split into train, test and validation sets accordingly, with each entity represented by the concatenation of its attribute values into a single lowercase string. For example, 60% of the comparisons will be used for training, 20% for testing and 20% for validation, using PyTorch.
- After the concatenation each string entity will be transformed into an embedding using pre-trained word and sentence embeddings.
- The next step will be to create the vocabulary and afterwards train the model. The model will produce a vector of similarity scores (**TODO:** confirm the exact form of the model output), and this vector will be used to determine whether a candidate pair is a match or not.

## Data preparation for train, test, validation

In [22]:
!pip install tensorboardX
!pip install pytorch_transformers

Collecting pytorch_transformers
  Using cached pytorch_transformers-1.2.0-py3-none-any.whl (176 kB)
Collecting sacremoses
  Using cached sacremoses-0.0.53.tar.gz (880 kB)
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py): started
  Building wheel for sacremoses (setup.py): finished with status 'done'
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=73c9d1f2468adc2ef8ee2142873050778bc8a35ed1d5844c6c64eed4cdcff22a
  Stored in directory: c:\users\nikol\appdata\local\pip\cache\wheels\87\39\dd\a83eeef36d0bf98e7a4d1933a4ad2d660295a40613079bafc9
Successfully built sacremoses
Installing collected packages: sacremoses, pytorch-transformers
Successfully installed pytorch-transformers-1.2.0 sacremoses-0.0.53


In [23]:
from pyjedai.supervised_techniques import PretrainedSupervisedER

## PyTorch with BERT-like models and pre-trained word and sentence embeddings



__Available models:__
- 'roberta-base' 
- 'bert-base-uncased' 
- 'distilbert-base-uncased' 
- 'sentence-transformers/all-distilroberta-v1' 
- 'sentence-transformers/all-MiniLM-L12-v2' 
- 'albert-base-v2' 
- 'sentence-transformers/all-mpnet-base-v2' 
- 'xlnet-base-cased'

__Available model types:__ 
- 'roberta' 
- 'bert' 
- 'distilbert' 
- 'sdistilroberta'
- 'sminilm'
- 'albert' 
- 'smpnet'
- 'xlnet'

In [None]:
# Create the supervised matcher backed by the 'roberta-base' checkpoint.
# Presumably model_type and model_name must be a corresponding pair from the
# "Available model types" / "Available models" lists -- TODO confirm.
pser = PretrainedSupervisedER(model_type='roberta', model_name='roberta-base')

In [None]:
# Train the matcher and collect its predictions together with the true labels.
# NOTE(review): this passes the raw Standard Blocking output (`blocks`), whereas
# the section text says matching is based on the candidate pairs; confirm
# whether `candidate_pairs_blocks` was intended and is accepted by fit_blocks.
prediction, true_labels = pser.fit_blocks(blocks, data)

In [None]:
# Compare the matcher's predictions with the true labels.
pser.evaluate(prediction, true_labels)