# Schema Pointer Demonstration

In [1]:
from models.pointer_net import PointerNet
from utils.preprocessing import WordEmbedding, load_word_emb
import numpy as np
import torch

import warnings
warnings.filterwarnings("ignore")

### Pre-trained GloVe embeddings and model architecture

In [2]:
# Locations of the pre-trained GloVe resources: one directory plus the two
# file names that are handed to load_word_emb below.
w2v_config = dict(
    data_dir='data/glove',
    word2idx_path='word2idx.json',
    usedwordemb_path='usedwordemb.npy',
)

# Load the embedding resources and wrap them in the project's WordEmbedding
# helper; the resulting object is what the schema pointer later calls to
# embed its input.
w2v = WordEmbedding(
    load_word_emb(
        w2v_config['data_dir'],
        w2v_config['word2idx_path'],
        w2v_config['usedwordemb_path'],
    )
)

# Hyper-parameters of the pointer network. SchemaPointer1D.initialize reads
# every key below and forwards the values, in this order, to PointerNet.
model_params = dict(
    input_size=300,
    embedding_size=300,
    hiddens=256,
    nof_lstms=2,
    dropout=0,
    bidir=True,
)

### Schema Pointer definition

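$ 
x = \texttt{<BEG>} \cdot x^{s} \cdot \texttt{<SEP>} \cdot x^{t}_{1} \cdot x^{t}_{2} \cdot \ldots \cdot x^{t}_{N} \cdot \texttt{<END>}
$

Here $x^{s}$ is the source column name, $x^{t}_{1}, \ldots, x^{t}_{N}$ are the column names of the target schema, and $\cdot$ denotes concatenation. Note that the code below spells the separator token `<SEQ>`.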

In [3]:
class SchemaPointer1D():
    """Match one source column name against a target schema with a pointer network.

    The input is laid out as the token sequence
    ``<BEG>, source column, <SEQ>, target column 1..N, <END>``. Every token is
    embedded with the callable registered through :meth:`set_w2v`, the
    sequence is passed to the ``PointerNet`` built by :meth:`initialize`, and
    the pointers that precede the largest pointer index are decoded back into
    a token.

    Typical use: ``set_w2v(...)``, then ``initialize(...)``, then call the
    instance.
    """

    # Returned when the network does not single out exactly one token.
    NO_MATCH = '<NONE>'

    def __init__(self):
        """Create an empty matcher; embeddings and model are attached later."""
        self.w2v = None    # embedding callable, see set_w2v()
        self.model = None  # PointerNet instance, see initialize()

    def __call__(self, source_col, target_schema, verbose=False):
        """Predict which token of the input sequence matches ``source_col``.

        Args:
            source_col: Name of the source column (str).
            target_schema: Iterable of target column names (str).
            verbose: When true, print the ';'-joined input sequence and the
                prediction.

        Returns:
            The selected (lower-cased) token, or ``'<NONE>'`` when the network
            does not select exactly one token.

        Raises:
            RuntimeError: If ``initialize`` or ``set_w2v`` has not been called.
        """
        if self.model is None:
            raise RuntimeError("model is not loaded; call initialize() first")
        features, sequence_tok = self._preprocess(source_col, target_schema)
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            _, pointers = self.model(features)
        prediction = self._decode(pointers, sequence_tok)
        if verbose:
            print("x = {}".format(';'.join(sequence_tok)))
            print("y = {}".format(prediction))
        return prediction

    def set_w2v(self, w2v):
        """Register the embedding callable used by ``_preprocess``."""
        self.w2v = w2v

    def initialize(self, params, path='serialized/schema_pointer_sp.pt'):
        """Build the ``PointerNet`` and put it into evaluation mode.

        Args:
            params: Mapping with the keys ``input_size``, ``embedding_size``,
                ``hiddens``, ``nof_lstms``, ``dropout`` and ``bidir``.
            path: Passed to ``PointerNet.initialize`` (presumably the file
                holding the serialized weights -- confirm in models.pointer_net).
        """
        self.model = PointerNet(params['input_size'],
                                params['embedding_size'],
                                params['hiddens'],
                                params['nof_lstms'],
                                params['dropout'],
                                params['bidir'])
        self.model.initialize(path)
        self.model.eval()  # inference phase

    def _decode(self, pointers, sequence_tok):
        """Turn the model's pointer output for the first batch item into a token.

        Only the pointers located before the position of the largest pointer
        index are considered (presumably the largest index is the pointer to
        ``<END>``). A token is returned only when exactly one such pointer
        exists and it addresses a valid position; otherwise ``NO_MATCH``.
        """
        pointer_seq = pointers[0]
        if len(pointer_seq) == 0:
            return self.NO_MATCH
        end_position = int(pointer_seq.argmax())
        selected = pointer_seq[:end_position].tolist()
        if len(selected) != 1:
            # Nothing selected, or more than one column selected.
            return self.NO_MATCH
        index = selected[0]
        if not 0 <= index < len(sequence_tok):
            return self.NO_MATCH
        return sequence_tok[index]

    def _preprocess(self, source_col, target_schema):
        """Build the token sequence and its embedding tensor.

        Args:
            source_col: Name of the source column (str).
            target_schema: Iterable of target column names (str).

        Returns:
            A tuple ``(features, tokens)``: ``features`` is a float32 tensor
            with a leading batch axis of size 1 and one entry per token;
            ``tokens`` is the list of lower-cased tokens framed by the
            ``<BEG>``, ``<SEQ>`` and ``<END>`` markers.

        Raises:
            RuntimeError: If ``set_w2v`` has not been called.
        """
        if self.w2v is None:
            raise RuntimeError("word embeddings are not set; call set_w2v() first")

        # Build the token list directly rather than joining on ';' and
        # splitting again, so a column name that itself contains ';' stays a
        # single token and pointer indices keep addressing whole columns.
        # NOTE(review): the notebook's formula writes the separator as <SEP>
        # while this literal is <SEQ> -- confirm which one the model was
        # trained with before changing either.
        input_sequence_tok = ['<BEG>', source_col.lower(), '<SEQ>']
        input_sequence_tok.extend(column.lower() for column in target_schema)
        input_sequence_tok.append('<END>')

        # NOTE(review): iterating a str yields single characters, so the
        # embedding callable is invoked per character rather than per
        # whitespace-separated word. Left unchanged because the serialized
        # weights may depend on it -- confirm against the training pipeline.
        embeddings = [
            np.mean([self.w2v(word) for word in token], axis=0)
            for token in input_sequence_tok
        ]
        # Stack in NumPy first: building a tensor from a Python list of
        # ndarrays is the slow path in PyTorch.
        embeddings = torch.as_tensor(np.stack(embeddings), dtype=torch.float32)
        features = embeddings.unsqueeze(0)  # add the batch axis
        return features, input_sequence_tok
    
# Assemble the matcher used in the rest of the notebook: attach the embedding
# object first, then build the network from model_params and initialise it
# from the default path of SchemaPointer1D.initialize.
schema_pointer = SchemaPointer1D()
schema_pointer.set_w2v(w2v=w2v)
schema_pointer.initialize(params=model_params)

### Inference

In [6]:
# Each example is annotated with a data row in the layout
#   source_col,input_cols,target_cols
# where the individual columns are separated by "<|>".
examples = [
    # CA Winning Team,Date<|>Site<|>Sport<|>Winning team<|>Series,Winning team
    ('CA Winning Team', ['Date', 'Site', 'Sport', 'Winning team', 'Series']),
    # Sundsvall,102.<|>28 February<|>Friendly<|>Croatia<|>Austria,
    # NOTE(review): the row above names 'Sundsvall' / '102.' while the call
    # uses 'ice' / '102' -- confirm which values are intended.
    ('ice', ['102', '28 February', 'Friendly', 'Croatia', 'Austria']),
]

for position, (source_col, target_schema) in enumerate(examples):
    if position > 0:
        print("-" * 100)  # visual separator between consecutive examples
    match = schema_pointer(source_col, target_schema, verbose=True)

x = <BEG>;ca team;<SEQ>;date;site;sport;winning team;series;<END>
y = date
----------------------------------------------------------------------------------------------------
x = <BEG>;ice;<SEQ>;102;28 february;friendly;croatia;austria;<END>
y = <NONE>
