In [1]:
from torch.utils.data import Dataset
import torch
import torch.backends.cudnn as cudnn
from utils.preprocessing import WordEmbedding, load_word_emb
import numpy as np
import os
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset
from utils.metrics import accuracy, precision, recall
from models.pointer_net import PointerNet
from utils.datasets import SchemaMatchingDataset

import warnings
# NOTE(review): an "ignore" filter with no category/module restriction silences
# every warning raised for the rest of the session, including deprecation and
# numerical warnings from the libraries imported above. Consider narrowing it.
warnings.filterwarnings("ignore")  

In [2]:
# Run configuration. The network entries are handed positionally to the
# PointerNet constructor in the evaluation cell, in the order listed here.
params = dict(
    # --- device ---
    gpu=True,             # request CUDA; only honoured when a device is available
    # --- network ---
    input_size=300,
    embedding_size=300,
    hiddens=256,
    nof_lstms=2,
    dropout=0.3,
    bidir=True,
)

In [3]:
# Create the dataset wrapper without in-memory data (first argument is None)
# and populate it from the serialized training directory.
# NOTE(review): `from_path=True` presumably tells the class its contents will
# come from disk via `load()` -- confirm against SchemaMatchingDataset.
training_dir = 'data/training'
dataset = SchemaMatchingDataset(None, from_path=True)
dataset.load(training_dir)

In [4]:
def _evaluate_batch(model, batch, use_cuda):
    """Score one bootstrap batch with ``model``.

    Args:
        model: network called as ``model(inputs)`` and returning an
            ``(outputs, pointers)`` pair.
        batch: ``(inputs, targets)`` pair as yielded by the dataset.
        use_cuda: move the batch to the GPU before the forward pass. Must
            match the device the model was placed on.

    Returns:
        dict with the keys ``'accuracy'``, ``'recall'`` and ``'precision'``,
        holding whatever the corresponding metric helpers return.
    """
    inputs, targets = batch
    if use_cuda:
        inputs, targets = inputs.cuda(), targets.cuda()
    # Inference only: the metrics do not need an autograd graph.
    with torch.no_grad():
        _, pointers = model(inputs)
    return {
        'accuracy': accuracy(pointers, targets),
        'recall': recall(pointers, targets),
        'precision': precision(pointers, targets),
    }


for version in ['sp']:  # all available checkpoints: ['sp', 'ap', 'np']
    # np = no pretraining
    # ap = pretraining on alphabet sorting
    # sp = pretraining on 1to1 schema pointing
    model = PointerNet(params['input_size'],
                       params['embedding_size'],
                       params['hiddens'],
                       params['nof_lstms'],
                       params['dropout'],
                       params['bidir'])

    model.initialize('serialized/schema_pointer_{}.pt'.format(version))
    # NOTE(review): the model is never switched to eval mode here. Unless
    # `initialize` already does that, dropout stays active during this
    # evaluation -- confirm and add `model.eval()` if needed.

    # Decide the device once and use the same decision for model and batches.
    # The previous `if torch.cuda.is_available:` tested the function object
    # (always truthy), so batches were sent to CUDA regardless of the machine
    # or of params['gpu'], and the resulting errors were silently swallowed.
    use_cuda = bool(params['gpu']) and torch.cuda.is_available()
    if use_cuda:
        model.cuda()
        cudnn.benchmark = True

    num_samples = 100
    batch_size = 256

    # Fail early on a missing output directory instead of after the full run.
    os.makedirs('logging', exist_ok=True)

    # (source, error) for every batch that could not be scored. Batches are
    # still skipped best-effort, but no longer invisibly.
    skipped = []

    # Pass 1: metrics over bootstrap samples of the whole dataset.
    combined_logs = []
    for data in tqdm(dataset.yield_bootstrap(num_samples, batch_size), total=num_samples):
        try:
            combined_logs.append(_evaluate_batch(model, data, use_cuda))
        except Exception as exc:  # not bare: Ctrl-C must still interrupt the run
            skipped.append(('combined', repr(exc)))

    # Pass 2: metrics per matching class. Only these three keys are logged;
    # any other key yielded by the dataset is ignored, as before.
    logs_by_class = {'1to0': [], '1to1': [], '1toN': []}
    for data in tqdm(dataset.yield_bootstrap_by_class(num_samples, batch_size), total=num_samples):
        for key in data.keys():
            if key not in logs_by_class:
                continue
            try:
                logs_by_class[key].append(_evaluate_batch(model, data[key], use_cuda))
            except Exception as exc:
                skipped.append((key, repr(exc)))

    if skipped:
        # print() rather than warnings.warn(): warnings are globally ignored
        # in this notebook and would never be shown.
        print('[{}] skipped {} batch(es); first failure in {!r}: {}'.format(
            version, len(skipped), skipped[0][0], skipped[0][1]))

    combined_logs = pd.DataFrame(combined_logs)
    combined_logs.to_csv('logging/combined_bootstrap_analysis_batchsize{}_{}.txt'.format(batch_size, version), index=False)

    logs_1to1 = pd.DataFrame(logs_by_class['1to1'])
    logs_1to1.to_csv('logging/o2o_bootstrap_analysis_batchsize{}_{}.txt'.format(batch_size, version), index=False)

    logs_1to0 = pd.DataFrame(logs_by_class['1to0'])
    logs_1to0.to_csv('logging/o2z_bootstrap_analysis_batchsize{}_{}.txt'.format(batch_size, version), index=False)

    logs_1toN = pd.DataFrame(logs_by_class['1toN'])
    logs_1toN.to_csv('logging/o2n_bootstrap_analysis_batchsize{}_{}.txt'.format(batch_size, version), index=False)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:33<00:00,  3.02it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:54<00:00,  1.85it/s]
