In [1]:
import torch
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
from torch.utils.data import DataLoader
import numpy as np

from tqdm import tqdm
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from utils.datasets import SchemaMatchingDataset
from models.pointer_net import PointerNet
from utils.metrics import accuracy, precision, recall

from torch.utils.data import DataLoader

In [2]:
# Central hyper-parameter table read by the cells below.
params = dict(
    # -- data --
    batch_size=64,        # mini-batch size handed to both DataLoaders
    trainsplit=0.8,       # presumably the train share of the split -- TODO confirm against dataset.split
    shuffle=True,         # forwarded to DataLoader(shuffle=...)
    # -- training --
    nof_epoch=150,        # number of passes run_training makes over the training loader
    lr=0.001,             # initial Adam learning rate
    # -- hardware --
    gpu=True,             # move the model to CUDA when a device is available
    # -- network (passed positionally to PointerNet, in this order) --
    input_size=300,
    embedding_size=300,
    hiddens=256,
    nof_lstms=2,
    dropout=0.3,
    bidir=True,
)

In [3]:
# Load the schema-matching data from disk and build the train / test loaders.
dataset = SchemaMatchingDataset(None, from_path=True)
dataset.load('data/training')

# Read the ratio from the config table instead of repeating the literal 0.8,
# so that editing params['trainsplit'] actually changes the split.
splits = dataset.split(params['trainsplit'])

train_dataloader = DataLoader(splits['train']['data'],
                              batch_size=params['batch_size'],
                              shuffle=params['shuffle'],
                              collate_fn=dataset.collate)

# Evaluation order does not influence learning; a fixed order keeps the
# batch composition (and therefore the per-batch averaged metrics) identical
# from epoch to epoch and from run to run.
test_dataloader = DataLoader(splits['test']['data'],
                             batch_size=params['batch_size'],
                             shuffle=False,
                             collate_fn=dataset.collate)

In [4]:
def _batch_to_device(sample_batched, device):
    """Unpack one (inputs, targets) batch and move both tensors to `device`.

    Inputs are cast to float before the transfer; targets keep their dtype.
    """
    x, y = sample_batched
    return x.float().to(device), y.to(device)


def run_training(suffix, model, model_optim, CCE):
    """Train `model`, evaluate it after every epoch, then save model and metrics.

    Reads the module-level `params`, `train_dataloader` and `test_dataloader`.

    Args:
        suffix: Tag inserted into the two output file names
            ('serialized/schema_pointer_<suffix>.pt' and
            'logging/schema_pointer_<suffix>.txt').
        model: Network to train. It is called as `model(batch)` and must return
            a pair `(o, p)`; `o` is flattened to 2-D and fed to the loss, `p` is
            fed to the metric functions (presumably scores and predicted
            pointers respectively -- TODO confirm against PointerNet). It must
            also provide `train()`, `eval()`, `parameters()` and
            `serialize(path)`.
        model_optim: Optimizer exposing `zero_grad()`, `step()` and
            `param_groups`.
        CCE: Loss callable invoked as `CCE(flat_outputs, flat_targets)`.

    Returns:
        None. Side effects: updates the model weights, multiplies every
        parameter group's learning rate by 0.95 after each epoch, writes the
        serialized model and a CSV file with one row of averaged metrics per
        epoch.
    """
    import os  # local import: only needed to create the output folders

    # Follow the device the model actually lives on. Moving batches to CUDA
    # merely because a GPU exists breaks when the caller kept the model on CPU
    # (e.g. params['gpu'] is False on a CUDA machine).
    device = next(model.parameters()).device
    nof_epoch = params['nof_epoch']

    logs = []  # one dict of epoch-averaged metrics per epoch (plotting curve)
    for epoch in range(nof_epoch):
        train_accs, test_accs = [], []
        train_recalls, test_recalls = [], []
        train_precisions, test_precisions = [], []

        # ---------------- training pass ----------------
        model.train()  # mode is loop-invariant: set once per pass
        train_iterator = tqdm(train_dataloader, unit='Batch')
        train_iterator.set_description('Epoch %i/%i' % (epoch + 1, nof_epoch))
        for sample_batched in train_iterator:
            train_batch, target_batch = _batch_to_device(sample_batched, device)

            o, p = model(train_batch)
            train_acc = accuracy(p, target_batch)
            train_accs.append(train_acc)
            train_recalls.append(recall(p, target_batch))
            train_precisions.append(precision(p, target_batch))

            # Collapse all leading dimensions so the loss sees one row per
            # target entry.
            o = o.contiguous().view(-1, o.size()[-1])
            loss = CCE(o, target_batch.view(-1))

            model_optim.zero_grad()
            loss.backward()
            model_optim.step()

            train_iterator.set_postfix(train_acc='{}'.format(train_acc))
        train_iterator.set_postfix(train_acc='{}'.format(np.average(train_accs)))

        # ---------------- evaluation pass ----------------
        model.eval()
        test_iterator = tqdm(test_dataloader, unit='Batch')
        # No gradients are needed for evaluation; skipping graph construction
        # saves memory and time.
        with torch.no_grad():
            for sample_batched in test_iterator:
                test_batch, target_batch = _batch_to_device(sample_batched, device)

                o, p = model(test_batch)
                test_acc = accuracy(p, target_batch)
                test_accs.append(test_acc)
                test_recalls.append(recall(p, target_batch))
                # Must be precision(): this value is logged as 'test_precision'.
                test_precisions.append(precision(p, target_batch))

                test_iterator.set_postfix(test_acc='{}'.format(test_acc))
        test_iterator.set_postfix(test_acc='{}'.format(np.average(test_accs)))

        logs.append({
            'epoch': epoch + 1,
            'train_accuracy': np.average(train_accs),
            'test_accuracy': np.average(test_accs),
            'train_recall': np.average(train_recalls),
            'test_recall': np.average(test_recalls),
            'train_precision': np.average(train_precisions),
            'test_precision': np.average(test_precisions),
        })

        # each epoch, reduce the learning rate
        for group in model_optim.param_groups:
            group['lr'] *= 0.95

    # Make sure the target folders exist so a long run cannot lose its results
    # at the very last step.
    os.makedirs('serialized', exist_ok=True)
    os.makedirs('logging', exist_ok=True)

    model.serialize('serialized/schema_pointer_{}.pt'.format(suffix))
    logs = pd.DataFrame(logs)
    logs.to_csv('logging/schema_pointer_{}.txt'.format(suffix), index=False)

In [5]:
# Train one model per warm-start variant (full sweep: ['np', 'ap', 'sp']):
#   np = no pretraining
#   ap = pretraining on alphabet sorting
#   sp = pretraining on 1to1 schema pointing
for version in ['sp']:
    # PointerNet takes its hyper-parameters positionally, in exactly this order.
    model = PointerNet(*(params[key] for key in ('input_size',
                                                 'embedding_size',
                                                 'hiddens',
                                                 'nof_lstms',
                                                 'dropout',
                                                 'bidir')))

    # Warm start: 'np' keeps the fresh weights, 'ap' loads the alphabet
    # checkpoint, anything else loads the schema checkpoint.
    if version == 'ap':
        model.initialize('serialized/alphabet_pointer.pt')
    elif version != 'np':
        model.initialize('serialized/schema_pointer.pt')

    if params['gpu'] and torch.cuda.is_available():
        model.cuda()
        cudnn.benchmark = True

    CCE = torch.nn.CrossEntropyLoss()
    # Only hand parameters that require gradients to the optimizer.
    model_optim = optim.Adam(
        [weight for weight in model.parameters() if weight.requires_grad],
        lr=params['lr'],
    )

    run_training(suffix=version, model=model, model_optim=model_optim, CCE=CCE)

Epoch 1/150: 100%|███████████████████████████████████████████| 118/118 [00:35<00:00,  3.33Batch/s, train_acc=0.5234375]
100%|███████████████████████████████████████████████████████████| 30/30 [00:05<00:00,  5.93Batch/s, test_acc=0.5390625]
Epoch 2/150: 100%|██████████████████████████████████████████| 118/118 [00:35<00:00,  3.34Batch/s, train_acc=0.69921875]
100%|███████████████████████████████████████████████████████████| 30/30 [00:04<00:00,  7.00Batch/s, test_acc=0.5078125]
Epoch 3/150: 100%|██████████████████████████████████████████| 118/118 [00:34<00:00,  3.44Batch/s, train_acc=0.73828125]
100%|████████████████████████████████████████████████████████████| 30/30 [00:04<00:00,  7.09Batch/s, test_acc=0.703125]
Epoch 4/150: 100%|██████████████████████████████████████████| 118/118 [00:34<00:00,  3.46Batch/s, train_acc=0.78515625]
100%|██████████████████████████████████████████████████| 30/30 [00:04<00:00,  6.94Batch/s, test_acc=0.6809895833333334]
Epoch 5/150: 100%|██████████████████████

Epoch 35/150: 100%|█████████████████████████████████| 118/118 [00:32<00:00,  3.60Batch/s, train_acc=0.9348958333333334]
100%|██████████████████████████████████████████████████| 30/30 [00:04<00:00,  7.26Batch/s, test_acc=0.8880208333333334]
Epoch 36/150: 100%|█████████████████████████████████| 118/118 [00:35<00:00,  3.35Batch/s, train_acc=0.8776041666666667]
100%|████████████████████████████████████████████████████████████| 30/30 [00:03<00:00,  7.71Batch/s, test_acc=0.765625]
Epoch 37/150: 100%|██████████████████████████████████████████| 118/118 [00:32<00:00,  3.60Batch/s, train_acc=0.8828125]
100%|█████████████████████████████████████████████████████████████| 30/30 [00:04<00:00,  7.32Batch/s, test_acc=0.88125]
Epoch 38/150: 100%|███████████████████████████████████████████| 118/118 [00:32<00:00,  3.59Batch/s, train_acc=0.909375]
100%|████████████████████████████████████████████████████████████| 30/30 [00:04<00:00,  7.26Batch/s, test_acc=0.953125]
Epoch 39/150: 100%|█████████████████████

Epoch 69/150: 100%|█████████████████████████████████| 118/118 [00:34<00:00,  3.45Batch/s, train_acc=0.9192708333333334]
100%|██████████████████████████████████████████████████| 30/30 [00:04<00:00,  7.24Batch/s, test_acc=0.8385416666666667]
Epoch 70/150: 100%|███████████████████████████████████████████| 118/118 [00:33<00:00,  3.49Batch/s, train_acc=0.915625]
100%|██████████████████████████████████████████████████| 30/30 [00:03<00:00,  7.51Batch/s, test_acc=0.8165178571428571]
Epoch 71/150: 100%|█████████████████████████████████| 118/118 [00:33<00:00,  3.53Batch/s, train_acc=0.8971354166666666]
100%|██████████████████████████████████████████████████| 30/30 [00:04<00:00,  7.31Batch/s, test_acc=0.8697916666666666]
Epoch 72/150: 100%|█████████████████████████████████| 118/118 [00:33<00:00,  3.53Batch/s, train_acc=0.8786458333333333]
100%|██████████████████████████████████████████████████| 30/30 [00:04<00:00,  7.01Batch/s, test_acc=0.8236607142857143]
Epoch 73/150: 100%|█████████████████████

Epoch 103/150: 100%|████████████████████████████████| 118/118 [00:35<00:00,  3.35Batch/s, train_acc=0.9114583333333334]
100%|██████████████████████████████████████████████████| 30/30 [00:04<00:00,  7.44Batch/s, test_acc=0.8729166666666667]
Epoch 104/150: 100%|████████████████████████████████| 118/118 [00:32<00:00,  3.60Batch/s, train_acc=0.9244791666666666]
100%|████████████████████████████████████████████████████████████| 30/30 [00:03<00:00,  7.91Batch/s, test_acc=0.859375]
Epoch 105/150: 100%|████████████████████████████████| 118/118 [00:33<00:00,  3.49Batch/s, train_acc=0.8511904761904762]
100%|██████████████████████████████████████████████████| 30/30 [00:04<00:00,  7.43Batch/s, test_acc=0.8229166666666666]
Epoch 106/150: 100%|██████████████████████████████████████████| 118/118 [00:33<00:00,  3.56Batch/s, train_acc=0.890625]
100%|██████████████████████████████████████████████████| 30/30 [00:04<00:00,  7.29Batch/s, test_acc=0.8541666666666666]
Epoch 107/150: 100%|████████████████████

Epoch 137/150: 100%|████████████████████████████████| 118/118 [00:33<00:00,  3.55Batch/s, train_acc=0.8292410714285714]
100%|██████████████████████████████████████████████████████████████| 30/30 [00:04<00:00,  7.49Batch/s, test_acc=0.9375]
Epoch 138/150: 100%|█████████████████████████████████████████████| 118/118 [00:34<00:00,  3.43Batch/s, train_acc=0.925]
100%|██████████████████████████████████████████████████| 30/30 [00:05<00:00,  5.82Batch/s, test_acc=0.8385416666666666]
Epoch 139/150: 100%|████████████████████████████████████████| 118/118 [00:32<00:00,  3.64Batch/s, train_acc=0.91796875]
100%|███████████████████████████████████████████████████████████████| 30/30 [00:04<00:00,  7.33Batch/s, test_acc=0.625]
Epoch 140/150: 100%|████████████████████████████████| 118/118 [00:33<00:00,  3.55Batch/s, train_acc=0.9616815476190476]
100%|██████████████████████████████████████████████████| 30/30 [00:04<00:00,  7.03Batch/s, test_acc=0.8854166666666666]
Epoch 141/150: 100%|████████████████████