In [1]:
import torch
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
from torch.utils.data import DataLoader
import numpy as np

from tqdm import tqdm
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from utils.datasets import SchemaMatchingDataset
from models.pointer_net import PointerNet
from utils.metrics import accuracy, precision, recall

from torch.utils.data import DataLoader

In [2]:
# Central experiment configuration, grouped by concern.
params = dict(
    # --- data loading ---
    batch_size=128,
    trainsplit=0.8,
    shuffle=True,
    # --- optimisation ---
    nof_epoch=150,
    lr=0.001,
    # --- hardware: request CUDA (only honoured where CUDA is available) ---
    gpu=True,
    # --- PointerNet constructor arguments ---
    input_size=300,
    embedding_size=300,
    hiddens=256,
    nof_lstms=2,
    dropout=0.3,
    bidir=True,
)

In [3]:
# The train and test partitions are loaded separately from 'data/training',
# selected through the `suffix` argument of SchemaMatchingDataset.load().
train_dataset = SchemaMatchingDataset(None, from_path=True)
train_dataset.load('data/training', suffix='train')

test_dataset = SchemaMatchingDataset(None, from_path=True)
test_dataset.load('data/training', suffix='test')

train_dataloader = DataLoader(train_dataset,
                              batch_size=params['batch_size'],
                              shuffle=params['shuffle'],
                              collate_fn=train_dataset.collate)

# Evaluation data is never shuffled: the per-epoch test metrics are an
# unweighted mean over batches, so a changing batch composition (in particular
# of the last, smaller batch) would make the reported numbers and the
# best-checkpoint selection vary from run to run.
test_dataloader = DataLoader(test_dataset,
                             batch_size=params['batch_size'],
                             shuffle=False,
                             collate_fn=test_dataset.collate)

In [4]:
def _move_batch(sample_batched, device):
    """Unpack an ``(x, y)`` batch; cast ``x`` to float and move both to ``device``."""
    x, y = sample_batched
    return x.float().to(device), y.to(device)


def _score_batch(pointers, targets):
    """Return ``(accuracy, recall, precision)`` of ``pointers`` against ``targets``."""
    return (accuracy(pointers, targets),
            recall(pointers, targets),
            precision(pointers, targets))


def run_training(suffix, model, model_optim, CCE):
    """Train ``model`` for ``params['nof_epoch']`` epochs, evaluating after each.

    Reads the notebook-level ``params``, ``train_dataloader`` and
    ``test_dataloader``. After every epoch the mean of the per-batch test
    accuracies is compared with the best value so far; on improvement the model
    is saved via ``model.serialize('serialized/schema_pointer_<suffix>.pt')``.
    The per-epoch metric averages are written as CSV to
    ``logging/schema_pointer_<suffix>.txt`` once all epochs have finished.

    Args:
        suffix: Tag inserted into the checkpoint and log file names.
        model: Callable returning a pair ``(o, p)`` for an input batch; ``o`` is
            flattened to 2-D and passed to ``CCE``, ``p`` is passed to the metric
            functions. Must also provide ``train()``, ``eval()``,
            ``parameters()`` and ``serialize(path)``.
        model_optim: Optimizer over the model's parameters. The ``lr`` of each
            of its param groups is multiplied by 0.95 after every epoch.
        CCE: Loss callable taking the flattened outputs and flattened targets.

    Returns:
        pandas.DataFrame with one row per epoch (the same table that is written
        to the log file).
    """
    import os  # local import keeps this block self-contained

    checkpoint_path = 'serialized/schema_pointer_{}.pt'.format(suffix)
    log_path = 'logging/schema_pointer_{}.txt'.format(suffix)
    # Create the output directories before training so that a missing folder
    # cannot make the run fail only after all epochs have been computed.
    for out_path in (checkpoint_path, log_path):
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # Batches follow the model's device instead of being sent to CUDA whenever
    # it is available; this keeps data and model together when the model was
    # deliberately left on the CPU.
    device = next(model.parameters()).device

    nof_epoch = params['nof_epoch']
    lr_decay = 0.95  # multiplicative learning-rate decay applied after each epoch
    best_acc = 0
    logs = []  # one dict per epoch, used for plotting curves later

    for epoch in range(nof_epoch):
        train_accs, train_recalls, train_precisions = [], [], []
        test_accs, test_recalls, test_precisions = [], [], []

        # ---- training pass ----
        model.train()
        train_iterator = tqdm(train_dataloader, unit='Batch',
                              desc='Epoch %i/%i' % (epoch + 1, nof_epoch))
        for sample_batched in train_iterator:
            train_batch, target_batch = _move_batch(sample_batched, device)

            o, p = model(train_batch)
            train_acc, train_recall, train_precision = _score_batch(p, target_batch)
            train_accs.append(train_acc)
            train_recalls.append(train_recall)
            train_precisions.append(train_precision)

            # Flatten every leading dimension so the loss sees one row of
            # scores per target entry.
            o = o.contiguous().view(-1, o.size()[-1])
            loss = CCE(o, target_batch.view(-1))

            model_optim.zero_grad()
            loss.backward()
            model_optim.step()

            train_iterator.set_postfix(train_acc='{}'.format(train_acc))

        # ---- evaluation pass (no gradients needed) ----
        model.eval()
        test_iterator = tqdm(test_dataloader, unit='Batch')
        with torch.no_grad():
            for sample_batched in test_iterator:
                test_batch, target_batch = _move_batch(sample_batched, device)

                _, p = model(test_batch)
                test_acc, test_recall, test_precision = _score_batch(p, target_batch)
                test_accs.append(test_acc)
                test_recalls.append(test_recall)
                test_precisions.append(test_precision)
                test_iterator.set_postfix(test_acc='{}'.format(test_acc))

        avg_acc = np.average(test_accs)
        if avg_acc > best_acc:
            best_acc = avg_acc
            model.serialize(checkpoint_path)

        logs.append({
            'epoch': epoch + 1,
            'train_accuracy': np.average(train_accs),
            'test_accuracy': avg_acc,
            'train_recall': np.average(train_recalls),
            'test_recall': np.average(test_recalls),
            'train_precision': np.average(train_precisions),
            'test_precision': np.average(test_precisions)
        })

        # each epoch, reduce the learning rate
        for param_group in model_optim.param_groups:
            param_group['lr'] *= lr_decay

    logs = pd.DataFrame(logs)
    logs.to_csv(log_path, index=False)
    return logs

In [5]:
# Train one model per pretraining variant:
#   'np' -> no pretraining
#   'ap' -> pretraining on alphabet sorting
#   anything else (here 'sp') -> pretraining on 1to1 schema pointing
for version in ['sp']:
    network_args = [params[key] for key in ('input_size',
                                            'embedding_size',
                                            'hiddens',
                                            'nof_lstms',
                                            'dropout',
                                            'bidir')]
    model = PointerNet(*network_args)

    # Choose the checkpoint to warm-start from; None means "train from scratch".
    pretrained_path = {
        'np': None,
        'ap': 'serialized/alphabet_pointer.pt',
    }.get(version, 'serialized/schema_pointer.pt')
    if pretrained_path is not None:
        model.initialize(pretrained_path)

    use_cuda = params['gpu'] and torch.cuda.is_available()
    if use_cuda:
        model.cuda()
        cudnn.benchmark = True

    CCE = torch.nn.CrossEntropyLoss()
    # Only parameters that still require gradients are handed to the optimizer.
    trainable = [weight for weight in model.parameters() if weight.requires_grad]
    model_optim = optim.Adam(trainable, lr=params['lr'])

    run_training(version, model, model_optim, CCE)

Epoch 1/150: 100%|██████████████████████████████████| 175/175 [01:17<00:00,  2.25Batch/s, train_acc=0.5333333333333333]
100%|██████████████████████████████████████████████████| 44/44 [00:10<00:00,  4.21Batch/s, test_acc=0.5036764705882353]
Epoch 2/150: 100%|██████████████████████████████████| 175/175 [01:14<00:00,  2.36Batch/s, train_acc=0.6071428571428571]
100%|███████████████████████████████████████████████████| 44/44 [00:10<00:00,  4.19Batch/s, test_acc=0.622549019607843]
Epoch 3/150: 100%|██████████████████████████████████| 175/175 [01:13<00:00,  2.39Batch/s, train_acc=0.4880952380952381]
100%|██████████████████████████████████████████████████| 44/44 [00:10<00:00,  4.38Batch/s, test_acc=0.6801470588235294]
Epoch 4/150: 100%|██████████████████████████████████| 175/175 [01:12<00:00,  2.40Batch/s, train_acc=0.7857142857142857]
100%|██████████████████████████████████████████████████| 44/44 [00:10<00:00,  4.30Batch/s, test_acc=0.7696078431372548]
Epoch 5/150: 100%|██████████████████████

Epoch 35/150: 100%|█████████████████████████████████| 175/175 [01:12<00:00,  2.40Batch/s, train_acc=0.9523809523809523]
100%|███████████████████████████████████████████████████| 44/44 [00:10<00:00,  4.37Batch/s, test_acc=0.780398523045582]
Epoch 36/150: 100%|█████████████████████████████████| 175/175 [01:11<00:00,  2.44Batch/s, train_acc=0.7916666666666666]
100%|██████████████████████████████████████████████████| 44/44 [00:09<00:00,  4.41Batch/s, test_acc=0.7250000000000001]
Epoch 37/150: 100%|██████████████████████████████████| 175/175 [01:12<00:00,  2.42Batch/s, train_acc=0.808913308913309]
100%|██████████████████████████████████████████████████| 44/44 [00:09<00:00,  4.44Batch/s, test_acc=0.8022058823529411]
Epoch 38/150: 100%|█████████████████████████████████| 175/175 [01:12<00:00,  2.40Batch/s, train_acc=0.8095238095238095]
100%|██████████████████████████████████████████████████| 44/44 [00:09<00:00,  4.41Batch/s, test_acc=0.8345588235294118]
Epoch 39/150: 100%|█████████████████████

Epoch 69/150: 100%|█████████████████████████████████| 175/175 [01:11<00:00,  2.44Batch/s, train_acc=0.7936507936507935]
100%|███████████████████████████████████████████████████| 44/44 [00:09<00:00,  4.41Batch/s, test_acc=0.848529411764706]
Epoch 70/150: 100%|█████████████████████████████████| 175/175 [01:11<00:00,  2.44Batch/s, train_acc=0.9761904761904762]
100%|███████████████████████████████████████████████████| 44/44 [00:10<00:00,  4.36Batch/s, test_acc=0.732843137254902]
Epoch 71/150: 100%|█████████████████████████████████| 175/175 [01:11<00:00,  2.44Batch/s, train_acc=0.8253968253968255]
100%|██████████████████████████████████████████████████| 44/44 [00:10<00:00,  4.36Batch/s, test_acc=0.8455882352941176]
Epoch 72/150: 100%|█████████████████████████████████| 175/175 [01:12<00:00,  2.42Batch/s, train_acc=0.8928571428571429]
100%|██████████████████████████████████████████████████| 44/44 [00:10<00:00,  4.38Batch/s, test_acc=0.8382352941176471]
Epoch 73/150: 100%|█████████████████████

Epoch 103/150: 100%|████████████████████████████████| 175/175 [01:12<00:00,  2.42Batch/s, train_acc=0.9166666666666666]
100%|██████████████████████████████████████████████████| 44/44 [00:10<00:00,  4.36Batch/s, test_acc=0.8534313725490197]
Epoch 104/150: 100%|████████████████████████████████| 175/175 [01:12<00:00,  2.41Batch/s, train_acc=0.8380952380952381]
100%|██████████████████████████████████████████████████| 44/44 [00:10<00:00,  4.37Batch/s, test_acc=0.7907754010695187]
Epoch 105/150: 100%|████████████████████████████████| 175/175 [01:11<00:00,  2.43Batch/s, train_acc=0.9047619047619048]
100%|██████████████████████████████████████████████████| 44/44 [00:10<00:00,  4.37Batch/s, test_acc=0.8151960784313727]
Epoch 106/150: 100%|████████████████████████████████| 175/175 [01:11<00:00,  2.45Batch/s, train_acc=0.7619047619047619]
100%|██████████████████████████████████████████████████| 44/44 [00:10<00:00,  4.36Batch/s, test_acc=0.7783613445378151]
Epoch 107/150: 100%|████████████████████

Epoch 137/150: 100%|████████████████████████████████| 175/175 [01:12<00:00,  2.41Batch/s, train_acc=0.8642857142857142]
100%|██████████████████████████████████████████████████| 44/44 [00:09<00:00,  4.42Batch/s, test_acc=0.8345588235294118]
Epoch 138/150: 100%|████████████████████████████████| 175/175 [01:12<00:00,  2.43Batch/s, train_acc=0.9166666666666666]
100%|██████████████████████████████████████████████████| 44/44 [00:10<00:00,  4.36Batch/s, test_acc=0.8235294117647058]
Epoch 139/150: 100%|████████████████████████████████| 175/175 [01:12<00:00,  2.42Batch/s, train_acc=0.8706349206349205]
100%|██████████████████████████████████████████████████| 44/44 [00:09<00:00,  4.41Batch/s, test_acc=0.7071078431372548]
Epoch 140/150: 100%|████████████████████████████████| 175/175 [01:12<00:00,  2.41Batch/s, train_acc=0.8650793650793651]
100%|██████████████████████████████████████████████████| 44/44 [00:10<00:00,  4.38Batch/s, test_acc=0.7806372549019607]
Epoch 141/150: 100%|████████████████████

In [6]:
# run_training() saves the best-scoring checkpoint to
# 'serialized/schema_pointer_sp.pt' when called with suffix 'sp'. Writing the
# end-of-training weights to that same path would overwrite it, so the final
# state is stored under its own file name instead.
model.serialize('serialized/schema_pointer_sp_final.pt')