In [1]:
import torch
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
from torch.utils.data import DataLoader
import numpy as np

from tqdm import tqdm
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from utils.datasets import SchemaMatchingDataset
from models.pointer_net import PointerNet
from utils.metrics import accuracy, precision, recall

from torch.utils.data import DataLoader

In [2]:
# Central configuration for this notebook; later cells read it as params[<key>].
params = dict(
    # --- data ---
    batch_size=128,      # DataLoader batch size
    trainsplit=0.8,      # only referenced by the commented-out dataset.split() path
    shuffle=True,        # forwarded to DataLoader(shuffle=...)
    # --- training ---
    nof_epoch=150,       # number of passes run_training makes over the training loader
    lr=0.001,            # initial Adam learning rate
    # --- hardware ---
    gpu=True,            # model is moved to CUDA only if this is set and CUDA is available
    # --- network (passed positionally to PointerNet in this order) ---
    input_size=300,
    embedding_size=300,
    hiddens=256,
    nof_lstms=2,
    dropout=0.3,
    bidir=True,
)

In [3]:
# Both splits are loaded from 'data/training'; the `suffix` argument selects
# which one each dataset object holds.
train_dataset = SchemaMatchingDataset(None, from_path=True)
train_dataset.load('data/training', suffix='train')

test_dataset = SchemaMatchingDataset(None, from_path=True)
test_dataset.load('data/training', suffix='test')

# Each loader uses its own dataset's collate function to assemble batches.
train_dataloader = DataLoader(train_dataset,
                              batch_size=params['batch_size'],
                              shuffle=params['shuffle'],
                              collate_fn=train_dataset.collate)

# Evaluation data is never shuffled: run_training averages per-batch metrics,
# so a fixed batch composition keeps the test scores comparable across epochs
# (and therefore keeps best-checkpoint selection stable).
test_dataloader = DataLoader(test_dataset,
                             batch_size=params['batch_size'],
                             shuffle=False,
                             collate_fn=test_dataset.collate)

In [4]:
def _batch_to_device(sample_batched, device):
    """Unpack an ``(x, y)`` batch, cast ``x`` to float and move both tensors to ``device``."""
    x, y = sample_batched
    return x.float().to(device), y.to(device)


def run_training(suffix, model, model_optim, CCE, lr_decay=0.95):
    """Train ``model`` for ``params['nof_epoch']`` epochs, evaluating after each one.

    Every epoch runs one optimisation pass over the module-level
    ``train_dataloader`` and one gradient-free pass over ``test_dataloader``.
    Per-batch accuracy / recall / precision are averaged (unweighted) per epoch.
    Whenever the averaged test accuracy beats the best value seen so far the
    model is written to ``serialized/schema_pointer_<suffix>.pt``. After the
    last epoch the per-epoch metrics are written to
    ``logging/schema_pointer_<suffix>.txt`` in CSV format.

    Args:
        suffix: Tag inserted into the checkpoint and log file names.
        model: Network to train. Calling it on a batch must return a pair
            ``(o, p)``: ``o`` is flattened to ``(-1, o.size(-1))`` and passed to
            ``CCE``; ``p`` is passed to the metric functions. The model must also
            provide ``serialize(path)``.
        model_optim: Optimizer updating the model's parameters.
        CCE: Loss callable invoked as ``CCE(flattened_o, flattened_targets)``.
        lr_decay: Factor every param group's learning rate is multiplied by
            after each epoch. Defaults to 0.95.

    Returns:
        pandas.DataFrame with one row per epoch (the same data written to the
        log file).
    """
    import os  # local import so this cell does not depend on edits to the import cell

    # Create the output directories up front so a long run cannot fail at the
    # first checkpoint or at the final CSV write because a folder is missing.
    os.makedirs('serialized', exist_ok=True)
    os.makedirs('logging', exist_ok=True)

    # Batches follow the model's device. Checking torch.cuda.is_available()
    # alone would move batches to the GPU even when the model was kept on the
    # CPU (params['gpu'] == False), causing a device mismatch.
    device = next(model.parameters()).device
    nof_epoch = params['nof_epoch']

    best_acc = 0
    logs = []  # one dict per epoch, used for plotting curves later
    for epoch in range(1, nof_epoch + 1):
        train_accs, test_accs = [], []
        train_recalls, test_recalls = [], []
        train_precisions, test_precisions = [], []

        # ---- training pass ----
        model.train()
        train_iterator = tqdm(train_dataloader, unit='Batch')
        train_iterator.set_description('Epoch %i/%i' % (epoch, nof_epoch))
        for sample_batched in train_iterator:
            train_batch, target_batch = _batch_to_device(sample_batched, device)

            o, p = model(train_batch)
            # NOTE(review): the metric helpers are assumed to return plain
            # numbers that np.average can consume - confirm in utils.metrics.
            train_acc = accuracy(p, target_batch)
            train_accs.append(train_acc)
            train_recalls.append(recall(p, target_batch))
            train_precisions.append(precision(p, target_batch))

            # Flatten every leading dimension so each row of `o` lines up with
            # one entry of the flattened target tensor.
            o = o.contiguous().view(-1, o.size()[-1])
            target_batch_pressed = target_batch.view(-1)

            loss = CCE(o, target_batch_pressed)

            model_optim.zero_grad()
            loss.backward()
            model_optim.step()

            train_iterator.set_postfix(train_acc='{}'.format(train_acc))
        train_iterator.set_postfix(train_acc='{}'.format(np.average(train_accs)))

        # ---- evaluation pass ----
        model.eval()
        test_iterator = tqdm(test_dataloader, unit='Batch')
        with torch.no_grad():  # no autograd graph is needed for evaluation
            for sample_batched in test_iterator:
                test_batch, target_batch = _batch_to_device(sample_batched, device)

                o, p = model(test_batch)
                test_acc = accuracy(p, target_batch)
                test_accs.append(test_acc)
                test_recalls.append(recall(p, target_batch))
                # Previously this called recall(), so the logged test
                # precision was a copy of the test recall.
                test_precisions.append(precision(p, target_batch))
                test_iterator.set_postfix(test_acc='{}'.format(test_acc))
        avg_acc = np.average(test_accs)
        test_iterator.set_postfix(test_acc='{}'.format(avg_acc))

        # Keep only the checkpoint with the best averaged test accuracy.
        if avg_acc > best_acc:
            best_acc = avg_acc
            model.serialize('serialized/schema_pointer_{}.pt'.format(suffix))

        logs.append({
            'epoch': epoch,
            'train_accuracy': np.average(train_accs),
            'test_accuracy': avg_acc,
            'train_recall': np.average(train_recalls),
            'test_recall': np.average(test_recalls),
            'train_precision': np.average(train_precisions),
            'test_precision': np.average(test_precisions),
        })

        # Each epoch, shrink the learning rate of every param group.
        for param_group in model_optim.param_groups:
            param_group['lr'] *= lr_decay

    logs = pd.DataFrame(logs)
    logs.to_csv('logging/schema_pointer_{}.txt'.format(suffix), index=False)
    return logs

In [5]:
# Pretraining variants and the checkpoint each one starts from:
#   np = no pretraining (random initialisation, nothing is loaded)
#   ap = pretraining on alphabet sorting
#   sp = pretraining on 1to1 schema pointing
# Any tag not listed here falls back to the schema-pointing checkpoint.
pretrained_checkpoints = {
    'np': None,
    'ap': 'serialized/alphabet_pointer.pt',
}

for version in ['sp']:
    model = PointerNet(params['input_size'],
                       params['embedding_size'],
                       params['hiddens'],
                       params['nof_lstms'],
                       params['dropout'],
                       params['bidir'])

    checkpoint = pretrained_checkpoints.get(version, 'serialized/schema_pointer.pt')
    if checkpoint is not None:
        model.initialize(checkpoint)

    use_cuda = params['gpu'] and torch.cuda.is_available()
    if use_cuda:
        model.cuda()
        cudnn.benchmark = True

    CCE = torch.nn.CrossEntropyLoss()
    # Only parameters that still require gradients are handed to the optimizer.
    trainable = [p for p in model.parameters() if p.requires_grad]
    model_optim = optim.Adam(trainable, lr=params['lr'])

    run_training(version, model, model_optim, CCE)

Epoch 1/10: 100%|████████████████████████████████████| 88/88 [00:56<00:00,  1.55Batch/s, train_acc=0.49489795918367346]
100%|█████████████████████████████████████████████████| 22/22 [00:08<00:00,  2.56Batch/s, test_acc=0.34362852283770656]
Epoch 2/10: 100%|█████████████████████████████████████| 88/88 [00:51<00:00,  1.70Batch/s, train_acc=0.7238095238095238]
100%|███████████████████████████████████████████████████| 22/22 [00:08<00:00,  2.67Batch/s, test_acc=0.624234693877551]
Epoch 3/10: 100%|█████████████████████████████████████| 88/88 [00:51<00:00,  1.70Batch/s, train_acc=0.8214285714285714]
100%|██████████████████████████████████████████████████| 22/22 [00:08<00:00,  2.45Batch/s, test_acc=0.6556608357628765]
Epoch 4/10: 100%|█████████████████████████████████████| 88/88 [00:52<00:00,  1.68Batch/s, train_acc=0.7380952380952381]
100%|██████████████████████████████████████████████████| 22/22 [00:08<00:00,  2.70Batch/s, test_acc=0.6592565597667639]
Epoch 5/10: 100%|███████████████████████

In [6]:
# `model` is the network left over from the last iteration of the version loop,
# i.e. its weights are those of the final (or interrupted) epoch.
# run_training() already stores the best-scoring checkpoint for this run in
# 'serialized/schema_pointer_sp.pt'; the final weights go to a separate file so
# that checkpoint is not overwritten.
model.serialize('serialized/schema_pointer_sp_final.pt')