In [1]:
import time
# Block the kernel for three hours (60 s * 60 min * 3) before any later cell runs.
# NOTE(review): presumably a delayed start so another job can finish first —
# confirm, and drop this cell if it is no longer needed, since it makes
# "Restart & Run All" take three extra hours.
time.sleep(60 * 60 * 3)

In [2]:
import torch
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
from torch.utils.data import DataLoader
import numpy as np

from tqdm import tqdm
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from utils.datasets import SchemaMatchingDataset
from models.pointer_net import PointerNet
from utils.metrics import accuracy, precision, recall

from torch.utils.data import DataLoader

In [3]:
# Hyper-parameters and run-time switches read by the cells below.
params = dict(
    # Data
    batch_size=64,
    trainsplit=0.8,
    shuffle=True,
    # Training
    nof_epoch=150,
    lr=0.001,
    # GPU
    gpu=True,
    # Network
    input_size=300,
    embedding_size=300,
    hiddens=256,
    nof_lstms=2,
    dropout=0.3,
    bidir=True,
)

In [4]:
# Training and evaluation data are loaded as two separate datasets from
# 'data/training', distinguished by the `suffix` passed to `load`.
train_dataset = SchemaMatchingDataset(None, from_path=True)
train_dataset.load('data/training', suffix='train')

test_dataset = SchemaMatchingDataset(None, from_path=True)
test_dataset.load('data/training', suffix='test')

# Training batches are (optionally) reshuffled every epoch.
train_dataloader = DataLoader(train_dataset,
                              batch_size=params['batch_size'],
                              shuffle=params['shuffle'],
                              collate_fn=train_dataset.collate)

# Evaluation gains nothing from shuffling. A fixed order keeps the batch
# composition identical from epoch to epoch, so the per-batch averaged test
# metrics only change when the model changes.
test_dataloader = DataLoader(test_dataset,
                             batch_size=params['batch_size'],
                             shuffle=False,
                             collate_fn=test_dataset.collate)

In [5]:
def _prepare_batch(sample_batched, use_gpu):
    """Unpack an ``(x, y)`` batch, cast ``x`` to float and optionally move both to the GPU."""
    x, y = sample_batched
    inputs = Variable(x).float()
    targets = Variable(y)
    if use_gpu:
        inputs = inputs.cuda()
        targets = targets.cuda()
    return inputs, targets


def run_training(suffix, model, model_optim, CCE):
    """Train ``model`` for ``params['nof_epoch']`` epochs, evaluating after every epoch.

    Reads the notebook-level globals ``params``, ``train_dataloader`` and
    ``test_dataloader``.

    Parameters
    ----------
    suffix : str
        Tag inserted into the output file names
        (``serialized/schema_pointer_<suffix>.pt`` and
        ``logging/schema_pointer_<suffix>.txt``).
    model
        Called as ``model(batch)`` and expected to return a pair ``(o, p)``:
        ``o`` is fed to the loss, ``p`` to the metric functions. Must also
        provide ``train()``, ``eval()`` and ``serialize(path)``.
    model_optim
        Optimizer exposing ``zero_grad()``, ``step()`` and ``param_groups``.
    CCE
        Loss callable invoked as ``CCE(flattened_o, flattened_targets)``.

    Side effects
    ------------
    Multiplies the learning rate of every parameter group by 0.95 after each
    epoch, serializes the model once all epochs are done, and writes the
    per-epoch averaged accuracy / recall / precision (train and test) as CSV.
    """
    import os  # local import: `os` is not imported at the top of the notebook

    # Same condition the model-creation cell uses before calling `model.cuda()`,
    # so batches and model always end up on the same device.
    use_gpu = params['gpu'] and torch.cuda.is_available()

    # Make sure the output locations exist up front, so a missing directory is
    # not discovered only after all epochs have been trained.
    for out_dir in ('serialized', 'logging'):
        os.makedirs(out_dir, exist_ok=True)

    logs = []  # one dict of epoch-averaged metrics per epoch (for plotting curves)
    for epoch in range(params['nof_epoch']):
        train_accs, test_accs = [], []
        train_recalls, test_recalls = [], []
        train_precisions, test_precisions = [], []

        # ---- training pass ----
        model.train()
        train_iterator = tqdm(train_dataloader, unit='Batch')
        for sample_batched in train_iterator:
            train_iterator.set_description('Epoch %i/%i' % (epoch+1, params['nof_epoch']))

            train_batch, target_batch = _prepare_batch(sample_batched, use_gpu)

            o, p = model(train_batch)
            train_acc = accuracy(p, target_batch)
            train_accs.append(train_acc)
            train_recalls.append(recall(p, target_batch))
            train_precisions.append(precision(p, target_batch))

            # Flatten every leading dimension so the loss sees one row per
            # prediction and one target entry per row.
            o = o.contiguous().view(-1, o.size()[-1])
            target_batch_pressed = target_batch.view(-1)

            loss = CCE(o, target_batch_pressed)

            model_optim.zero_grad()
            loss.backward()
            model_optim.step()

            train_iterator.set_postfix(train_acc='{}'.format(train_acc))
        train_iterator.set_postfix(train_acc='{}'.format(np.average(train_accs)))

        # ---- evaluation pass ----
        model.eval()
        test_iterator = tqdm(test_dataloader, unit='Batch')
        # No gradients are needed for evaluation; skipping graph construction
        # saves memory and time.
        with torch.no_grad():
            for sample_batched in test_iterator:
                test_batch, target_batch = _prepare_batch(sample_batched, use_gpu)

                o, p = model(test_batch)
                test_acc = accuracy(p, target_batch)
                test_accs.append(test_acc)
                test_recalls.append(recall(p, target_batch))
                # Fixed: this used to call `recall`, so the logged test
                # precision was a copy of the test recall.
                test_precisions.append(precision(p, target_batch))
                test_iterator.set_postfix(test_acc='{}'.format(test_acc))
        test_iterator.set_postfix(test_acc='{}'.format(np.average(test_accs)))

        # Unweighted mean over batches for each metric.
        logs.append({
            'epoch': epoch + 1,
            'train_accuracy': np.average(train_accs),
            'test_accuracy': np.average(test_accs),
            'train_recall': np.average(train_recalls),
            'test_recall': np.average(test_recalls),
            'train_precision': np.average(train_precisions),
            'test_precision': np.average(test_precisions)
        })

        # each epoch, reduce the learning rate
        for param in model_optim.param_groups:
            param['lr'] *= 0.95

    model.serialize('serialized/schema_pointer_{}.pt'.format(suffix))
    logs = pd.DataFrame(logs)
    logs.to_csv('logging/schema_pointer_{}.txt'.format(suffix), index=False)

In [6]:
# Version tags and the starting point each one selects:
#   np = no pretraining
#   ap = pretraining on alphabet sorting
#   sp = pretraining on 1to1 schema pointing
for version in ['sp']:
    # PointerNet takes its settings positionally, in exactly this order.
    model = PointerNet(*(params[key] for key in ('input_size',
                                                 'embedding_size',
                                                 'hiddens',
                                                 'nof_lstms',
                                                 'dropout',
                                                 'bidir')))

    # Every tag other than 'np' starts from a stored checkpoint; any tag that
    # is not 'ap' falls through to the schema-pointer one.
    if version != 'np':
        model.initialize('serialized/alphabet_pointer.pt' if version == 'ap'
                         else 'serialized/schema_pointer.pt')

    if params['gpu'] and torch.cuda.is_available():
        model.cuda()
        cudnn.benchmark = True

    CCE = torch.nn.CrossEntropyLoss()
    # Hand only the parameters that require gradients to the optimizer.
    model_optim = optim.Adam((p for p in model.parameters() if p.requires_grad),
                             lr=params['lr'])
    run_training(version, model, model_optim, CCE)

Epoch 1/150: 100%|██████████████████████████████████| 193/193 [01:05<00:00,  2.93Batch/s, train_acc=0.6611842105263158]
100%|█████████████████████████████████████████████████████████████████| 49/49 [00:08<00:00,  5.93Batch/s, test_acc=0.5]
Epoch 2/150: 100%|██████████████████████████████████| 193/193 [01:03<00:00,  3.02Batch/s, train_acc=0.5373626373626373]
100%|██████████████████████████████████████████████████| 49/49 [00:07<00:00,  6.14Batch/s, test_acc=0.6923076923076923]
Epoch 3/150: 100%|██████████████████████████████████| 193/193 [01:03<00:00,  3.04Batch/s, train_acc=0.5131578947368421]
100%|██████████████████████████████████████████████████| 49/49 [00:08<00:00,  6.01Batch/s, test_acc=0.3076923076923077]
Epoch 4/150: 100%|██████████████████████████████████| 193/193 [01:03<00:00,  3.04Batch/s, train_acc=0.5508771929824562]
100%|█████████████████████████████████████████████████| 49/49 [00:08<00:00,  5.99Batch/s, test_acc=0.48717948717948717]
Epoch 5/150: 100%|██████████████████████

Epoch 35/150: 100%|██████████████████████████████████████████████| 193/193 [01:02<00:00,  3.08Batch/s, train_acc=0.775]
100%|██████████████████████████████████████████████████| 49/49 [00:08<00:00,  6.06Batch/s, test_acc=0.6217948717948718]
Epoch 36/150: 100%|█████████████████████████████████| 193/193 [01:03<00:00,  3.04Batch/s, train_acc=0.7293375484164958]
100%|██████████████████████████████████████████████████| 49/49 [00:08<00:00,  6.10Batch/s, test_acc=0.8461538461538461]
Epoch 37/150: 100%|█████████████████████████████████| 193/193 [01:02<00:00,  3.07Batch/s, train_acc=0.8881578947368421]
100%|███████████████████████████████████████████████████| 49/49 [00:07<00:00,  6.22Batch/s, test_acc=0.823076923076923]
Epoch 38/150: 100%|█████████████████████████████████| 193/193 [01:03<00:00,  3.05Batch/s, train_acc=0.8210526315789474]
100%|████████████████████████████████████████████████████████████████| 49/49 [00:08<00:00,  6.08Batch/s, test_acc=0.75]
Epoch 39/150: 100%|█████████████████████

Epoch 69/150: 100%|█████████████████████████████████| 193/193 [01:02<00:00,  3.07Batch/s, train_acc=0.7449874686716791]
100%|██████████████████████████████████████████████████| 49/49 [00:07<00:00,  6.20Batch/s, test_acc=0.7692307692307693]
Epoch 70/150: 100%|█████████████████████████████████| 193/193 [01:03<00:00,  3.04Batch/s, train_acc=0.7307283359914939]
100%|██████████████████████████████████████████████████| 49/49 [00:07<00:00,  6.40Batch/s, test_acc=0.6709401709401709]
Epoch 71/150: 100%|█████████████████████████████████| 193/193 [01:02<00:00,  3.06Batch/s, train_acc=0.8177033492822967]
100%|██████████████████████████████████████████████████| 49/49 [00:07<00:00,  6.22Batch/s, test_acc=0.5769230769230769]
Epoch 72/150: 100%|██████████████████████████████████| 193/193 [01:03<00:00,  3.05Batch/s, train_acc=0.756578947368421]
100%|██████████████████████████████████████████████████| 49/49 [00:07<00:00,  6.13Batch/s, test_acc=0.7307692307692307]
Epoch 73/150: 100%|█████████████████████

Epoch 103/150: 100%|█████████████████████████████████████████████| 193/193 [01:02<00:00,  3.09Batch/s, train_acc=0.775]
100%|█████████████████████████████████████████████████| 49/49 [00:08<00:00,  6.05Batch/s, test_acc=0.48717948717948717]
Epoch 104/150: 100%|█████████████████████████████████| 193/193 [01:03<00:00,  3.04Batch/s, train_acc=0.706766917293233]
100%|██████████████████████████████████████████████████| 49/49 [00:08<00:00,  6.10Batch/s, test_acc=0.7948717948717949]
Epoch 105/150: 100%|████████████████████████████████| 193/193 [01:03<00:00,  3.03Batch/s, train_acc=0.8423582995951417]
100%|██████████████████████████████████████████████████| 49/49 [00:07<00:00,  6.15Batch/s, test_acc=0.6435897435897436]
Epoch 106/150: 100%|████████████████████████████████| 193/193 [01:03<00:00,  3.04Batch/s, train_acc=0.8026315789473685]
100%|██████████████████████████████████████████████████| 49/49 [00:07<00:00,  6.14Batch/s, test_acc=0.8333333333333333]
Epoch 107/150: 100%|████████████████████

Epoch 137/150: 100%|████████████████████████████████| 193/193 [01:03<00:00,  3.06Batch/s, train_acc=0.7903508771929825]
100%|██████████████████████████████████████████████████| 49/49 [00:08<00:00,  6.01Batch/s, test_acc=0.5487179487179488]
Epoch 138/150: 100%|█████████████████████████████████| 193/193 [01:03<00:00,  3.05Batch/s, train_acc=0.666786283891547]
100%|██████████████████████████████████████████████████| 49/49 [00:08<00:00,  6.12Batch/s, test_acc=0.5564102564102564]
Epoch 139/150: 100%|████████████████████████████████| 193/193 [01:03<00:00,  3.04Batch/s, train_acc=0.8013271815903394]
100%|██████████████████████████████████████████████████| 49/49 [00:07<00:00,  6.23Batch/s, test_acc=0.6275391275391276]
Epoch 140/150: 100%|████████████████████████████████| 193/193 [01:03<00:00,  3.05Batch/s, train_acc=0.7362155388471178]
100%|██████████████████████████████████████████████████| 49/49 [00:08<00:00,  6.03Batch/s, test_acc=0.7769230769230769]
Epoch 141/150: 100%|████████████████████

In [7]:
# Save `model`, the variable left in the kernel namespace by the last iteration
# of the training loop cell above (this cell fails on a fresh kernel without it).
# NOTE(review): run_training already serializes to this same path when called
# with suffix 'sp'; presumably this was run by hand, e.g. after interrupting
# training — confirm whether the cell is still needed.
model.serialize('serialized/schema_pointer_sp.pt')