In [1]:
import torch
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
from torch.utils.data import DataLoader
import numpy as np

from tqdm import tqdm
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from utils.datasets import SchemaMatchingDataset
from models.pointer_net import PointerNet
from torch.utils.data import DataLoader

In [2]:
# Central configuration for the whole notebook; later cells read it by key.
# NOTE(review): in the visible cells 'trainsplit' is never read (the data cell
# calls dataset.split(0.8)) and 'nof_workers' is never passed to a DataLoader
# -- confirm whether these two entries are meant to be wired in.
params = dict(
    # --- data ---
    batch_size=64,
    trainsplit=0.7,
    shuffle=True,
    nof_workers=0,  # must stay at 0
    # --- training ---
    nof_epoch=1000,
    lr=0.001,
    # --- GPU ---
    gpu=True,
    # --- network ---
    input_size=300,
    embedding_size=300,
    hiddens=256,
    nof_lstms=2,
    dropout=0.3,
    bidir=True,
)

In [3]:
def collate(batch):
    """Pad a batch of variable-length (sequence, target) pairs and stack them.

    Every sequence is padded with zero rows up to the length of the longest
    sequence in the batch. The matching target tensor is padded with the
    original (unpadded) length of its sequence, i.e. the index of the first
    padding row.

    The padding takes its trailing shape, dtype and device from the sequence
    itself, so the function does not depend on the global ``params`` and works
    for any feature size.

    Args:
        batch: list of entries where ``entry[0]`` is the sequence tensor
            (first dimension = sequence length) and ``entry[1]`` is the 1-D
            target tensor of the same length.

    Returns:
        Tuple ``(x, y)``: ``x`` stacks the padded sequences along a new first
        dimension, ``y`` stacks the padded targets the same way.

    Raises:
        ValueError: if ``batch`` is empty (there is no maximum length).
    """
    sequences = [entry[0] for entry in batch]
    targets = [entry[1] for entry in batch]
    max_len = max(len(sequence) for sequence in sequences)

    padded_x, padded_y = [], []
    for sequence, true in zip(sequences, targets):
        len_padding = max_len - len(sequence)
        if len_padding > 0:
            # Zero rows shaped like the sequence's own rows (same dtype/device).
            sequence_padding = sequence.new_zeros(
                (len_padding,) + tuple(sequence.shape[1:]))
            # Padded target positions all carry the unpadded sequence length.
            target_padding = torch.full((len_padding,), len(sequence),
                                        dtype=torch.long, device=true.device)
            sequence = torch.cat([sequence, sequence_padding])
            true = torch.cat([true, target_padding])
        padded_x.append(sequence)
        padded_y.append(true)

    # Turn the lists of equally sized tensors into single batch tensors.
    x = torch.stack(padded_x, dim=0)
    y = torch.stack(padded_y, dim=0)
    return x, y

In [4]:
# Create the dataset object and populate it via its `load` method.
dataset = SchemaMatchingDataset(None, from_path=True)
dataset.load('data/training')

# NOTE(review): the ratio is hard-coded to 0.8 although params['trainsplit']
# is 0.7 -- confirm which value is intended before wiring the parameter in.
train_dataset, test_dataset = dataset.split(0.8)

# Both loaders share batch size, shuffling, worker count and the padding
# collate function. `num_workers` is now taken from params so that the
# 'nof_workers' setting actually takes effect (0 is also the DataLoader
# default, so the current behaviour is unchanged).
train_dataloader = DataLoader(train_dataset,
                        batch_size=params['batch_size'],
                        shuffle=params['shuffle'],
                        num_workers=params['nof_workers'],
                        collate_fn=collate)
test_dataloader = DataLoader(test_dataset,
                        batch_size=params['batch_size'],
                        shuffle=params['shuffle'],
                        num_workers=params['nof_workers'],
                        collate_fn=collate)

In [5]:
# Build the pointer network; the constructor arguments are taken from
# `params` in this fixed positional order.
model = PointerNet(*(params[key] for key in ('input_size',
                                             'embedding_size',
                                             'hiddens',
                                             'nof_lstms',
                                             'dropout',
                                             'bidir')))
# Alternative checkpoint, currently disabled:
#model.initialize('serialized/alphabet_pointer.pt')
# presumably restores weights from the serialized file -- TODO confirm
model.initialize('serialized/schema_pointer.pt')

# Move the model to the GPU only when requested AND available.
if params['gpu'] and torch.cuda.is_available():
    model.cuda()
    # NOTE(review): `net` is created here but the training cell calls `model`
    # directly, so the DataParallel wrapper is not used there -- confirm intent.
    net = torch.nn.DataParallel(model, device_ids=range(torch.cuda.device_count()))
    cudnn.benchmark = True

# Loss function and optimizer; only parameters that require gradients are optimized.
CCE = torch.nn.CrossEntropyLoss()
model_optim = optim.Adam(
    (weight for weight in model.parameters() if weight.requires_grad),
    lr=params['lr'],
)

In [6]:
def accuracy(p, y):
    """Return the fraction of samples whose first prediction matches the first target.

    For every (prediction, target) pair only element ``[0]`` of each is
    compared; the remaining positions are ignored.

    Args:
        p: iterable of per-sample prediction tensors (indexable, with ``len``).
        y: iterable of per-sample target tensors, aligned with ``p``.

    Returns:
        Number of matching first elements divided by ``len(p)``.

    Raises:
        ZeroDivisionError: if ``p`` is empty.
    """
    hits = sum(
        (predicted[0].cpu().numpy() == expected[0].cpu().numpy())
        for predicted, expected in zip(p, y)
    )
    return hits / len(p)

In [7]:
logs = []  # one record per epoch; used later for plotting the learning curve

# Batches must be on the same device as the model. The model-setup cell moves
# the model to the GPU only when params['gpu'] is set AND CUDA is available,
# so the same condition is applied to the batches here (checking only
# torch.cuda.is_available() breaks when params['gpu'] is False on a CUDA box).
use_cuda = params['gpu'] and torch.cuda.is_available()
lr_decay = 0.95  # multiplicative learning-rate decay applied after every epoch

for epoch in range(params['nof_epoch']):
    train_accs, test_accs = [], []

    # ---- training pass ----
    model.train()
    train_iterator = tqdm(train_dataloader, unit='Batch')
    train_iterator.set_description('Epoch %i/%i' % (epoch+1, params['nof_epoch']))
    for sample_batched in train_iterator:
        x, y = sample_batched
        train_batch = x.float()
        target_batch = y

        if use_cuda:
            train_batch = train_batch.cuda()
            target_batch = target_batch.cuda()

        o, p = model(train_batch)
        train_acc = accuracy(p, target_batch)
        train_accs.append(train_acc)

        # Flatten all leading dimensions so the loss sees one row of scores
        # per target entry.
        o = o.contiguous().view(-1, o.size()[-1])
        target_batch_pressed = target_batch.view(-1)

        loss = CCE(o, target_batch_pressed)

        model_optim.zero_grad()
        loss.backward()
        model_optim.step()

        train_iterator.set_postfix(train_acc='{}'.format(train_acc))
    # After the last batch, show the epoch average instead of the last batch value.
    train_iterator.set_postfix(train_acc='{}'.format(np.average(train_accs)))

    # ---- evaluation pass ----
    model.eval()
    test_iterator = tqdm(test_dataloader, unit='Batch')
    # No gradients are needed for evaluation; skip building the autograd graph.
    with torch.no_grad():
        for sample_batched in test_iterator:
            x, y = sample_batched
            train_batch = x.float()
            target_batch = y

            if use_cuda:
                train_batch = train_batch.cuda()
                target_batch = target_batch.cuda()

            o, p = model(train_batch)
            test_acc = accuracy(p, target_batch)
            test_accs.append(test_acc)
            test_iterator.set_postfix(test_acc='{}'.format(test_acc))
    test_iterator.set_postfix(test_acc='{}'.format(np.average(test_accs)))

    # Per-epoch averages of the batch accuracies.
    logs.append({
        'epoch': epoch + 1,
        'train_acc': np.average(train_accs),
        'test_acc': np.average(test_accs)
    })

    # each epoch, reduce the learning rate
    for group in model_optim.param_groups:
        group['lr'] *= lr_decay

  0%|                                                                         | 0/1 [00:00<?, ?Batch/s]

11


NameError: name 'trues' is not defined

In [None]:
# Debug cell: display `y`, which the training cell last assigned to the
# targets of the most recently processed batch.
y

In [None]:
# Debug cell: display `p`, the second value returned by the most recent
# `model(...)` call in the training cell (the one passed to `accuracy`).
p

In [None]:
#model.serialize('serialized/schema_pointer_transferlearning_1toN.pt')

In [None]:
# Turn the per-epoch log records into a table and store it as CSV text
# (no index column). Note that `logs` is rebound from the list to the DataFrame.
logs = pd.DataFrame(data=logs)
logs.to_csv(
    path_or_buf='logging/schema_pointer_transferlearning_1toN.txt',
    index=False,
)