In [1]:
import time
import math
import torch
import os
import matplotlib.pyplot as plt

from torch import nn
import torch.optim as O
import torch.nn.functional as F
from torchtext import data, vocab, datasets

In [2]:
class Parameters:
    """Bundle of run settings: device, embedding options, model sizes,
    optimisation settings and output locations.

    Every value is a plain instance attribute assigned in ``__init__``; other
    cells read them (and attach further attributes) through the module-level
    ``params`` instance created right after the class.
    """

    def __init__(self):
        # --- hardware: CUDA when torch reports it as available, CPU otherwise
        use_cuda = torch.cuda.is_available()
        self.device = torch.device('cuda' if use_cuda else 'cpu')

        # --- pretrained word vectors
        self.embed_size = 50
        self.word_vectors = True
        self.glove_path = '/home/ndg/users/jkurre/mnli/utils/embeddings/glove.6B.50d.txt'

        # --- network / batching sizes
        self.hidden_size = 1024
        self.batch_size = 32
        self.input_size = 76790
        self.output_size = 4
        self.n_layers = 2
        self.n_cells = 4
        self.dropout = 0.5

        # --- optimisation and output files
        self.epochs = 5
        self.learning_rate = 0.0001
        self.outpath = '/home/ndg/users/jkurre/mnli/models/bilstm_revised.pt'  # _onehot.pt
        self.outfile = '/outputs/bilstm_with_attention.txt'


# Single shared settings object used by the rest of the notebook.
params = Parameters()

In [3]:
# Text field handed to MultiNLI.splits: lower-cased, tokenised with spaCy.
inputs = data.Field(lower=True, tokenize='spacy')

# Label field: one non-sequential value per example.
answers = data.Field(sequential=False)

train, val, test = datasets.MultiNLI.splits(text_field=inputs, label_field=answers)

# The text vocabulary is built from all three splits, the label vocabulary
# from the training split only.
inputs.build_vocab(train, val, test)
answers.build_vocab(train)

# Optionally attach the pretrained vectors at params.glove_path to the text vocabulary.
if params.word_vectors:
    inputs.vocab.load_vectors(vocab.Vectors(params.glove_path, cache="."))

# Record the vocabulary sizes on the shared settings object.
params.n_embed, params.d_out = len(inputs.vocab), len(answers.vocab)

print(f"Unique tokens in inputs vocabulary: {params.n_embed}")
print(f"Unique tokens in answers vocabulary: {params.d_out}")

# One bucketing iterator per split, all with the same batch size and device.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, val, test),
    batch_size=params.batch_size,
    device=params.device,
)

### Model

In [29]:
class Attention(nn.Module):
    """Additive attention over the outputs of a bidirectional encoder.

    Each timestep ``t`` is scored as ``v . relu(W [h ; o_t])`` where ``h`` is the
    concatenation of the two hidden states passed in and ``o_t`` is the encoder
    output at step ``t``; scores are normalised over time with a softmax.

    Args:
        hidden_size: per-direction hidden size ``H`` of the encoder.
    """

    def __init__(self, hidden_size):
        super(Attention, self).__init__()

        self.hidden_size = hidden_size
        # Input features are [h ; o_t]: 2H from the hidden pair + 2H from the encoder output.
        self.attn = nn.Linear(self.hidden_size * 4, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        stdv = 1. / math.sqrt(self.v.size(0))
        self.v.data.uniform_(-stdv, stdv)

    def forward(self, hidden, encoder_outputs):
        """Compute attention weights over the encoder timesteps.

        Args:
            hidden: tensor of shape [2, B, H] (the two hidden states to attend with).
            encoder_outputs: tensor of shape [T, B, 2H].

        Returns:
            Tensor of shape [B, 1, T]; each row sums to 1 over T.
        """
        batch_size = hidden.size(1)
        # [2, B, H] -> [B, 2H]. The batch axis must be moved to the front *before*
        # flattening: reshaping [2, B, H] straight to [1, B, 2H] reinterprets memory
        # and pairs up hidden states that belong to different batch elements.
        hidden = hidden.transpose(0, 1).reshape(batch_size, -1)
        timestep = encoder_outputs.size(0)
        h = hidden.unsqueeze(1).expand(-1, timestep, -1)  # [B, T, 2H]
        encoder_outputs = encoder_outputs.transpose(0, 1)  # [B, T, 2H]
        attn_energies = self.score(h, encoder_outputs)  # [B, T]
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

    def score(self, hidden, encoder_outputs):
        """Return unnormalised attention scores of shape [B, T].

        Args:
            hidden: [B, T, 2H] hidden features repeated along time.
            encoder_outputs: [B, T, 2H] encoder outputs, batch-first.
        """
        catted = torch.cat([hidden, encoder_outputs], 2)  # [B, T, 4H]
        energy = F.relu(self.attn(catted))  # [B, T, H]
        # Dot every timestep's energy vector with v: [B, T, H] @ [H] -> [B, T].
        return energy.matmul(self.v)
    
class MultiNLIModel(nn.Module):
    """BiLSTM-with-attention classifier for premise/hypothesis pairs.

    Premise and hypothesis are embedded, encoded by the same bidirectional LSTM,
    pooled into one context vector each via ``Attention``, subtracted, and passed
    through two bias-free linear layers.

    Args:
        input_size: number of rows in the embedding table.
        output_size: number of output classes.
        embed_size: embedding dimension.
        device: stored on the instance; not otherwise used by this class.
        hidden_size: per-direction LSTM hidden size ``H``.
        batch_size: stored on the instance; kept for interface compatibility
            (the LSTM state is sized from the actual input batch).
        dropout: dropout probability for the embeddings and between LSTM layers.
        n_layers: number of stacked LSTM layers.
        n_cells: stored on the instance; kept for interface compatibility.
    """

    def __init__(self, input_size, output_size, embed_size, device,
                 hidden_size, batch_size, dropout, n_layers, n_cells):

        super(MultiNLIModel, self).__init__()

        self.device = device
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.n_cells = n_cells

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.embed = nn.Embedding(input_size, embed_size)
        self.attention = Attention(hidden_size)
        self.lstm = nn.LSTM(embed_size, hidden_size,
                            num_layers=n_layers, dropout=dropout,
                            bidirectional=True)

        self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size, bias=False)
        self.fc_output = nn.Linear(hidden_size,  output_size, bias=False)

    def encode(self, embed):
        """Encode an embedded sequence batch into one context vector per example.

        Args:
            embed: tensor of shape [T, B, embed_size] (the LSTM is sequence-first).

        Returns:
            Tensor of shape [B, 2H].
        """
        # No explicit (h0, c0): nn.LSTM then starts from zeros sized for the batch
        # it actually receives. Building the state from self.batch_size/self.n_cells
        # failed on any batch whose size differs from batch_size (e.g. the last,
        # smaller batch of an epoch) and whenever n_cells != 2 * n_layers.
        outputs, (ht, _) = self.lstm(embed)

        # Attend with the last two hidden-state slices over all encoder outputs.
        weights = self.attention(ht[-2:], outputs)  # [B, 1, T]
        context = weights.bmm(outputs.transpose(0, 1))  # [B, 1, 2H]
        return context.squeeze(1)  # [B, 2H]

    def forward(self, pair):
        """Score a batch of premise/hypothesis pairs.

        Args:
            pair: object exposing ``premise`` and ``hypothesis`` index tensors
                (assumed [T, B], sequence-first — TODO confirm against the iterator).

        Returns:
            Tensor of shape [B, output_size].
        """
        # [T, B] -> [T, B, embed_size]
        prem_embed = self.dropout(self.embed(pair.premise))
        hypo_embed = self.dropout(self.embed(pair.hypothesis))

        prem_contx = self.encode(prem_embed)
        hypo_contx = self.encode(hypo_embed)

        # [B, 2H] -> [B, H]
        pair_embed = prem_contx - hypo_contx
        pair_embed = self.relu(self.fc_hidden(pair_embed))

        # [B, H] -> [B, output_size]
        # NOTE(review): the ReLU clamps the class scores to be non-negative before
        # they reach the loss; consider returning the raw fc_output values instead.
        pair_output = self.relu(self.fc_output(pair_embed))

        return pair_output

In [30]:
# Size the embedding table and the classifier from the vocabularies built by the
# data-loading cell (params.n_embed / params.d_out) rather than the hard-coded
# params.input_size / params.output_size, so a vocabulary change cannot produce
# out-of-range embedding indices or a mismatched number of classes.
model = MultiNLIModel(params.n_embed, params.d_out, params.embed_size, params.device,
                      params.hidden_size, params.batch_size, params.dropout, params.n_layers, params.n_cells).to(params.device)

# The pretrained vectors are loaded into inputs.vocab when params.word_vectors is
# set, but the model's embedding layer starts from random weights unless they are
# copied in explicitly. Requires params.embed_size to match the vector dimension.
if params.word_vectors:
    model.embed.weight.data.copy_(inputs.vocab.vectors)

In [31]:
# https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation-batched.ipynb

def _evaluate(model, iterator, criterion):
    """Return ``(mean loss, accuracy in %)`` of ``model`` over all batches of ``iterator``.

    Switches the model to eval mode and runs without gradient tracking. The loss
    is averaged over examples, not over batches, so a smaller final batch does
    not skew it.
    """
    model.eval()
    iterator.init_epoch()
    n_hits, n_seen, loss_sum = 0, 0, 0.0
    with torch.no_grad():
        for eval_batch in iterator:
            scores = model(eval_batch)
            n_hits += (scores.argmax(dim=1).view(eval_batch.label.size()) == eval_batch.label).sum().item()
            # criterion returns the per-batch mean; re-weight by the batch size.
            loss_sum += criterion(scores, eval_batch.label).item() * eval_batch.batch_size
            n_seen += eval_batch.batch_size
    n_seen = max(n_seen, 1)  # avoid dividing by zero on an empty iterator
    return loss_sum / n_seen, 100. * n_hits / n_seen


criterion = nn.CrossEntropyLoss()
opt = O.Adam(model.parameters(), lr=params.learning_rate)

# Columns: elapsed s, epoch, iteration, batch/total, % of epoch, train loss,
# val loss, train acc, val acc.
# val_log_template needs a numeric val loss (slot 8 is `{:8.6f}`); log_template
# has plain `{}` in the two validation slots so blank padding can be passed.
val_log_template = ' '.join('{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{:8.6f},{:12.4f},{}'.split(','))
log_template =  ' '.join('{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{},{:12.4f},{}'.split(','))

VAL_EVERY = 20  # run a full validation pass every N training iterations
LOG_EVERY = 50  # print a progress line and record history every N iterations

iterations = 0
start = time.time()

# History of (train loss, running train accuracy %) tuples, one per progress line.
acc_loss = []

for epoch in range(params.epochs):
    train_iterator.init_epoch()
    n_correct, n_total = 0, 0
    for batch_idx, batch in enumerate(train_iterator):

        # switch model to training mode, clear gradient accumulators
        model.train()
        opt.zero_grad()

        iterations += 1

        # forward pass
        answer = model(batch)

        # running accuracy over the batches seen so far in this epoch
        n_correct += (answer.argmax(dim=1).view(batch.label.size()) == batch.label).sum().item()
        n_total += batch.batch_size
        train_acc = 100. * n_correct / n_total

        loss = criterion(answer, batch.label)
        loss.backward()
        opt.step()
        train_loss = loss.item()

        epoch_pct = 100. * (1 + batch_idx) / len(train_iterator)

        # evaluate performance on validation set periodically
        if iterations % VAL_EVERY == 0:
            val_loss, val_acc = _evaluate(model, valid_iterator, criterion)

            # Numeric validation columns -> val_log_template (the two templates
            # were previously swapped, which raised ValueError on the blank
            # strings passed in the progress line below).
            print(val_log_template.format(time.time()-start,
                epoch, iterations, 1+batch_idx, len(train_iterator),
                epoch_pct, train_loss, val_loss, train_acc, val_acc))

        if iterations % LOG_EVERY == 0:

            # print progress message (validation columns left blank)
            print(log_template.format(time.time()-start,
                epoch, iterations, 1+batch_idx, len(train_iterator),
                epoch_pct, train_loss, ' '*8, train_acc, ' '*12))

            # list.append takes a single argument, so store the pair as a tuple
            acc_loss.append((train_loss, train_acc))

RuntimeError: CUDA out of memory. Tried to allocate 188.00 MiB (GPU 0; 7.93 GiB total capacity; 2.24 GiB already allocated; 19.06 MiB free; 2.46 GiB reserved in total by PyTorch) (malloc at /pytorch/c10/cuda/CUDACachingAllocator.cpp:289)
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x46 (0x7f24a15e4536 in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x1cf1e (0x7f24a182df1e in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #2: <unknown function> + 0x1df9e (0x7f24a182ef9e in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #3: at::native::empty_cuda(c10::ArrayRef<long>, c10::TensorOptions const&, c10::optional<c10::MemoryFormat>) + 0x135 (0x7f24533cd9e5 in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0xf688bb (0x7f24519b98bb in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #5: <unknown function> + 0xfb21a7 (0x7f2451a031a7 in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #6: <unknown function> + 0x1073c49 (0x7f248e2d7c49 in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #7: <unknown function> + 0x1073f87 (0x7f248e2d7f87 in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #8: <unknown function> + 0xf1f1ae (0x7f24519701ae in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #9: <unknown function> + 0xf2622f (0x7f245197722f in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #10: at::native::_cudnn_rnn_backward(at::Tensor const&, c10::ArrayRef<at::Tensor>, long, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, long, long, long, bool, double, bool, bool, c10::ArrayRef<long>, at::Tensor const&, at::Tensor const&, std::array<bool, 4ul>) + 0x2c8 (0x7f245197e688 in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #11: <unknown function> + 0xfb02fd (0x7f2451a012fd in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #12: <unknown function> + 0xfb1a93 (0x7f2451a02a93 in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #13: <unknown function> + 0x2b081d0 (0x7f248fd6c1d0 in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #14: <unknown function> + 0x2b7b623 (0x7f248fddf623 in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #15: torch::autograd::generated::CudnnRnnBackward::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x708 (0x7f248fb20d18 in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #16: <unknown function> + 0x2d89705 (0x7f248ffed705 in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #17: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&) + 0x16f3 (0x7f248ffeaa03 in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #18: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&, bool) + 0x3d2 (0x7f248ffeb7e2 in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #19: torch::autograd::Engine::thread_init(int) + 0x39 (0x7f248ffe3e59 in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #20: torch::autograd::python::PythonEngine::thread_init(int) + 0x38 (0x7f24a236b5f8 in /home/ndg/users/jkurre/.local/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #21: <unknown function> + 0xbd6df (0x7f24be5426df in /usr/lib/x86_64-linux-gnu/libstdc++.so.6)
frame #22: <unknown function> + 0x76db (0x7f24c38d16db in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #23: clone + 0x3f (0x7f24c3c0aa3f in /lib/x86_64-linux-gnu/libc.so.6)


In [None]:
# Persist the trained model, then the training history collected in acc_loss.
# NOTE(review): torch.save(model, ...) pickles the whole module object, so loading
# it later needs the same class definitions importable; saving model.state_dict()
# is the more portable option if no consumer relies on the current format.
model_dir = os.path.dirname(params.outpath)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)
torch.save(model, params.outpath)

# Parameters defines `outfile`; there is no `outputs` attribute.
history_dir = os.path.dirname(params.outfile)
if history_dir:
    os.makedirs(history_dir, exist_ok=True)
with open(params.outfile, "w") as output:
    output.write(str(acc_loss))