In [1]:
import torchtext
import torch
import pandas as pd

from torch.autograd import Variable
import torch.nn.functional as F
from torchtext import data
from torch import nn
import random

import spacy
import math

import sys
sys.path.append('..')

from utils.load_data import load_mnli

### Create Dataset

In [2]:
load_mnli()

ValueError: Expected object or value

### Load Dataset

In [12]:


source = data.Field(
        tokenize = 'spacy'
        , lower = True
        , batch_first = True
        )

target = data.Field(
        sequential=False
        , use_vocab = False
        , is_target=True
        )

train_data, valid_data, test_data = data.TabularDataset.splits(
    path = 'dataset/'
    , train = 'train.csv'
    , validation = 'val.csv'
    , test = 'test.csv'
    , format = 'csv'
    , fields = {'sentence': ('text', source), 'gold_label': ('target', target)}
)

source.build_vocab(train_data, min_freq=2)
source.vocab.load_vectors(torchtext.vocab.Vectors(glove_path, cache="."))

print(source.vocab.vectors.shape)
print(f"Unique tokens in text vocabulary: {len(source.vocab)}")

KeyboardInterrupt: 

### Build Model

In [6]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        
        self.hidden_size = hidden_size
        self.attn = nn.Linear(self.hidden_size * 4, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        stdv = 1. / math.sqrt(self.v.size(0))
        self.v.data.uniform_(-stdv, stdv)

    def forward(self, hidden, encoder_outputs):
        hidden = hidden.reshape((1, hidden.shape[1], hidden.shape[2] * 2))
        timestep = encoder_outputs.size(0)
        h = hidden.repeat(timestep, 1, 1).transpose(0, 1)
        encoder_outputs = encoder_outputs.transpose(0, 1)  # [B*T*H]
        attn_energies = self.score(h, encoder_outputs)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

    def score(self, hidden, encoder_outputs):
        # [B*T*2H]->[B*T*H]
        catted = torch.cat([hidden, encoder_outputs], 2)
        energy = F.relu(self.attn(catted))
        energy = energy.transpose(1, 2)  # [B*H*T]
        v = self.v.repeat(encoder_outputs.size(0), 1).unsqueeze(1)  # [B*1*H]
        energy = torch.bmm(v, energy)  # [B*1*T]
        return energy.squeeze(1)  # [B*T]
    
class Seq2One(nn.Module):
    def __init__(self, input_size, output_size, embed_size,
                 hidden_size, n_layers, dropout):
        
        super(Seq2One, self).__init__()
        
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        self.dropout = nn.Dropout(dropout)
        self.embed = nn.Embedding(input_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size,
                            num_layers=n_layers, dropout=dropout, 
                            bidirectional=True)
        self.attention = Attention(hidden_size)
        self.fc = nn.Linear(self.hidden_size * 2, output_size)
        
    def forward(self, x):
        # x: (seq_length, N) where N is the batch size
        embedded = self.dropout(self.embed(x.transpose(0,1)))
        # embedded: (seq_length, N, embed_size)
        outputs, (hidden, cell) = self.lstm(embedded)
        
        weights = self.attention(hidden[-2:], outputs)
        
        context = weights.bmm(outputs.transpose(0, 1))
        context = context.transpose(0, 1)
        
        context = context.squeeze(0)
        output = self.fc(context)
        output = F.log_softmax(output, dim=1)
        
        return output

In [7]:
"""Hyperparameters"""
# training
num_epochs = 20
learning_rate = 0.001
batch_size = 128

# model hyperparameters
load_model = False
device = 'cpu' # torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size = len(source.vocab)
output_size = len(labels)
embed_size = 50
hidden_size = 1024 # 2014 benchmark; slightly small
num_layers = 2 # benchmark did 4
dropout = 0.5

# define iterator
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
     batch_size = batch_size,
     sort_within_batch = True,
     sort_key = lambda x: len(x.text), # minimize padding
     device = device)


model = Seq2One(input_size, output_size, embed_size, hidden_size,
                  num_layers, dropout).to(device)


pad_idx = source.vocab.stoi["<pad"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# simple cross entropy cost (might be numerically unstable if pred has 0)
# https://discuss.pytorch.org/t/cross-entropy-with-one-hot-targets/13580/6
def xentropy_cost(x_target, log_x_pred):
    """Cross Entropy for One Hot Encoded Targets"""
    assert x_target.size() == log_x_pred.size(), "size fail ! " + str(x_target.size()) + " " + str(log_x_pred.size())
    return -torch.sum(x_target * log_x_pred)

In [8]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch} of {num_epochs}")
    
    for batch_idx, batch in enumerate(train_iterator):
        input_data = batch.text.to(device)
        target_data = batch.target.to(device)
        target_data_one_hot = torch.nn.functional.one_hot(target_data - 1, len(labels))
        
        output = model(input_data)
        
        optimizer.zero_grad()
        loss = xentropy_cost(target_data_one_hot, output)
        print("XEntropy Loss:", loss)
        
        # address gradient issue
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

Epoch 0 of 20
XEntropy Loss: tensor(140.6231, grad_fn=<NegBackward>)
XEntropy Loss: tensor(139.9834, grad_fn=<NegBackward>)
XEntropy Loss: tensor(142.2542, grad_fn=<NegBackward>)
XEntropy Loss: tensor(140.1848, grad_fn=<NegBackward>)
XEntropy Loss: tensor(139.6089, grad_fn=<NegBackward>)
XEntropy Loss: tensor(144.6628, grad_fn=<NegBackward>)
XEntropy Loss: tensor(139.3262, grad_fn=<NegBackward>)
XEntropy Loss: tensor(141.3078, grad_fn=<NegBackward>)
XEntropy Loss: tensor(142.9890, grad_fn=<NegBackward>)
XEntropy Loss: tensor(144.2816, grad_fn=<NegBackward>)
XEntropy Loss: tensor(141.6668, grad_fn=<NegBackward>)
XEntropy Loss: tensor(141.3825, grad_fn=<NegBackward>)
XEntropy Loss: tensor(140.8075, grad_fn=<NegBackward>)
XEntropy Loss: tensor(143.1952, grad_fn=<NegBackward>)
XEntropy Loss: tensor(140.0742, grad_fn=<NegBackward>)
XEntropy Loss: tensor(142.4247, grad_fn=<NegBackward>)
XEntropy Loss: tensor(140.3235, grad_fn=<NegBackward>)
XEntropy Loss: tensor(140.0714, grad_fn=<NegBackwar

XEntropy Loss: tensor(140.3765, grad_fn=<NegBackward>)
XEntropy Loss: tensor(141.3184, grad_fn=<NegBackward>)
XEntropy Loss: tensor(142.3539, grad_fn=<NegBackward>)
XEntropy Loss: tensor(140.2982, grad_fn=<NegBackward>)
XEntropy Loss: tensor(141.0925, grad_fn=<NegBackward>)
XEntropy Loss: tensor(141.5884, grad_fn=<NegBackward>)
XEntropy Loss: tensor(140.5423, grad_fn=<NegBackward>)
XEntropy Loss: tensor(139.4417, grad_fn=<NegBackward>)
XEntropy Loss: tensor(140.7161, grad_fn=<NegBackward>)
XEntropy Loss: tensor(140.2836, grad_fn=<NegBackward>)
XEntropy Loss: tensor(140.8852, grad_fn=<NegBackward>)
XEntropy Loss: tensor(140.5652, grad_fn=<NegBackward>)
XEntropy Loss: tensor(140.2905, grad_fn=<NegBackward>)
XEntropy Loss: tensor(141.0388, grad_fn=<NegBackward>)
XEntropy Loss: tensor(142.6584, grad_fn=<NegBackward>)
XEntropy Loss: tensor(139.6019, grad_fn=<NegBackward>)
XEntropy Loss: tensor(141.2359, grad_fn=<NegBackward>)
XEntropy Loss: tensor(140.8884, grad_fn=<NegBackward>)
XEntropy L

XEntropy Loss: tensor(140.5264, grad_fn=<NegBackward>)
XEntropy Loss: tensor(136.2075, grad_fn=<NegBackward>)
XEntropy Loss: tensor(139.7768, grad_fn=<NegBackward>)
XEntropy Loss: tensor(138.9957, grad_fn=<NegBackward>)
XEntropy Loss: tensor(141.5524, grad_fn=<NegBackward>)
XEntropy Loss: tensor(140.0874, grad_fn=<NegBackward>)
XEntropy Loss: tensor(141.1110, grad_fn=<NegBackward>)
XEntropy Loss: tensor(139.7516, grad_fn=<NegBackward>)
XEntropy Loss: tensor(137.6921, grad_fn=<NegBackward>)
XEntropy Loss: tensor(139.0570, grad_fn=<NegBackward>)
XEntropy Loss: tensor(139.6021, grad_fn=<NegBackward>)
XEntropy Loss: tensor(140.8333, grad_fn=<NegBackward>)
XEntropy Loss: tensor(138.7322, grad_fn=<NegBackward>)
XEntropy Loss: tensor(136.3794, grad_fn=<NegBackward>)
XEntropy Loss: tensor(137.1741, grad_fn=<NegBackward>)
XEntropy Loss: tensor(137.4207, grad_fn=<NegBackward>)
XEntropy Loss: tensor(136.5648, grad_fn=<NegBackward>)
XEntropy Loss: tensor(137.2680, grad_fn=<NegBackward>)
XEntropy L

KeyboardInterrupt: 