In [135]:
import torchtext
import torch
import random
import pandas as pd

import torch.nn.functional as F
from torch.autograd import Variable
from torchtext import data
from torch import nn

import spacy
import math

### Create Dataset

In [71]:
mnli_path = 'utils/multinli_1.0/multinli_1.0_train.jsonl'
labels = {'contradiction':1, 'entailment':2, 'neutral':3}

In [None]:
df = load_data(mnli_path)
df["sentence"] = df["sentence1"] + df["sentence2"]
df["gold_label"] = df["gold_label"].apply(lambda x: labels[x])

train, validate, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])

train.to_csv("dataset/train.csv")
validate.to_csv("dataset/val.csv")
test.to_csv("dataset/test.csv")

### Load Dataset

In [73]:
train_path = "mnli.csv"
glove_path = 'utils/glove.6B/glove.6B.50d.txt'

source = data.Field(
        tokenize = 'spacy'
        , lower = True
        , batch_first = True
        )

target = data.Field(
        sequential=False
        , use_vocab = False
        , is_target=True
        )

train_data, valid_data, test_data = data.TabularDataset.splits(
    path = 'dataset/'
    , train = 'train.csv'
    , validation = 'val.csv'
    , test = 'test.csv'
    , format = 'csv'
    , fields = {'sentence': ('text', source), 'gold_label': ('target', target)}
)

source.build_vocab(train_data, min_freq=2)
source.vocab.load_vectors(torchtext.vocab.Vectors(glove_path))

print(source.vocab.vectors.shape)
print(f"Unique tokens in text vocabulary: {len(source.vocab)}")

torch.Size([69781, 50])
Unique tokens in text vocabulary: 69781


### Seq2One Model

In [179]:
"""

Tutorials

# https://www.youtube.com/watch?v=EoGUlvhRYpk&list=PLhhyoLH6IjfxeoooqP9rhU3HJIAVAJ3Vz&index=26
# https://www.youtube.com/watch?v=sQUqQddQtB4

# https://github.com/bentrevett/pytorch-seq2seq/blob/master/6%20-%20Attention%20is%20All%20You%20Need.ipynb
# https://pytorch.org/tutorials/beginner/transformer_tutorial.html
# https://github.com/keon/seq2seq/blob/master/model.py
# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

"""

# hidden size in encoder and decoder is the same
# output size is the length of our English vocab

class Encoder(nn.Module):
    def __init__(self, input_size, embed_size,
                 hidden_size, n_layers, dropout):
        
        super(Encoder, self).__init__()
        
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        self.dropout = nn.Dropout(dropout)
        self.embed = nn.Embedding(input_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size,
                            num_layers=n_layers, dropout=dropout)
        
    def forward(self, x):
        # x: (seq_length, N) where N is the batch size
        x = x.transpose(0, 1)
        embedded = self.dropout(self.embed(x))
        # embedded: (seq_length, N, embed_size)
        encoder_outputs, (hidden, cell) = self.lstm(embedded)
        return encoder_outputs, hidden, cell
    
class Attention(nn.Module):
    
    """Taken from Keon"""
    
    def __init__(self, hidden_size):
        
        super(Attention, self).__init__()
        
        self.hidden_size = hidden_size
        self.attention = nn.Linear(self.hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        stdv = 1. / math.sqrt(self.v.size(0))
        self.v.data.uniform_(-stdv, stdv)

    def forward(self, hidden, encoder_outputs):
        timestep = encoder_outputs.size(0)
        h = hidden.repeat(timestep, 1, 1).transpose(0, 1)
        encoder_outputs = encoder_outputs.transpose(0, 1)  # [B*T*H]
        attention_energies = self.score(h, encoder_outputs)
        return F.softmax(attention_energies, dim=1).unsqueeze(1)

    def score(self, hidden, encoder_outputs):
        # [B*T*2H]->[B*T*H]
        print(hidden.shape, encoder_outputs.shape)
        energy = F.relu(self.attention(torch.cat([hidden, encoder_outputs], 2)))
        energy = energy.transpose(1, 2)  # [B*H*T]
        v = self.v.repeat(encoder_outputs.size(0), 1).unsqueeze(1)  # [B*1*H]
        energy = torch.bmm(v, energy)  # [B*1*T]
        return energy.squeeze(1)  # [B*T]
    
class Decoder(nn.Module):
    def __init__(self, input_size, embed_size,
                 hidden_size, output_size, n_layers, dropout):
        
        super(Decoder, self).__init__()
        
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        self.dropout = nn.Dropout(dropout)
        self.embed = nn.Embedding(embed_size, embed_size)
        self.attention = Attention(hidden_size)
        self.lstm = nn.LSTM(embed_size + hidden_size, hidden_size,
                            num_layers=n_layers, dropout=dropout)
        
        self.fc = nn.Linear(2*hidden_size, output_size)
        
    def forward(self, x, encoder_outputs, hidden, cell):
        # x: (N) -> (1, N) add dimension
          
        embedded = self.embed(x.unsqueeze(0))
        embedded = self.dropout(embedded)
        
        # attention weights and encoder output
        weights = self.attention(hidden[-1], encoder_outputs)
        context = weights.bmm(encoder_outputs.transpose(0, 1))
        context = context.transpose(0, 1)
        
        # combine input with context and pass to LSTM
        to_lstm = torch.cat([embedded, context], 2)
        outputs, hidden = self.lstm(to_lstm, (hidden, cell))
        
        # remove dimension
        outputs = outputs.squeeze(0) 
        context = context.squeeze(0)
        
        # get probabilities of occurence
        outputs = self.fc(torch.cat([outputs, context], 1))
        outputs = F.log_softmax(outputs, dim=1)

        return outputs, hidden, weights
        
class Seq2One(nn.Module):
    def __init__(self, encoder, decoder):
        
        super(Seq2One, self).__init__()
        
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, source, target):
        # no teacher forcing
        batch_size = target.shape[0]
        enc_outputs, hidden, cell = self.encoder(source)
        output, hidden, weights = self.decoder(target, enc_outputs, hidden, cell)
        return output
        
"""Hyperparameters"""
# training
num_epochs = 20
learning_rate = 0.001
batch_size = 128

# model hyperparameters
load_model = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size = len(source.vocab)
output_size = len(labels)
embed_size = 50
hidden_size = 1024 # 2014 benchmark; slightly small
num_layers = 2 # benchmark did 4
enc_dropout = 0.5
dec_dropout = 0.5

# define iterator
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
     batch_size = batch_size,
     sort_within_batch = True,
     sort_key = lambda x: len(x.text), # minimize padding
     device = device)

encoder = Encoder(input_size, embed_size, hidden_size,
                  n_layers, enc_dropout).to(device)

decoder = Decoder(input_size, embed_size, hidden_size,
                  output_size, n_layers, dec_dropout).to(device)

model = Seq2One(encoder, decoder).to(device)

pad_idx = source.vocab.stoi["<pad"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    print(f"Epoch {epoch} of {num_epochs}")
    
    for batch_idx, batch in enumerate(train_iterator):
        input_data = batch.text.to(device)
        target_data = batch.target.to(device)

        output = model(input_data, target_data)
        # output: (target_length, batch_size, output_dim)

        # the first output is the start token; ignore
        output = output[1:].reshape(-1, output.shape[2])
        target_data = target_data.reshape(-1)
        
        optimizer.zero_grad()
        loss = criterion(output, target)
        
        # address gradient issue
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

Epoch 0 of 20
torch.Size([128, 21, 1024]) torch.Size([128, 21, 1024])


IndexError: tuple index out of range

### biLSTM + Attention

Next Steps: 
    - Bag of words model, remove duplicate words 