In [0]:
# References: https://medium.com/@adam.wearne/seq2seq-with-pytorch-46dc00ff5164,
# https://github.com/bentrevett/pytorch-seq2seq/blob/master/4%20-%20Packed%20Padded%20Sequences%2C%20Masking%2C%20Inference%20and%20BLEU.ipynb
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## **Load data**

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# !ls "/content/drive/My Drive"
!cp "/content/drive/My Drive/dataset/squad_train.csv" "squad_train.csv"
!cp "/content/drive/My Drive/dataset/squad_dev.csv" "squad_dev.csv"
# !ls

In [0]:
# make test set & lexical features
dev = pd.read_csv('squad_dev.csv')
dev['LEX'] = ''
for idx, pos, ner, case in dev[['POS', 'NER', 'case']].itertuples():
  lex = [i + '_' + j + '_' + z for i, j, z in zip(pos.split(), ner.split(), case.split())]
  dev['LEX'][idx] = ' '.join(lex)
dev = dev[['context', 'question', 'BIO', 'LEX']]
val, test = train_test_split(dev, test_size=0.5)

In [0]:
val.to_csv('squad_val.csv', index=False)
test.to_csv('squad_test.csv', index=False)

In [0]:
train_file = pd.read_csv('squad_train.csv')
train_file['LEX'] = ''
for idx, pos, ner, case in train_file[['POS', 'NER', 'case']].itertuples():
  lex = [i + '_' + j + '_' + z for i, j, z in zip(pos.split(), ner.split(), case.split())]
  train_file['LEX'][idx] = ' '.join(lex)
train_file = train_file[['context', 'question', 'BIO', 'LEX']]

In [0]:
train_file.to_csv('squad_train.csv', index=False)

In [0]:
# Create Field object
tokenize = lambda x: x.split()
TEXT = data.Field(tokenize=tokenize, lower=False, include_lengths = True, 
                  init_token = '<sos>', eos_token = '<eos>')
LEX = data.Field(tokenize=tokenize, lower=False, init_token = '<sos>', eos_token = '<sos>')
BIO = data.Field(tokenize=tokenize, lower=False, init_token = '<sos>', eos_token = '<sos>')

# Specify Fields in our dataset
fields = [('context', TEXT), ('question', TEXT), ('bio', BIO), ('lex', LEX)]

# Build the dataset
train_data, valid_data, test_data = data.TabularDataset.splits(path = '',
                                                    train='squad_train.csv',
                                                    validation='squad_val.csv', 
                                                    test='squad_test.csv', 
                                                    fields = fields,
                                                    format='csv', 
                                                    skip_header=True)

In [9]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 130319
Number of validation examples: 5936
Number of testing examples: 5937


In [10]:
print(vars(train_data.examples[0]))

{'context': ['beyoncé', 'giselle', 'knowles', '-', 'carter', '(', '/biːˈjɒnseɪ/', 'bee', '-', 'yon', '-', 'say', ')', '(', 'born', 'september', '4', ',', '1981', ')', 'is', 'an', 'american', 'singer', ',', 'songwriter', ',', 'record', 'producer', 'and', 'actress', '.', 'born', 'and', 'raised', 'in', 'houston', ',', 'texas', ',', 'she', 'performed', 'in', 'various', 'singing', 'and', 'dancing', 'competitions', 'as', 'a', 'child', ',', 'and', 'rose', 'to', 'fame', 'in', 'the', 'late', '1990s', 's', 'lead', 'singer', 'of', 'r&b', 'girl', '-', 'group', 'destiny', "'s", 'child', '.', 'managed', 'by', 'her', 'father', ',', 'mathew', 'knowles', ',', 'the', 'group', 'became', 'one', 'of', 'the', 'world', "'s", 'best', '-', 'selling', 'girl', 'groups', 'of', 'all', 'time', '.', 'their', 'hiatus', 'saw', 'the', 'release', 'of', 'beyoncé', "'s", 'debut', 'album', ',', 'dangerously', 'in', 'love', '(', '2003', ')', ',', 'which', 'established', 'her', 'as', 'a', 'solo', 'artist', 'worldwide', ',', 

In [0]:
# Build vocabulary
MAX_VOCAB_SIZE = 50000
MIN_COUNT = 5
MAX_SEQUENCE_LENGTH = 20
BATCH_SIZE = 128

TEXT.build_vocab(train_data, 
                 max_size=MAX_VOCAB_SIZE,
                 min_freq=MIN_COUNT,
                 vectors='glove.6B.300d',
                 unk_init=torch.Tensor.normal_)

BIO.build_vocab(train_data)
LEX.build_vocab(train_data)

In [12]:

print(f"Unique tokens in vocabulary: {len(TEXT.vocab)}")
print('bio', len(BIO.vocab))
print('lex', len(LEX.vocab))

Unique tokens in vocabulary: 50004
bio 6
lex 452


In [13]:
LEX.vocab.stoi['<pad>']

1

In [14]:
BIO.vocab.stoi['<pad>']

1

In [0]:
# Create a set of iterators for each split

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
     batch_size = BATCH_SIZE,
     sort_within_batch = True,
     sort_key = lambda x:len(x.context),
     device = device)

## **Encoder**

In [0]:
class Encoder(nn.Module):
  
    def __init__(self, hidden_size, embedding_size,
                 embedding, answer_embedding, lexical_embedding, n_layers, dropout):
      
        super(Encoder, self).__init__()
        
        # Basic network params
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.n_layers = n_layers
        self.dropout = dropout
        
        # Embedding layer that will be shared with Decoder
        self.embedding = embedding
        self.answer_embedding = answer_embedding
        self.lexical_embedding = lexical_embedding
        
        # Bidirectional GRU
        self.gru = nn.GRU(embedding_size, hidden_size,
                          num_layers=n_layers,
                          dropout=dropout,
                          bidirectional=True)
        
    def forward(self, input_sequence, input_lengths, answer_sequence, lexical_sequence):
        
        # Convert input_sequence to word embeddings
        word_embeddings = self.embedding(input_sequence)
        answer_embeddings = self.answer_embedding(answer_sequence)
        lexical_embeddings = self.lexical_embedding(lexical_sequence)
        final_embeddings = word_embeddings+answer_embeddings+lexical_embeddings
        
        # Pack the sequence of embeddings
        packed_embeddings = nn.utils.rnn.pack_padded_sequence(final_embeddings, input_lengths)
        
        # Run the packed embeddings through the GRU, and then unpack the sequences
        outputs, hidden = self.gru(packed_embeddings)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        
        
        # The ouput of a GRU has shape (seq_len, batch, hidden_size * num_directions)
        # Because the Encoder is bidirectional, combine the results from the 
        # forward and reversed sequence by simply adding them together.
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]

        return outputs, hidden

## **Decoder**

In [0]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        
        self.hidden_size = hidden_size
        
    def dot_score(self, hidden_state, encoder_states):

        return torch.sum(hidden_state * encoder_states, dim=2)
    
    def forward(self, hidden, encoder_outputs, mask):
        attn_scores = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_scores = attn_scores.t()
        
        # Apply mask so network does not attend <pad> tokens        
        attn_scores = attn_scores.masked_fill(mask == 0, -1e10)
        
        # Return softmax over attention scores      
        return F.softmax(attn_scores, dim=1).unsqueeze(1)

In [0]:
class Decoder(nn.Module):
    def __init__(self, embedding, embedding_size,
                 hidden_size, output_size, n_layers, dropout):
        
        super(Decoder, self).__init__()
        
        # Basic network params
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.embedding = embedding
                
        self.gru = nn.GRU(embedding_size, hidden_size, n_layers, 
                          dropout=dropout)
        
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.attn = Attention(hidden_size)
        
    def forward(self, current_token, hidden_state, encoder_outputs, mask):
      
        # convert current_token to word_embedding
        embedded = self.embedding(current_token)
        
        # Pass through GRU
        rnn_output, hidden_state = self.gru(embedded, hidden_state)
        
        # Calculate attention weights
        attention_weights = self.attn(rnn_output, encoder_outputs, mask)
        
        # Calculate context vector
        context = attention_weights.bmm(encoder_outputs.transpose(0, 1))
        
        # Concatenate  context vector and GRU output
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        
        # Pass concat_output to final output layer
        output = self.out(concat_output)
        
        # Return output and final hidden state
        return output, hidden_state

## **Seq2Seq**

In [0]:
class seq2seq(nn.Module):
    def __init__(self, embedding_size, hidden_size, vocab_size, 
                 device, pad_idx, eos_idx, sos_idx, teacher_forcing_ratio=0.5):
        super(seq2seq, self).__init__()
        
        # Embedding layer shared by encoder and decoder
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.answer_embedding = nn.Embedding(6, embedding_size, padding_idx=1)
        self.lexical_embedding = nn.Embedding(452, embedding_size, padding_idx=1)
        
        # Encoder network
        self.encoder = Encoder(hidden_size, 
                               embedding_size, 
                               self.embedding,
                               self.answer_embedding,
                               self.lexical_embedding,
                               n_layers=2,
                               dropout=0.5)
        
        # Decoder network        
        self.decoder = Decoder(self.embedding,
                               embedding_size,
                               hidden_size,
                               vocab_size,
                               n_layers=2,
                               dropout=0.5)
        
        
        # Indices of special tokens and hardware device 
        self.pad_idx = pad_idx
        self.eos_idx = eos_idx
        self.sos_idx = sos_idx
        self.device = device
        
    def create_mask(self, input_sequence):

        return (input_sequence != self.pad_idx).permute(1, 0)
        
    def forward(self, input_sequence, answer_sequence, lexical_sequence, output_sequence, teacher_forcing_ratio):
      
        # Unpack input_sequence tuple
        input_tokens = input_sequence[0]
        input_lengths = input_sequence[1]
      
        # Unpack output_tokens, or create an empty tensor for text generation
        if output_sequence is None:
            inference = True
            output_tokens = torch.zeros((100, input_tokens.shape[1])).long().fill_(self.sos_idx).to(self.device)
        else:
            inference = False
            output_tokens = output_sequence[0]
        
        vocab_size = self.decoder.output_size
        
        batch_size = len(input_lengths)
        max_seq_len = len(output_tokens)
        
        #tensor to store decoder outputs
        outputs = torch.zeros(max_seq_len, batch_size, vocab_size).to(self.device)
        
        # Pass through the first half of the network
        encoder_outputs, hidden = self.encoder(input_tokens, input_lengths, answer_sequence, lexical_sequence)
        
        # Ensure dim of hidden_state can be fed into Decoder
        hidden =  hidden[:self.decoder.n_layers]
        
        #first input to the decoder is the <sos> tokens
        output = output_tokens[0,:]
        
        # Create mask
        mask = self.create_mask(input_tokens)
        
        # Step through the length of the output sequence one token at a time
        # Teacher forcing is used to assist training
        for t in range(1, max_seq_len):
            output = output.unsqueeze(0)
            
            output, hidden = self.decoder(output, hidden, encoder_outputs, mask)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (output_tokens[t] if teacher_force else top1)
            
            # If we're in inference mode, keep generating until we produce an
            # <eos> token
            if inference and output.item() == self.eos_idx:
                return outputs[:t]
        
        return outputs

In [0]:
pad_idx = TEXT.vocab.stoi['<pad>']
eos_idx = TEXT.vocab.stoi['<eos>']
sos_idx = TEXT.vocab.stoi['<sos>']

# Size of embedding_dim should match the dim of pre-trained word embeddings
embedding_dim = 300
hidden_dim = 512
vocab_size = len(TEXT.vocab)
model = seq2seq(embedding_dim,
                 hidden_dim, 
                 vocab_size, 
                 device, pad_idx, eos_idx, sos_idx).to(device)
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(embedding_dim)
model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)
model.embedding.weight.requires_grad = False
optimizer = optim.Adam([param for param in model.parameters() if param.requires_grad == True], 
                       lr=1.0e-3)
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)

In [0]:
def train(model, iterator, criterion, optimizer, clip):
    # Put the model in training mode
    model.train()
    
    epoch_loss = 0
    
    for idx, batch in tqdm(enumerate(iterator), total=len(iterator)):
        
        input_sequence = batch.context
        answer_sequence = batch.bio
        output_sequence = batch.question
        lexical_sequence = batch.lex
        
        target_tokens = output_sequence[0]
        
        # zero out the gradient for the current batch
        optimizer.zero_grad()
        
        # Run the batch through our model
        output = model(input_sequence, answer_sequence, lexical_sequence, output_sequence, 0.5)
        
        # Throw it through our loss function
        output = output[1:].view(-1, output.shape[-1])
        target_tokens = target_tokens[1:].view(-1)
        
        loss = criterion(output, target_tokens)
        
        # Perform back-prop and calculate the gradient of our loss function
        loss.backward()
          
        # Clip the gradient if necessary.          
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        # Update model parameters
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for idx, batch in tqdm(enumerate(iterator), total=len(iterator)):

            input_sequence = batch.context
            answer_sequence = batch.bio
            output_sequence = batch.question
            lexical_sequence = batch.lex
            
            target_tokens = output_sequence[0]
            
            output = model(input_sequence, answer_sequence, lexical_sequence, output_sequence, 0)
            
            output = output[1:].view(-1, output.shape[-1])
            target_tokens = target_tokens[1:].view(-1)

            loss = criterion(output, target_tokens)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [23]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    train_loss = train(model, train_iterator, criterion, optimizer, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model.pt')

    print('Epoch: ', epoch)
    print('train loss: ', train_loss)
    print('valid loss: ', valid_loss)

100%|██████████| 1019/1019 [12:05<00:00,  1.26it/s]
100%|██████████| 47/47 [00:16<00:00,  2.53it/s]
  0%|          | 0/1019 [00:00<?, ?it/s]

Epoch:  0
train loss:  5.739740393229626
valid loss:  5.896215438842773


100%|██████████| 1019/1019 [12:05<00:00,  1.36it/s]
100%|██████████| 47/47 [00:16<00:00,  2.51it/s]
  0%|          | 0/1019 [00:00<?, ?it/s]

Epoch:  1
train loss:  5.070764247755775
valid loss:  5.67579921762994


100%|██████████| 1019/1019 [12:05<00:00,  1.45it/s]
100%|██████████| 47/47 [00:15<00:00,  2.55it/s]
  0%|          | 0/1019 [00:00<?, ?it/s]

Epoch:  2
train loss:  4.685238422191646
valid loss:  5.731185486976137


100%|██████████| 1019/1019 [12:04<00:00,  1.60it/s]
100%|██████████| 47/47 [00:16<00:00,  2.56it/s]
  0%|          | 0/1019 [00:00<?, ?it/s]

Epoch:  3
train loss:  4.399792373823815
valid loss:  5.681539890613962


100%|██████████| 1019/1019 [12:05<00:00,  1.50it/s]
100%|██████████| 47/47 [00:16<00:00,  2.54it/s]
  0%|          | 0/1019 [00:00<?, ?it/s]

Epoch:  4
train loss:  4.167346282842933
valid loss:  5.657527669947198


100%|██████████| 1019/1019 [12:06<00:00,  1.57it/s]
100%|██████████| 47/47 [00:16<00:00,  2.55it/s]
  0%|          | 0/1019 [00:00<?, ?it/s]

Epoch:  5
train loss:  3.9837876007297206
valid loss:  5.726515313412281


100%|██████████| 1019/1019 [12:07<00:00,  1.38it/s]
100%|██████████| 47/47 [00:15<00:00,  2.58it/s]
  0%|          | 0/1019 [00:00<?, ?it/s]

Epoch:  6
train loss:  3.8230671206447164
valid loss:  5.77486193433721


100%|██████████| 1019/1019 [12:06<00:00,  1.46it/s]
100%|██████████| 47/47 [00:16<00:00,  2.53it/s]
  0%|          | 0/1019 [00:00<?, ?it/s]

Epoch:  7
train loss:  3.697763823433409
valid loss:  5.790201430625104


100%|██████████| 1019/1019 [12:06<00:00,  1.28it/s]
100%|██████████| 47/47 [00:16<00:00,  2.55it/s]
  0%|          | 0/1019 [00:00<?, ?it/s]

Epoch:  8
train loss:  3.6003199742048597
valid loss:  5.833341395601313


100%|██████████| 1019/1019 [12:06<00:00,  1.47it/s]
100%|██████████| 47/47 [00:16<00:00,  2.55it/s]

Epoch:  9
train loss:  3.5156156209546987
valid loss:  5.882832547451588





In [31]:
# model.load_state_dict(torch.load('model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f}')

100%|██████████| 47/47 [00:16<00:00,  2.42it/s]

Test Loss: 5.851





In [0]:
def translate_sentence(model, paragraph, answer_pos, lex_features):
    model.eval()
    
    tokenized = ['<sos>'] + paragraph + ['<eos>']
    numericalized = [TEXT.vocab.stoi[t] for t in tokenized] 

    tokenized_answer = ['<sos>'] + answer_pos + ['<eos>']
    numericalized_answer = [BIO.vocab.stoi[t] for t in tokenized_answer] 

    tokenized_lex = ['<sos>'] + lex_features + ['<eos>']
    numericalized_lex = [LEX.vocab.stoi[t] for t in tokenized_lex]
    
    paragraph_length = torch.LongTensor([len(numericalized)]).to(model.device) 
    tensor = torch.LongTensor(numericalized).unsqueeze(1).to(model.device) 
 
    answer_tensor = torch.LongTensor(numericalized_answer).unsqueeze(1).to(model.device) 
    lex_tensor = torch.LongTensor(numericalized_lex).unsqueeze(1).to(model.device)
    
    question_tensor_logits = model((tensor, paragraph_length), answer_tensor, lex_tensor, None, 0) 
    
    question_tensor = torch.argmax(question_tensor_logits.squeeze(1), 1)
    question = [TEXT.vocab.itos[t] for t in question_tensor]
 
    # Start at the first index.  We don't need to return the <sos> token
    question = question[1:]

    return question, question_tensor_logits

In [37]:
example_idx = 300

for i in range(200, example_idx+1):

  src = vars(train_data.examples[i])['context']
  trg = vars(train_data.examples[i])['question']
  ans = vars(train_data.examples[i])['bio']
  lex = vars(train_data.examples[i])['lex']

  print('src: ', ' '.join(src))
  print('trg: ', ' '.join(trg))

  question, logits = translate_sentence(model, src, ans, lex)
  print('predicted: ', " ".join(question))
  print()

src:  at the 52nd annual grammy awards , beyoncé received ten ominations , including album of the year for i am ... sasha fierce , record of the year for " halo " , and song of the year for " single ladies ( put a ring on it ) " , among others . she tied with lauryn hill for most grammy nominations in a single year by a female artist . in 2010 , beyoncé was featured on lady gaga 's single " telephone " and its music video . the song topped the us pop songs chart , becoming the sixth number - one for both beyoncé and gaga , tying them with mariah carey for most number - ones since the nielsen top 40 airplay chart launched in 1992 . " telephone " received a grammy award nomination for best pop collaboration with vocals .
trg:  how many awards was beyonce nominated for at the 52nd grammy awards ?
predicted:  how many awards did beyoncé receive in the 52nd grammy awards ?

src:  at the 52nd annual grammy awards , beyoncé received ten nominations , including album of the year for i am ... s

In [0]:
from nltk.translate.bleu_score import corpus_bleu

In [0]:
def calculate_bleu(data, model):
    
    trgs = []
    pred_trgs = []
    
    for datum in data:
        
        src = vars(datum)['context']
        trg = vars(datum)['question']
        ans = vars(datum)['bio']
        lex = vars(datum)['lex']
        
        pred_trg, _ = translate_sentence(model, src, ans, lex)
        
        #cut off <eos> token
        pred_trg = pred_trg[:-1]
        
        pred_trgs.append(pred_trg)
        # print(pred_trg)
        trgs.append(trg)
        # print(trg)
        
    return corpus_bleu(pred_trgs, trgs)

In [36]:
bleu_score = calculate_bleu(test_data, model)

print(f'BLEU score = {bleu_score*100:.2f}')

BLEU score = 2.16


Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [0]:
 for instance in list(tqdm._instances):
    tqdm._decr_instances(instance)