In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import spacy
import numpy as np

import random
import math
import time

### Create a dataset object

In [1]:
from datasets import load_dataset

dataset = load_dataset("gigaword")
train_set = dataset['train']
val_set = dataset['validation']
test_set = dataset['test']

print(len(train_set), len(val_set), len(test_set))

Found cached dataset gigaword (/root/.cache/huggingface/datasets/gigaword/default/1.2.0/ea83a8b819190acac5f2dae011fad51dccf269a0604ec5dd24795b64efb424b6)


  0%|          | 0/3 [00:00<?, ?it/s]

### Build the data processing pipeline

In [58]:
from transformers import BertTokenizer
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.vocab import vocab
import numpy as np
from collections import Counter


# tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
tokenizer = get_tokenizer('basic_english')

def counter_tokens(dataset, col_name):
    counter = Counter()
    # set_name = ['train', 'validation', 'test']
    # for set in set_name:
    #     data_iter = dataset[set]
    for data in dataset:
        col = data[col_name]
        counter.update(tokenizer(col))
    return counter

# vocab = build_vocab_from_iterator(yield_tokens(dataset), specials=["<unk>"])
SRC_vocab = vocab(counter_tokens(train_set, 'document'), min_freq = 2, specials=('<unk>', '<BOS>', '<EOS>', '<PAD>'))
TRG_vocab = vocab(counter_tokens(train_set, 'summary'), min_freq = 2, specials=('<unk>', '<BOS>', '<EOS>', '<PAD>'))

In [59]:
print("The length of the 'document' vocab is", len(SRC_vocab))
print("The length of the 'summary' vocab is", len(TRG_vocab))
new_stoi = SRC_vocab.get_stoi()
print("The index of '<PAD>' is", new_stoi['<PAD>'])
new_itos = SRC_vocab.get_itos()
print("The token at index 1 is", new_itos[1])

The length of the 'document' vocab is 118516
The length of the 'summary' vocab is 68401
The index of '<PAD>' is 3
The token at index 1 is <BOS>


In [36]:
text_transform = lambda x: [vocab['<BOS>']] + [vocab[token] for token in tokenizer(x)] + [vocab['<EOS>']]
# trg_transform = lambda x: 1 if x == 'pos' else 0

# Print out the output of text_transform
print("input to the text_transform:", "here is an example")
print("output of the text_transform:", text_transform("here is an example"))

input to the text_transform: here is an example
output of the text_transform: [0, 334, 204, 283, 8185, 1]


### Generate batch iterator

In [51]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

train_list = list(train_set)

def collate_batch(batch):
   target_list, src_list = [], []
   for data in batch:
      _summary, _document = data['summary'], data['document']
      target_text = torch.tensor(text_transform(_summary))
      src_text = torch.tensor(text_transform(_document))
      target_list.append(target_text)
      src_list.append(src_text)
   return pad_sequence(target_list, padding_value=3), pad_sequence(src_list, padding_value=3)

# train_dataloader = DataLoader(train_list, batch_size=8, shuffle=True, 
#                               collate_fn=collate_batch)

In [None]:
import random

batch_size = 8  # A batch size of 8

def batch_sampler():
    indices = [(i, len(tokenizer(s['document']))) for i, s in enumerate(train_list)]
    random.shuffle(indices)
    pooled_indices = []
    # create pool of indices with similar lengths 
    for i in range(0, len(indices), batch_size * 100):
        pooled_indices.extend(sorted(indices[i:i + batch_size * 100], key=lambda x: x[1]))

    pooled_indices = [x[0] for x in pooled_indices]

    # yield indices for current batch
    for i in range(0, len(pooled_indices), batch_size):
        yield pooled_indices[i:i + batch_size]

bucket_dataloader = DataLoader(train_list, batch_sampler=batch_sampler(),
                               collate_fn=collate_batch)

### Building the Seq2Seq Model

In [54]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, num_layers=3, bidirectional = True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        #src = [src len, batch size]
        
        embedded = self.dropout(self.embedding(src))
        #embedded = [src len, batch size, emb dim]
        
        outputs, hidden = self.rnn(embedded)
        #outputs = [src len, batch size, hid dim * num directions]
        #hidden = [n layers * num directions, batch size, hid dim]
        
        #hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
        #outputs are always from the last layer
        
        #hidden [-2, :, : ] is the last of the forwards RNN 
        #hidden [-1, :, : ] is the last of the backwards RNN
        
        #initial decoder hidden is final hidden state of the forwards and backwards 
        #  encoder RNNs fed through a linear layer
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        
        #outputs = [src len, batch size, enc hid dim * 2]
        #hidden = [batch size, dec hid dim]
        
        return outputs, hidden

In [55]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)
        
    def forward(self, hidden, encoder_outputs):
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]
        
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        
        #repeat decoder hidden state src_len times
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #hidden = [batch size, src len, dec hid dim]
        #encoder_outputs = [batch size, src len, enc hid dim * 2]
        
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) 
        
        #energy = [batch size, src len, dec hid dim]

        attention = self.v(energy).squeeze(2)
        
        #attention= [batch size, src len]
        
        return F.softmax(attention, dim=1)

In [56]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim, num_layers=3, bidirectional = True)
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, encoder_outputs):
             
        #input = [batch size]
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
        
        a = self.attention(hidden, encoder_outputs)
                
        #a = [batch size, src len]
        
        a = a.unsqueeze(1)
        
        #a = [batch size, 1, src len]
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #encoder_outputs = [batch size, src len, enc hid dim * 2]
        
        weighted = torch.bmm(a, encoder_outputs)
        
        #weighted = [batch size, 1, enc hid dim * 2]
        
        weighted = weighted.permute(1, 0, 2)
        
        #weighted = [1, batch size, enc hid dim * 2]
        
        rnn_input = torch.cat((embedded, weighted), dim = 2)
        
        #rnn_input = [1, batch size, (enc hid dim * 2) + emb dim]
            
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        
        #output = [seq len, batch size, dec hid dim * n directions]
        #hidden = [n layers * n directions, batch size, dec hid dim]
        
        #seq len, n layers and n directions will always be 1 in this decoder, therefore:
        #output = [1, batch size, dec hid dim]
        #hidden = [1, batch size, dec hid dim]
        #this also means that output == hidden
        assert (output == hidden).all()
        
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden.squeeze(0)

In [57]:

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.0):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
        
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #encoder_outputs is all hidden states of the input sequence, back and forwards
        #hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src)
                
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden state and all encoder hidden states
            #receive output tensor (predictions) and new hidden state
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1

        return outputs

In [None]:
# Define hyperparameters
input_vocab_size = len(vocab)
target_vocab_size = 8000
embedding_dim = 256
hidden_size = 512

# Create encoder and decoder instances
encoder = Encoder(input_vocab_size, embedding_dim, hidden_size)
decoder = Decoder(target_vocab_size, embedding_dim, hidden_size)

# Create the Seq2Seq model
model = Seq2Seq(encoder, decoder)

In [None]:
import torch
import torch.nn as nn


class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.hidden_size = hidden_size
        self.W = nn.Linear(hidden_size, hidden_size, bias=False)
        self.V = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, decoder_hidden, encoder_outputs):
        decoder_hidden = decoder_hidden.unsqueeze(1)
        energy = torch.tanh(self.W(decoder_hidden + encoder_outputs))
        attention_scores = self.V(energy).squeeze(2)
        attention_weights = torch.softmax(attention_scores, dim=1)
        context_vector = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs)
        return context_vector, attention_weights

class Encoder(nn.Module):
    def __init__(self, input_vocab_size, embedding_dim, hidden_size):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, num_layers=3, bidirectional=True)
        # self.h0 = nn.Parameter(torch.zeros(6, 1, hidden_size))

    def forward(self, input_seq):
        embedded = self.embedding(input_seq)
        outputs, (hidden, cell) = self.gru(embedded)
        return outputs, hidden, cell

class Decoder(nn.Module):
    def __init__(self, target_vocab_size, embedding_dim, hidden_size):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(target_vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim + 2 * hidden_size, hidden_size, num_layers=3, bidirectional=True)
        self.attention = BahdanauAttention(hidden_size)
        self.fc = nn.Linear(hidden_size, target_vocab_size)

    def forward(self, input_seq, hidden, cell, encoder_outputs):
        embedded = self.embedding(input_seq)
        context_vector, attention_weights = self.attention(hidden[-1], encoder_outputs)
        gru_input = torch.cat((embedded, context_vector), dim=2)
        outputs, (hidden, cell) = self.gru(gru_input, (hidden, cell))
        predictions = self.fc(outputs.squeeze(0))
        return predictions, hidden, cell, attention_weights

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, target_seq):
        encoder_outputs, hidden, cell = self.encoder(input_seq)
        decoder_outputs, _, _, _ = self.decoder(target_seq, hidden, cell, encoder_outputs)
        return decoder_outputs

# Define hyperparameters
input_vocab_size = 10000
target_vocab_size = 8000
embedding_dim = 256
hidden_size = 512

# Create encoder and decoder instances
encoder = Encoder(input_vocab_size, embedding_dim, hidden_size)
decoder = Decoder(target_vocab_size, embedding_dim, hidden_size)

# Create the Seq2Seq model
model = Seq2Seq(encoder, decoder)

In [None]:
class GigawordDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataset
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        data = self.data[index]
        text = data['document']
        summary = data['summary']

        inputs = self.tokenizer.encode_plus(
            text,
            summary,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'summary_ids': inputs['labels'].flatten()
        }
    
train_dataset = GigawordDataset(train_set, tokenizer, 512)