In [32]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

In [16]:
# Load the spaCy pipelines by their full package names. The short 'de' / 'en'
# shortcut links were removed in spaCy v3, while the full names resolve on both
# v2 and v3 as long as the packages were installed via `python -m spacy download`.
spacy_ger = spacy.load('de_core_news_sm')
spacy_eng = spacy.load('en_core_web_sm')

def tokenizer_ger(text):
    """Split ``text`` into a list of token strings using the German spaCy tokenizer."""
    doc = spacy_ger.tokenizer(text)
    return [tok.text for tok in doc]

def tokenizer_eng(text):
    """Split ``text`` into a list of token strings using the English spaCy tokenizer."""
    doc = spacy_eng.tokenizer(text)
    return [tok.text for tok in doc]

# One Field per language: each uses its own tokenizer, lowercases the text and
# is configured with '<sos>' / '<eos>' as the start/end-of-sentence tokens.
german = Field(
    tokenize=tokenizer_ger,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)
english = Field(
    tokenize=tokenizer_eng,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)

# German ('.de') is the source side and English ('.en') the target side.
train_data, validation_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(german, english),
)

# Vocabularies are built from the training split only.
# NOTE(review): presumably max_size caps the vocabulary and min_freq drops rare
# tokens -- confirm against the torchtext Field.build_vocab documentation.
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)

In [17]:
class Encoder(nn.Module):
    """LSTM encoder that reduces a source sequence to its final hidden/cell states.

    Args:
        input_size: Number of rows in the embedding table (source vocabulary size).
        embedding_size: Width of each token embedding.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout_layer: Dropout probability, applied to the embeddings and
            passed to the LSTM as its ``dropout`` argument.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout_layer):
        super().__init__()
        self.hidden_size, self.num_layers = hidden_size, num_layers

        self.dropout = nn.Dropout(p=dropout_layer)
        self.embedding = nn.Embedding(
            num_embeddings=input_size,
            embedding_dim=embedding_size,
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout_layer,
        )

    def forward(self, x):
        """Encode a batch of token indices.

        Args:
            x: Token indices of shape (seq_length, N).

        Returns:
            The ``(hidden, cell)`` pair produced by the LSTM after the last
            time step; the per-step outputs are discarded.
        """
        # (seq_length, N) -> (seq_length, N, embedding_size)
        embedded = self.dropout(self.embedding(x))

        _, (hidden, cell) = self.rnn(embedded)
        return hidden, cell
        

In [28]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that scores the next token.

    Args:
        input_size: Number of rows in the embedding table.
        embedding_size: Width of each token embedding.
        hidden_size: Width of the LSTM hidden state.
        output_size: Number of scores produced per example by the final
            linear layer.
        num_layers: Number of stacked LSTM layers.
        dropout_layer: Dropout probability, applied to the embeddings and
            passed to the LSTM as its ``dropout`` argument.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size,
                 num_layers, dropout_layer):
        super().__init__()
        self.hidden_size, self.num_layers = hidden_size, num_layers

        self.dropout = nn.Dropout(p=dropout_layer)
        self.embedding = nn.Embedding(
            num_embeddings=input_size,
            embedding_dim=embedding_size,
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout_layer,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden, cell):
        """Advance the decoder by one time step.

        Args:
            x: Token indices for the current step, shape (N,).
            hidden: LSTM hidden state carried over from the previous step
                (or from the encoder).
            cell: LSTM cell state carried over alongside ``hidden``.

        Returns:
            ``(predictions, hidden, cell)`` where ``predictions`` has shape
            (N, output_size) and the states are the updated LSTM states.
        """
        # The decoder consumes one token per call, so add a length-1 sequence
        # axis in front: (N,) -> (1, N).
        step_input = x.unsqueeze(0)

        # (1, N) -> (1, N, embedding_size)
        embedded = self.dropout(self.embedding(step_input))

        # rnn_out: (1, N, hidden_size)
        rnn_out, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        # (1, N, hidden_size) -> (1, N, output_size), then drop the length-1
        # sequence axis to get (N, output_size).
        scores = self.fc(rnn_out)
        predictions = scores.squeeze(0)

        return predictions, hidden, cell

In [29]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step.

    Args:
        encoder: Module mapping a source batch to ``(hidden, cell)`` states.
        decoder: Module called as ``decoder(x, hidden, cell)`` and returning
            ``(predictions, hidden, cell)``. It must expose its final linear
            layer as ``fc`` so the output vocabulary size can be read from
            ``fc.out_features``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Run the encoder once, then unroll the decoder over the target length.

        Args:
            source: Source token indices, shape (source_len, N).
            target: Target token indices, shape (target_len, N); row 0 is
                used as the first decoder input (the start token).
            teacher_force_ratio: Probability, per time step, of feeding the
                ground-truth target token to the next step instead of the
                decoder's own best guess.

        Returns:
            Tensor of shape (target_len, N, vocab_size) holding the decoder
            scores for steps 1..target_len-1. Row 0 is left as zeros because
            no prediction is made for the start token.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]

        # Read the vocabulary size from the decoder itself and allocate on the
        # input's device, so the model does not depend on notebook-level
        # globals (`english`, `device`) having been defined beforehand.
        target_vocab_size = self.decoder.fc.out_features
        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )

        hidden, cell = self.encoder(source)

        # Grab the start token.
        x = target[0]

        # Teacher forcing: each decoder step normally receives the previous
        # step's prediction, so one wrong word corrupts every following input.
        # To counter that, the ground-truth token is fed instead with
        # probability `teacher_force_ratio`. Feeding it every time would hand
        # the model all the answers during training.
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output

            best_guess = output.argmax(1)
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

In [36]:
# Training hyperparameters
num_epochs = 1
learning_rate = 0.001
batch_size = 64

# Seed the RNGs used by weight initialisation (torch) and by the
# teacher-forcing coin flip (random) so that runs are repeatable.
seed = 42
random.seed(seed)
torch.manual_seed(seed)

# Model hyperparameters
load_model = False
# Use the GPU when one is available; falls back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
# The decoder scores every entry of the English vocabulary.
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
# hidden_size and num_layers are shared by the encoder and the decoder so the
# encoder's final states can be passed directly to the decoder's LSTM.
hidden_size = 1024
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# Tensorboard: event files go to runs/loss_plot; `step` is the global
# x-axis counter advanced once per logged training batch.
writer = SummaryWriter('runs/loss_plot')
step = 0

In [43]:
# sort_within_batch together with sort_key TRIES to group source sentences of
# similar length into the same batch, so less compute is wasted on padding.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda example: len(example.src),
    device=device,
)

encoder_net = Encoder(
    input_size=input_size_encoder,
    embedding_size=encoder_embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout_layer=enc_dropout,
).to(device)

decoder_net = Decoder(
    input_size=input_size_decoder,
    embedding_size=decoder_embedding_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
    dropout_layer=dec_dropout,
).to(device)

model = Seq2Seq(encoder=encoder_net, decoder=decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Positions holding the padding token are excluded from the loss.
pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

for epoch in range(num_epochs):
    print(f'Epoch [{epoch} / {num_epochs}]')

    # Make sure dropout is active, even if an earlier cell switched the
    # model to eval mode.
    model.train()

    # Wrap the iterator itself (not the enumerate object) so tqdm can read
    # its length and show a proper progress bar with a total.
    for batch_idx, batch in enumerate(tqdm(train_iterator)):
        input_data = batch.src.to(device)
        target = batch.trg.to(device)

        output = model(input_data, target)
        # output shape: (target_len, batch_size, output_dim)

        # Drop step 0 (the start token, for which nothing is predicted) and
        # flatten time and batch so the loss sees (tokens, output_dim)
        # scores against (tokens,) labels.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        loss.backward()

        # Clip the global gradient norm to 1 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        # Log a plain float so the writer does not hold on to the loss tensor.
        writer.add_scalar('Training loss', loss.item(), global_step=step)
        step += 1

# Push buffered events to disk so the curve is visible in TensorBoard
# without having to close the writer.
writer.flush()
    

Epoch [0 / 1]


TypeError: 'BucketIterator' object is not subscriptable