For a very long time, I've been fascinated by sequence-to-sequence models. Give the model a photo as input, and it spits out a caption to go along with it; give it some English text, and it can translate it into another language. Seq2seq models are not only widely applicable in different contexts, but they are also the basis of more advanced models that came after, most notably attention-based models.

In [25]:
import random
import time

import torch
import torchtext
from torch import nn
from torchtext.data import BucketIterator, Field
from torchtext.datasets import Multi30k

In [26]:
# Source-side (German) field: spaCy tokenizer, lowercased text, with
# "<sos>" / "<eos>" configured as the init and end-of-sequence tokens.
SRC = Field(
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
    tokenize="spacy",
    tokenizer_language="de",
)

# Target-side (English) field: same configuration, different tokenizer language.
TRG = Field(
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
    tokenize="spacy",
    tokenizer_language="en",
)

In [3]:
# Load the three Multi30k splits from ./data, pairing the ".de" files with the
# SRC field and the ".en" files with the TRG field.
train_data, validation_data, test_data = Multi30k.splits(
    root="data", exts=(".de", ".en"), fields=(SRC, TRG)
)

In [28]:
# Print the number of examples in the train, validation and test splits.
for data in (train_data, validation_data, test_data):
    print(len(data))

29000
1014
1000


In [29]:
# Dump the first training example's attributes (its tokenized src/trg lists).
print(vars(train_data.examples[0]))

{'src': ['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}


In [30]:
# Build both vocabularies from the training split only, so no token statistics
# leak in from validation/test. min_freq=2 and max_size=10000 bound the
# vocabulary by token frequency and by total size respectively.
SRC.build_vocab(train_data, max_size=10000, min_freq=2)
TRG.build_vocab(train_data, max_size=10000, min_freq=2)

In [31]:
# Show the special tokens configured on the target field.
print(TRG.init_token, TRG.eos_token, TRG.pad_token)

<sos> <eos> <pad>


In [32]:
# Report the size of each vocabulary built above.
print(f"Unique tokens in source (ger) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (eng) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (ger) vocabulary: 7854
Unique tokens in target (eng) vocabulary: 5893


In [33]:
# Look up the integer index the source vocabulary assigned to "<sos>".
SRC.vocab.stoi["<sos>"]

2

In [34]:
# Number of sentence pairs per batch.
BATCH_SIZE = 128
# Prefer the GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, all sharing the same batch size and target device.
train_iterator, validation_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data), batch_size=BATCH_SIZE, device=device
)

In [35]:
# Peek at a single training batch and stop: print the first row of the source
# tensor (index 0 along dim 0) and the shape of the second row.
for batch in train_iterator:
    print(batch.src[0])
    print(batch.src[1].shape)
    break

tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2])
torch.Size([128])


In [12]:
class Encoder(nn.Module):
    """Embed source token indices and summarize them with a stacked LSTM.

    Only the LSTM's final hidden and cell states are returned; the per-step
    outputs are discarded.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers, dropout):
        super().__init__()
        # The same dropout probability is used on the embeddings and
        # between stacked LSTM layers.
        self.dropout = nn.Dropout(p=dropout)
        self.embed = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embed_dim
        )
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
        )

    def forward(self, x):
        """Encode ``x`` of shape (src_seq_len, batch_size).

        Returns the tuple ``(hidden, cell)``, each of shape
        (num_layers, batch_size, hidden_size).
        """
        embedded = self.dropout(self.embed(x))
        # embedded: (src_seq_len, batch_size, embed_dim)
        _, final_state = self.lstm(embedded)
        hidden, cell = final_state
        return hidden, cell

In [13]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that maps one token per sequence to logits.

    Each call consumes one time step of token indices together with the
    previous recurrent state and produces unnormalized scores over the
    vocabulary plus the updated state.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers, dropout):
        super().__init__()
        # Exposed so that callers can size their output buffers.
        self.vocab_size = vocab_size
        self.dropout = nn.Dropout(p=dropout)
        self.embed = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embed_dim
        )
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x, hidden, cell):
        """Decode one step.

        ``x`` has shape (batch_size,); ``hidden`` and ``cell`` are the LSTM
        state from the previous step. Returns ``(predictions, hidden, cell)``
        where ``predictions`` has shape (batch_size, vocab_size).
        """
        # Add a length-1 time axis: (batch_size,) -> (1, batch_size).
        step_tokens = x.unsqueeze(0)
        embedded = self.dropout(self.embed(step_tokens))
        # embedded: (1, batch_size, embed_dim)
        step_output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        # Drop the time axis again before projecting onto the vocabulary.
        predictions = self.fc(step_output.squeeze(0))
        return predictions, hidden, cell

In [14]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    The encoder's final ``(hidden, cell)`` state initializes the decoder,
    which is then unrolled over the target length with optional teacher
    forcing.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Run the full encode/decode pass.

        Args:
            source: source token indices, passed unchanged to the encoder.
            target: target token indices of shape (trg_seq_len, batch_size);
                row 0 is used as the first decoder input.
            teacher_force_ratio: probability, per time step, of feeding the
                ground-truth token as the next decoder input. ``0`` means the
                decoder always consumes its own previous prediction; ``1``
                means it always consumes the ground truth.

        Returns:
            Tensor of shape (trg_seq_len, batch_size, vocab_size) holding the
            decoder scores. Row 0 is never written and stays all zeros.
        """
        seq_len = target.size(0)
        batch_size = target.size(1)
        outputs = torch.zeros(
            seq_len, batch_size, self.decoder.vocab_size, device=self.device
        )

        hidden, cell = self.encoder(source)
        # First decoder input: the first row of the target, shape (batch_size,).
        x = target[0]

        for t in range(1, seq_len):
            predictions, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = predictions
            teacher_force = random.random() < teacher_force_ratio
            # Teacher forcing feeds the ground truth; otherwise the model
            # continues from its own greedy prediction.
            x = target[t] if teacher_force else predictions.argmax(1)

        return outputs

In [15]:
# Vocabulary sizes determine the embedding tables and the output projection.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)

# Embedding widths for the encoder and the decoder.
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256

# Shared by encoder and decoder: the decoder is seeded with the encoder's
# (hidden, cell) state, so both LSTMs need the same width and depth.
HID_DIM = 512
N_LAYERS = 2

ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

encoder = Encoder(
    vocab_size=INPUT_DIM,
    embed_dim=ENC_EMB_DIM,
    hidden_size=HID_DIM,
    num_layers=N_LAYERS,
    dropout=ENC_DROPOUT,
)
decoder = Decoder(
    vocab_size=OUTPUT_DIM,
    embed_dim=DEC_EMB_DIM,
    hidden_size=HID_DIM,
    num_layers=N_LAYERS,
    dropout=DEC_DROPOUT,
)
model = Seq2Seq(encoder=encoder, decoder=decoder, device=device).to(device)

In [16]:
def init_weights(model):
    """Overwrite every parameter of ``model`` in place with U(-0.08, 0.08) draws."""
    bound = 0.08
    for param in model.parameters():
        nn.init.uniform_(param.data, -bound, bound)

# Module.apply calls init_weights on the model and on each of its submodules.
# Because init_weights already walks all nested parameters of whatever module
# it receives, parameters are re-drawn more than once; the final values are
# still uniform in [-0.08, 0.08].
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embed): Embedding(7854, 256)
    (lstm): LSTM(256, 512, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embed): Embedding(5893, 256)
    (lstm): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=512, out_features=5893, bias=True)
  )
)

In [17]:
# Total number of trainable scalar parameters in the model.
sum(p.numel() for p in model.parameters() if p.requires_grad)

13898757

In [18]:
# Index of the padding token in the target vocabulary.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# Padding positions are excluded from the loss via ignore_index.
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)
# Adam with its default hyperparameters.
optimizer = torch.optim.Adam(model.parameters())

In [19]:
def train(model, iterator, optimizer, criterion, clip):
    """Train ``model`` for one pass over ``iterator`` and return the mean loss.

    Args:
        model: callable as ``model(source, target)`` returning scores of shape
            (trg_seq_len, batch_size, vocab_size).
        iterator: iterable of batches exposing ``.src`` and ``.trg`` tensors.
        optimizer: optimizer over ``model``'s parameters.
        criterion: loss taking (N, vocab_size) scores and (N,) targets.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        The per-batch loss averaged over all batches seen.

    Raises:
        ValueError: if ``iterator`` yields no batches.
    """
    model.train()
    epoch_loss = 0
    num_batches = 0

    for batch in iterator:
        source = batch.src
        target = batch.trg
        # source/target: (seq_len, batch_size)

        optimizer.zero_grad()
        output = model(source, target)
        # output: (trg_seq_len, batch_size, vocab_size)

        # Skip position 0 (the first target row is only ever a decoder input)
        # and flatten time and batch so the criterion sees 2-D scores.
        output = output[1:].reshape(-1, output.size(2))
        target = target[1:].reshape(-1)

        loss = criterion(output, target)
        loss.backward()
        # Clip the global gradient norm before the update to limit
        # exploding gradients.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("train() received an iterator that yielded no batches")
    return epoch_loss / num_batches

In [36]:
# Maximum gradient norm handed to train().
CLIP = 1
N_EPOCHS = 10
# Lowest validation loss seen so far; used to decide when to checkpoint.
best_loss = float('inf')

# NOTE: this loop calls evaluate(), so the cell defining evaluate() must have
# been executed before this one.
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    validation_loss = evaluate(model, validation_iterator, criterion)
    end_time = time.time()
    total_time = end_time - start_time

    # Keep only the weights of the best-performing epoch on validation data.
    # torch.save does not create missing directories: ./data/seq2seq must exist.
    if validation_loss < best_loss:
        best_loss = validation_loss
        torch.save(model.state_dict(), './data/seq2seq/weights.pt')

    print(
        f"Epoch [{epoch+1}/{N_EPOCHS}], Time: {total_time}s, "
        f"Train Loss: {train_loss:.3f}, Val. Loss: {validation_loss:.3f}"
    )

In [37]:
def evaluate(model, iterator, criterion):
    """Return the mean per-batch loss of ``model`` over ``iterator``.

    The model is switched to eval mode and run without gradient tracking,
    and is always called with ``teacher_force_ratio=0``. On exit the model is
    put back into whichever mode (train or eval) it was in on entry.

    Args:
        model: callable as ``model(source, target, teacher_force_ratio=...)``
            returning scores of shape (trg_seq_len, batch_size, vocab_size).
        iterator: iterable of batches exposing ``.src`` and ``.trg`` tensors.
        criterion: loss taking (N, vocab_size) scores and (N,) targets.

    Raises:
        ValueError: if ``iterator`` yields no batches.
    """
    was_training = model.training
    model.eval()
    epoch_loss = 0
    num_batches = 0
    try:
        with torch.no_grad():
            for batch in iterator:
                source = batch.src
                target = batch.trg
                output = model(source, target, teacher_force_ratio=0)
                # Skip position 0 and flatten time and batch, mirroring the
                # layout the criterion expects: (N, vocab_size) vs (N,).
                target = target[1:].reshape(-1)
                output = output[1:].reshape(-1, output.size(2))
                loss = criterion(output, target)
                epoch_loss += loss.item()
                num_batches += 1
    finally:
        # Restore the caller's mode instead of unconditionally forcing train.
        model.train(was_training)

    if num_batches == 0:
        raise ValueError("evaluate() received an iterator that yielded no batches")
    return epoch_loss / num_batches

https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb

https://pytorch.org/tutorials/beginner/torchtext_translation_tutorial.html

https://github.com/SethHWeidman/pytorch-seq2seq/blob/master/3%20-%20Neural%20Machine%20Translation%20by%20Jointly%20Learning%20to%20Align%20and%20Translate.ipynb

https://www.youtube.com/watch?v=EoGUlvhRYpk&list=PLhhyoLH6IjfxeoooqP9rhU3HJIAVAJ3Vz&index=33