# Imports

In [2]:
import torch
import torch.nn as nn
from torchvision import transforms
from matplotlib import pyplot as plt
import torchvision
from torch.nn import functional as F
import re
import collections
import math
from torch.utils.data import DataLoader

# LSTM

## Helper Functions

In [3]:

def preprocess(text):
    return re.sub('[^A-Za-z]+', ' ', text).lower()

text = ""
full_text = ""
with open('The Time Machine - Sample.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()
    full_text = preprocess(raw_text)

In [4]:
class Vocab:
    """Vocabulary for text."""
    def __init__(self, tokens=[], min_freq=0, reserved_tokens=[]):
        """Defined in :numref:`sec_text-sequence`"""
        # Flatten a 2D list if needed
        if tokens and isinstance(tokens[0], list):
            tokens = [token for line in tokens for token in line]
        # Count token frequencies
        counter = collections.Counter(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                  reverse=True)
        # The list of unique tokens
        self.idx_to_token = list(sorted(set(['<unk>'] + reserved_tokens + [
            token for token, freq in self.token_freqs if freq >= min_freq])))
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if hasattr(indices, '__len__') and len(indices) > 1:
            return [self.idx_to_token[int(index)] for index in indices]
        return self.idx_to_token[indices]

    @property
    def unk(self):  # Index for the unknown token
        return self.token_to_idx['<unk>']
    
def tokenize(text):
    return list(text)

def build(raw_text, vocab=None):
    tokens = tokenize(preprocess(raw_text))
    if vocab is None: vocab = Vocab(tokens)
    corpus = [vocab[token] for token in tokens]
    return corpus, vocab


In [5]:
class LanguageData(nn.Module):
    def _download(self):
        fname = "The Time Machine - Sample.txt"
        with open(fname, encoding='utf-8') as f:
            return f.read()

    def _preprocess(self, text):
        """Defined in :numref:`sec_text-sequence`"""
        return re.sub('[^A-Za-z]+', ' ', text).lower()

    def _tokenize(self, text):
        """Defined in :numref:`sec_text-sequence`"""
        return list(text)

    def build(self, raw_text, vocab=None):
        """Defined in :numref:`sec_text-sequence`"""
        tokens = self._tokenize(self._preprocess(raw_text))
        if vocab is None: vocab = Vocab(tokens)
        corpus = [vocab[token] for token in tokens]
        return corpus, vocab

    def __init__(self, batch_size, num_steps, num_train=10000, num_val=5000):
        """Defined in :numref:`sec_language-model`"""
        super().__init__()
        self.batch_size = batch_size
        self.num_steps = num_steps
        self.num_train = num_train
        self.num_val = num_val
        corpus, self.vocab = self.build(self._download())
        array = torch.tensor([corpus[i:i+num_steps+1]
                            for i in range(len(corpus)-num_steps)])
        self.X, self.Y = array[:,:-1], array[:,1:]

    def get_dataloader(self, train):
        """Defined in :numref:`subsec_partitioning-seqs`"""
        idx = slice(0, self.num_train) if train else slice(
            self.num_train, self.num_train + self.num_val)
        return self.get_tensorloader([self.X, self.Y], train, idx)
    
    def get_tensorloader(self, tensors, train, idx_slice):
        dataset = torch.utils.data.TensorDataset(
            tensors[0][idx_slice], tensors[1][idx_slice]
        )
        return torch.utils.data.DataLoader(
            dataset, batch_size=self.batch_size, shuffle=train
        )

In [6]:
class RNNScratch(nn.Module):
    """The RNN model implemented from scratch.

    Defined in :numref:`sec_rnn-scratch`"""
    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.sigma = sigma
        self.W_xh = nn.Parameter(
            torch.randn(num_inputs, num_hiddens) * sigma)
        self.W_hh = nn.Parameter(
            torch.randn(num_hiddens, num_hiddens) * sigma)
        self.b_h = nn.Parameter(torch.zeros(num_hiddens))

    def forward(self, inputs, state=None):
        """Defined in :numref:`sec_rnn-scratch`"""
        if state is None:
            # Initial state with shape: (batch_size, num_hiddens)
            state = torch.zeros((inputs.shape[1], self.num_hiddens),
                              device=inputs.device)
        else:
            state, = state
        outputs = []
        for X in inputs:  # Shape of inputs: (num_steps, batch_size, num_inputs)
            state = torch.tanh(torch.matmul(X, self.W_xh) +
                             torch.matmul(state, self.W_hh) + self.b_h)
            outputs.append(state)
        return outputs, state

In [7]:
class RNNLMScratch(nn.Module): 
    """The RNN-based language model implemented from scratch."""
    def __init__(self, rnn, vocab_size, lr=0.01):
        super().__init__()
        self.rnn = rnn
        self.vocab_size = vocab_size
        self.lr = lr
        self.init_params()

    def init_params(self):
        self.W_hq = nn.Parameter(
            torch.randn(
                self.rnn.num_hiddens, self.vocab_size) * self.rnn.sigma)
        self.b_q = nn.Parameter(torch.zeros(self.vocab_size))

    def training_step(self, batch):
        l = self.loss(self(*batch[:-1]), batch[-1])
        self.plot('ppl', torch.exp(l), train=True)
        return l

    def validation_step(self, batch):
        l = self.loss(self(*batch[:-1]), batch[-1])
        self.plot('ppl', torch.exp(l), train=False)
    
    def one_hot(self, X):
        # Output shape: (num_steps, batch_size, vocab_size)
        return F.one_hot(X.T, self.vocab_size).type(torch.float32)
    
    def output_layer(self, rnn_outputs):
        outputs = [torch.matmul(H, self.W_hq) + self.b_q for H in rnn_outputs]
        return torch.stack(outputs, 1)
    
    
    
    def clip_gradients(self, grad_clip_val, model):
        params = [p for p in model.parameters() if p.requires_grad]
        norm = torch.sqrt(sum(torch.sum((p.grad ** 2)) for p in params))
        if norm > grad_clip_val:
            for param in params:
                param.grad[:] *= grad_clip_val / norm

    def forward(self, X, state=None):
        """Defined in :numref:`sec_rnn-scratch`"""
        embs = self.one_hot(X)
        rnn_outputs, _ = self.rnn(embs, state)
        return self.output_layer(rnn_outputs)

    def predict(self, prefix, num_preds, vocab, device=None):
        """Defined in :numref:`sec_rnn-scratch`"""
        state, outputs = None, [vocab[prefix[0]]]
        for i in range(len(prefix) + num_preds - 1):
            X = torch.tensor([[outputs[-1]]], device=device)
            embs = self.one_hot(X)
            rnn_outputs, state = self.rnn(embs, state)
            if i < len(prefix) - 1:  # Warm-up period
                outputs.append(vocab[prefix[i + 1]])
            else:  # Predict num_preds steps
                Y = self.output_layer(rnn_outputs)
                outputs.append(int(torch.reshape(torch.argmax(Y, axis=2), (1,))))
        return ''.join([vocab.idx_to_token[i] for i in outputs])


In [8]:
class RNN(nn.Module):  #@save
    """The RNN model implemented with high-level APIs."""
    def __init__(self, num_inputs, num_hiddens):
        super().__init__()
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        # Initialize the RNN layer 
        self.rnn = nn.RNN(num_inputs, num_hiddens)

    def forward(self, inputs, H=None):
        return self.rnn(inputs, H)

In [9]:
class RNNLM(RNNLMScratch):  #@save
    """The RNN-based language model implemented with high-level APIs."""
    def init_params(self):
        self.linear = nn.LazyLinear(self.vocab_size)

    def output_layer(self, hiddens):
        return self.linear(hiddens).swapaxes(0, 1)


In [10]:
def train(model, data, criterion, optimizer, num_epochs=100, grad_clip=1):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Training on {device}")
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for X, Y in data.get_dataloader(train=True):
            X, Y = X.to(device), Y.to(device)
            optimizer.zero_grad()
            outputs = model(X)
            loss = criterion(outputs.reshape(-1, outputs.shape[-1]), Y.reshape(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip)
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(data.get_dataloader(train=True))

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_val, Y_val in data.get_dataloader(train=False):
                X_val, Y_val = X_val.to(device), Y_val.to(device)
                val_outputs = model(X_val)
                v_loss = criterion(val_outputs.reshape(-1, val_outputs.shape[-1]), Y_val.reshape(-1))
                val_loss += v_loss.item()
        avg_val_loss = val_loss / len(data.get_dataloader(train=False))

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}, Train Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

In [11]:
class DataModule:
    def __init__(self, root='../data'):
        self.root = root

    def get_dataloader(self, train):
        raise NotImplementedError

    def train_dataloader(self):
        return self.get_dataloader(train=True)

    def val_dataloader(self):
        return self.get_dataloader(train=False)

    def get_tensorloader(self, tensors, train, indices=slice(0, None)):
        tensors = tuple(a[indices] for a in tensors)
        dataset = torch.utils.data.TensorDataset(*tensors)
        return torch.utils.data.DataLoader(dataset, self.batch_size,
                                           shuffle=train)

## Implementation From Scratch

In [12]:
class LSTMScratch(nn.Module):
    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens  
        self.sigma = sigma

        init_weight = lambda *shape: nn.Parameter(torch.randn(*shape) * sigma)
        triple = lambda: (init_weight(num_inputs, num_hiddens),
                          init_weight(num_hiddens, num_hiddens),
                          nn.Parameter(torch.zeros(num_hiddens)))
        self.W_xi, self.W_hi, self.b_i = triple()  # Input gate
        self.W_xf, self.W_hf, self.b_f = triple()  # Forget gate
        self.W_xo, self.W_ho, self.b_o = triple()  # Output gate
        self.W_xc, self.W_hc, self.b_c = triple()  # Input node

    def forward(self, inputs, H_C=None):
        if H_C is None:
            # Initial state with shape: (batch_size, num_hiddens)
            H = torch.zeros((inputs.shape[1], self.num_hiddens),
                        device=inputs.device)
            C = torch.zeros((inputs.shape[1], self.num_hiddens),
                        device=inputs.device)
        else:
            H, C = H_C
        outputs = []
        for X in inputs:
            I = torch.sigmoid(torch.matmul(X, self.W_xi) +
                            torch.matmul(H, self.W_hi) + self.b_i)
            F = torch.sigmoid(torch.matmul(X, self.W_xf) +
                            torch.matmul(H, self.W_hf) + self.b_f)
            O = torch.sigmoid(torch.matmul(X, self.W_xo) +
                            torch.matmul(H, self.W_ho) + self.b_o)
            C_tilde = torch.tanh(torch.matmul(X, self.W_xc) +
                            torch.matmul(H, self.W_hc) + self.b_c)
            C = F * C + I * C_tilde
            H = O * torch.tanh(C)
            outputs.append(H)
        return outputs, (H, C)

    

            

In [13]:
data = LanguageData(batch_size=1024, num_steps=32)
lstm = LSTMScratch(num_inputs=len(data.vocab), num_hiddens=32)
model = RNNLMScratch(lstm, vocab_size=len(data.vocab), lr=4)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)

Training on cuda
Epoch 10, Train Loss: 2.7175, Val Loss: 2.6605
Epoch 20, Train Loss: 2.3602, Val Loss: 2.3193
Epoch 30, Train Loss: 2.1932, Val Loss: 2.2437
Epoch 40, Train Loss: 2.0721, Val Loss: 2.1293
Epoch 50, Train Loss: 1.9519, Val Loss: 2.0443
Epoch 60, Train Loss: 1.8565, Val Loss: 2.0358
Epoch 70, Train Loss: 1.7631, Val Loss: 1.9534
Epoch 80, Train Loss: 1.6781, Val Loss: 1.9551
Epoch 90, Train Loss: 1.6153, Val Loss: 1.9478
Epoch 100, Train Loss: 1.5642, Val Loss: 1.9615
Epoch 110, Train Loss: 1.5301, Val Loss: 1.9727
Epoch 120, Train Loss: 1.4865, Val Loss: 1.9878
Epoch 130, Train Loss: 1.4570, Val Loss: 2.0108
Epoch 140, Train Loss: 1.4241, Val Loss: 2.0013
Epoch 150, Train Loss: 1.3861, Val Loss: 2.0358
Epoch 160, Train Loss: 1.3667, Val Loss: 2.0282
Epoch 170, Train Loss: 1.3336, Val Loss: 2.0574
Epoch 180, Train Loss: 1.3324, Val Loss: 2.0956
Epoch 190, Train Loss: 1.3139, Val Loss: 2.1019
Epoch 200, Train Loss: 1.2999, Val Loss: 2.0886


In [14]:
device = next(model.parameters()).device
model.predict('it has', 20, data.vocab, device=device)

'it has of the time travell'

## Concise Implementation

In [15]:
class LSTM(RNN):
    def __init__(self, num_inputs, num_hiddens):
        nn.Module.__init__(self)
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.rnn = nn.LSTM(num_inputs, num_hiddens)

    def forward(self, inputs, H_C=None):
        return self.rnn(inputs, H_C)

lstm = LSTM(num_inputs=len(data.vocab), num_hiddens=32)

In [16]:
data = LanguageData(batch_size=1024, num_steps=32)
model = RNNLM(lstm, vocab_size=len(data.vocab), lr=4)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)
device = next(model.parameters()).device
model.predict('it has', 20, data.vocab, device=device)

Training on cuda
Epoch 10, Train Loss: 2.4301, Val Loss: 2.4004
Epoch 20, Train Loss: 2.1455, Val Loss: 2.2037
Epoch 30, Train Loss: 2.0205, Val Loss: 2.0649
Epoch 40, Train Loss: 1.8780, Val Loss: 2.0414
Epoch 50, Train Loss: 1.7798, Val Loss: 1.9965
Epoch 60, Train Loss: 1.6860, Val Loss: 1.9929
Epoch 70, Train Loss: 1.6362, Val Loss: 1.9879
Epoch 80, Train Loss: 1.5686, Val Loss: 2.0155
Epoch 90, Train Loss: 1.5354, Val Loss: 1.9819
Epoch 100, Train Loss: 1.4912, Val Loss: 1.9927
Epoch 110, Train Loss: 1.4480, Val Loss: 2.0496
Epoch 120, Train Loss: 1.4335, Val Loss: 2.0229
Epoch 130, Train Loss: 1.4191, Val Loss: 2.0452
Epoch 140, Train Loss: 1.3762, Val Loss: 2.0287
Epoch 150, Train Loss: 1.3575, Val Loss: 2.0636
Epoch 160, Train Loss: 1.3426, Val Loss: 2.0870
Epoch 170, Train Loss: 1.3227, Val Loss: 2.0914
Epoch 180, Train Loss: 1.3166, Val Loss: 2.1021
Epoch 190, Train Loss: 1.2948, Val Loss: 2.1037
Epoch 200, Train Loss: 1.2720, Val Loss: 2.1632


'it has youns and mension a'

# Gated Recurrent Units (GRUs)

## Implementation From Scratch

In [17]:
class GRUScratch(nn.Module):
    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.sigma = sigma

        init_weight = lambda *shape: nn.Parameter(torch.randn(*shape) * sigma)
        triple = lambda: (init_weight(num_inputs, num_hiddens),
                          init_weight(num_hiddens, num_hiddens),
                          nn.Parameter(torch.zeros(num_hiddens)))
        self.W_xz, self.W_hz, self.b_z = triple()  # Update gate
        self.W_xr, self.W_hr, self.b_r = triple()  # Reset gate
        self.W_xh, self.W_hh, self.b_h = triple()  # Candidate hidden state

    def forward(self, inputs, H=None):
        if H is None:
            H = torch.zeros((inputs.shape[1], self.num_hiddens), device=inputs.device)
        outputs = []
        for X in inputs:
            Z = torch.sigmoid(torch.matmul(X, self.W_xz) +
                              torch.matmul(H, self.W_hz) + self.b_z)
            R = torch.sigmoid(torch.matmul(X, self.W_xr) +
                              torch.matmul(H, self.W_hr) + self.b_r)
            H_tilde = torch.tanh(torch.matmul(X, self.W_xh) +
                                 torch.matmul(R * H, self.W_hh) + self.b_h)
            H = Z * H + (1 - Z) * H_tilde
            outputs.append(H)
        outputs = torch.stack(outputs)  # Stack list into tensor
        return outputs, H

In [18]:
data = LanguageData(batch_size=1024, num_steps=32)
gruscratch = GRUScratch(num_inputs=len(data.vocab), num_hiddens=32)
model = RNNLM(gruscratch, vocab_size=len(data.vocab), lr=4)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)

Training on cuda
Epoch 10, Train Loss: 2.2851, Val Loss: 2.2562
Epoch 20, Train Loss: 2.0607, Val Loss: 2.1652
Epoch 30, Train Loss: 1.9018, Val Loss: 2.0258
Epoch 40, Train Loss: 1.7926, Val Loss: 1.9965
Epoch 50, Train Loss: 1.6673, Val Loss: 1.9585
Epoch 60, Train Loss: 1.5773, Val Loss: 1.9623
Epoch 70, Train Loss: 1.5252, Val Loss: 1.9864
Epoch 80, Train Loss: 1.4635, Val Loss: 2.0156
Epoch 90, Train Loss: 1.4318, Val Loss: 2.0521
Epoch 100, Train Loss: 1.4042, Val Loss: 2.0542
Epoch 110, Train Loss: 1.3697, Val Loss: 2.0964
Epoch 120, Train Loss: 1.3511, Val Loss: 2.0894
Epoch 130, Train Loss: 1.3249, Val Loss: 2.1302
Epoch 140, Train Loss: 1.2952, Val Loss: 2.1508
Epoch 150, Train Loss: 1.2992, Val Loss: 2.1484
Epoch 160, Train Loss: 1.3023, Val Loss: 2.1509
Epoch 170, Train Loss: 1.2698, Val Loss: 2.1830
Epoch 180, Train Loss: 1.2506, Val Loss: 2.1686
Epoch 190, Train Loss: 1.2599, Val Loss: 2.2371
Epoch 200, Train Loss: 1.2600, Val Loss: 2.2635


In [19]:
device = next(model.parameters()).device
model.predict('it has', 20, data.vocab, device=device)

'it has thing to the time t'

## Concise Implementation

In [20]:
class GRU(RNN):
    def __init__(self, num_inputs, num_hiddens):
        nn.Module.__init__(self)
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.rnn = nn.GRU(num_inputs, num_hiddens)
    
    

In [21]:
data = LanguageData(batch_size=1024, num_steps=32)
gru = GRU(num_inputs=len(data.vocab), num_hiddens=32)
model = RNNLM(gru, vocab_size=len(data.vocab), lr=4)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)

Training on cuda
Epoch 10, Train Loss: 2.2407, Val Loss: 2.2230
Epoch 20, Train Loss: 1.9903, Val Loss: 2.0666
Epoch 30, Train Loss: 1.8522, Val Loss: 2.0428
Epoch 40, Train Loss: 1.7271, Val Loss: 1.9647
Epoch 50, Train Loss: 1.6330, Val Loss: 1.9423
Epoch 60, Train Loss: 1.5688, Val Loss: 1.9671
Epoch 70, Train Loss: 1.4939, Val Loss: 1.9882
Epoch 80, Train Loss: 1.4751, Val Loss: 2.0005
Epoch 90, Train Loss: 1.4237, Val Loss: 2.0276
Epoch 100, Train Loss: 1.4097, Val Loss: 2.0526
Epoch 110, Train Loss: 1.3964, Val Loss: 2.0987
Epoch 120, Train Loss: 1.3713, Val Loss: 2.0640
Epoch 130, Train Loss: 1.3459, Val Loss: 2.0612
Epoch 140, Train Loss: 1.3410, Val Loss: 2.0900
Epoch 150, Train Loss: 1.3395, Val Loss: 2.1356
Epoch 160, Train Loss: 1.3183, Val Loss: 2.1181
Epoch 170, Train Loss: 1.2905, Val Loss: 2.0967
Epoch 180, Train Loss: 1.2750, Val Loss: 2.1060
Epoch 190, Train Loss: 1.2938, Val Loss: 2.1364
Epoch 200, Train Loss: 1.2850, Val Loss: 2.1280


In [22]:
device = next(model.parameters()).device
model.predict('it has', 20, data.vocab, device=device)

'it has the time traveller '

# Deep Recurrent Neural Networks

# Implementation From Scratch

In [23]:
class StackedRNNScratch(nn.Module):
    def __init__(self, num_inputs, num_hiddens, num_layers, sigma=0.01):
        super().__init__()
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.sigma = sigma
        self.rnns = nn.Sequential(*[RNNScratch(
            num_inputs if i==0 else num_hiddens, num_hiddens, sigma)
                                    for i in range(num_layers)])
    
    def forward(self, inputs, Hs=None):
        outputs = inputs
        if Hs is None: Hs = [None] * self.num_layers
        for i in range(self.num_layers):
            outputs, Hs[i] = self.rnns[i](outputs, Hs[i])
            outputs = torch.stack(outputs, 0)
        return outputs, Hs

In [24]:
data = LanguageData(batch_size=1024, num_steps=32)
rnn_block = StackedRNNScratch(num_inputs=len(data.vocab),
                              num_hiddens=32, num_layers=2)
model = RNNLMScratch(rnn_block, vocab_size=len(data.vocab), lr=2)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)

Training on cuda
Epoch 10, Train Loss: 2.8423, Val Loss: 2.8214
Epoch 20, Train Loss: 2.8368, Val Loss: 2.8212
Epoch 30, Train Loss: 2.5868, Val Loss: 2.5858
Epoch 40, Train Loss: 2.2705, Val Loss: 2.2876
Epoch 50, Train Loss: 2.1398, Val Loss: 2.1948
Epoch 60, Train Loss: 1.9977, Val Loss: 2.0891
Epoch 70, Train Loss: 1.9052, Val Loss: 2.0595
Epoch 80, Train Loss: 1.8268, Val Loss: 2.0986
Epoch 90, Train Loss: 1.7891, Val Loss: 2.0449
Epoch 100, Train Loss: 1.6818, Val Loss: 2.0425
Epoch 110, Train Loss: 1.6570, Val Loss: 2.0561
Epoch 120, Train Loss: 1.5920, Val Loss: 2.0450
Epoch 130, Train Loss: 1.6088, Val Loss: 2.0517
Epoch 140, Train Loss: 1.5335, Val Loss: 2.0535
Epoch 150, Train Loss: 1.5011, Val Loss: 2.0876
Epoch 160, Train Loss: 1.4789, Val Loss: 2.0725
Epoch 170, Train Loss: 1.4981, Val Loss: 2.1358
Epoch 180, Train Loss: 1.4629, Val Loss: 2.1198
Epoch 190, Train Loss: 1.4526, Val Loss: 2.1216
Epoch 200, Train Loss: 1.4735, Val Loss: 2.0824


In [25]:
device = next(model.parameters()).device
model.predict('it has', 20, data.vocab, device=device)

'it has of the proved the p'

# Concise Implementation

In [26]:
class GRU(RNN):  #@save
    """The multilayer GRU model."""
    def __init__(self, num_inputs, num_hiddens, num_layers, dropout=0):
        nn.Module.__init__(self)
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.dropout = dropout
        self.rnn = nn.GRU(num_inputs, num_hiddens, num_layers,
                          dropout=dropout)

In [27]:
data = LanguageData(batch_size=1024, num_steps=32)
gru = GRU(num_inputs=len(data.vocab), num_hiddens=32, num_layers=2)
model = RNNLM(gru, vocab_size=len(data.vocab), lr=2)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)

Training on cuda
Epoch 10, Train Loss: 2.6735, Val Loss: 2.5305
Epoch 20, Train Loss: 2.2897, Val Loss: 2.2963
Epoch 30, Train Loss: 2.1237, Val Loss: 2.1621
Epoch 40, Train Loss: 1.9957, Val Loss: 2.1063
Epoch 50, Train Loss: 1.8789, Val Loss: 2.0036
Epoch 60, Train Loss: 1.7641, Val Loss: 1.9516
Epoch 70, Train Loss: 1.6722, Val Loss: 1.9261
Epoch 80, Train Loss: 1.5723, Val Loss: 1.9102
Epoch 90, Train Loss: 1.4981, Val Loss: 1.9337
Epoch 100, Train Loss: 1.4297, Val Loss: 1.9403
Epoch 110, Train Loss: 1.3680, Val Loss: 1.9632
Epoch 120, Train Loss: 1.3282, Val Loss: 2.0108
Epoch 130, Train Loss: 1.2738, Val Loss: 2.0772
Epoch 140, Train Loss: 1.2383, Val Loss: 2.0972
Epoch 150, Train Loss: 1.1791, Val Loss: 2.1531
Epoch 160, Train Loss: 1.1384, Val Loss: 2.1647
Epoch 170, Train Loss: 1.1293, Val Loss: 2.2357
Epoch 180, Train Loss: 1.0942, Val Loss: 2.2575
Epoch 190, Train Loss: 1.0686, Val Loss: 2.3129
Epoch 200, Train Loss: 1.0260, Val Loss: 2.3763


In [28]:
device = next(model.parameters()).device
model.predict('it has', 20, data.vocab, device=device)

'it has or miner the lince '

# Bidirectional Recurrent Neural Networks

## Implementation From Scratch

In [29]:
class BiRNNScratch(nn.Module):
    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.sigma = sigma
        self.f_rnn = RNNScratch(num_inputs, num_hiddens, sigma)
        self.b_rnn = RNNScratch(num_inputs, num_hiddens, sigma)
        self.num_hiddens *= 2  # The output dimension will be doubled
    
    def forward(self, inputs, Hs=None):
        f_H, b_H = Hs if Hs is not None else (None, None)
        f_outputs, f_H = self.f_rnn(inputs, f_H)
        b_outputs, b_H = self.b_rnn(reversed(inputs), b_H)
        outputs = [torch.cat((f, b), -1) for f, b in zip(
            f_outputs, reversed(b_outputs))]
        return outputs, (f_H, b_H)

In [30]:
data = LanguageData(batch_size=1024, num_steps=32)
birnn = BiRNNScratch(num_inputs=len(data.vocab), num_hiddens=32)
model = RNNLMScratch(birnn, vocab_size=len(data.vocab), lr=2)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)

Training on cuda
Epoch 10, Train Loss: 0.9252, Val Loss: 0.7820
Epoch 20, Train Loss: 0.1961, Val Loss: 0.1883
Epoch 30, Train Loss: 0.1108, Val Loss: 0.1075
Epoch 40, Train Loss: 0.0921, Val Loss: 0.0906
Epoch 50, Train Loss: 0.0846, Val Loss: 0.0842
Epoch 60, Train Loss: 0.0805, Val Loss: 0.0810
Epoch 70, Train Loss: 0.0780, Val Loss: 0.0789
Epoch 80, Train Loss: 0.0762, Val Loss: 0.0775
Epoch 90, Train Loss: 0.0748, Val Loss: 0.0765
Epoch 100, Train Loss: 0.0739, Val Loss: 0.0756
Epoch 110, Train Loss: 0.0731, Val Loss: 0.0749
Epoch 120, Train Loss: 0.0724, Val Loss: 0.0743
Epoch 130, Train Loss: 0.0718, Val Loss: 0.0738
Epoch 140, Train Loss: 0.0713, Val Loss: 0.0734
Epoch 150, Train Loss: 0.0709, Val Loss: 0.0730
Epoch 160, Train Loss: 0.0705, Val Loss: 0.0726
Epoch 170, Train Loss: 0.0702, Val Loss: 0.0723
Epoch 180, Train Loss: 0.0699, Val Loss: 0.0721
Epoch 190, Train Loss: 0.0696, Val Loss: 0.0718
Epoch 200, Train Loss: 0.0693, Val Loss: 0.0715


In [31]:
model.predict('it has', 20, data.vocab, device=device)

'it hasasasasasasasasasasas'

## Consise Implementation

In [32]:
class BiGRU(RNN):
    def __init__(self, num_inputs, num_hiddens):
        super().__init__(num_inputs=num_inputs, num_hiddens=num_hiddens)
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.sigma = 0.01
        self.rnn = nn.GRU(num_inputs, num_hiddens, bidirectional=True)
        self.num_hiddens *= 2  # The output dimension will be doubled

    def forward(self, inputs, Hs=None):
        return self.rnn(inputs, Hs)

In [33]:
data = LanguageData(batch_size=1024, num_steps=32)
BiGRU = BiGRU(num_inputs=len(data.vocab), num_hiddens=32)
model = RNNLMScratch(BiGRU, vocab_size=len(data.vocab), lr=2)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)

Training on cuda
Epoch 10, Train Loss: 0.9109, Val Loss: 0.7642
Epoch 20, Train Loss: 0.1980, Val Loss: 0.1908
Epoch 30, Train Loss: 0.1170, Val Loss: 0.1165
Epoch 40, Train Loss: 0.0963, Val Loss: 0.0972
Epoch 50, Train Loss: 0.0874, Val Loss: 0.0889
Epoch 60, Train Loss: 0.0824, Val Loss: 0.0844
Epoch 70, Train Loss: 0.0792, Val Loss: 0.0815
Epoch 80, Train Loss: 0.0771, Val Loss: 0.0796
Epoch 90, Train Loss: 0.0756, Val Loss: 0.0781
Epoch 100, Train Loss: 0.0744, Val Loss: 0.0770
Epoch 110, Train Loss: 0.0735, Val Loss: 0.0761
Epoch 120, Train Loss: 0.0727, Val Loss: 0.0754
Epoch 130, Train Loss: 0.0721, Val Loss: 0.0748
Epoch 140, Train Loss: 0.0715, Val Loss: 0.0743
Epoch 150, Train Loss: 0.0710, Val Loss: 0.0738
Epoch 160, Train Loss: 0.0706, Val Loss: 0.0734
Epoch 170, Train Loss: 0.0703, Val Loss: 0.0731
Epoch 180, Train Loss: 0.0699, Val Loss: 0.0728
Epoch 190, Train Loss: 0.0696, Val Loss: 0.0725
Epoch 200, Train Loss: 0.0694, Val Loss: 0.0722


In [34]:
evice = next(model.parameters()).device
model.predict('it has', 20, data.vocab, device=device)

'it hasasasasasasasasasasas'

# Machine Translation

In [38]:
reduce_sum = lambda x, *args, **kwargs: x.sum(*args, **kwargs)
reduce_sum = lambda x, axis: x.sum(axis=axis, keepdim=True)
astype = lambda x, *args, **kwargs: x.type(*args, **kwargs)

class MTFraEng(DataModule):
    """The English-French dataset.

    Defined in :numref:`sec_machine_translation`"""
    def _download(self):
        with open('fra-eng/fra.txt', encoding='utf-8') as f:
            return f.read()

    def _preprocess(self, text):
        """Defined in :numref:`sec_machine_translation`"""
        # Replace non-breaking space with space
        text = text.replace('\u202f', ' ').replace('\xa0', ' ')
        # Insert space between words and punctuation marks
        no_space = lambda char, prev_char: char in ',.!?' and prev_char != ' '
        out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char
               for i, char in enumerate(text.lower())]
        return ''.join(out)

    def _tokenize(self, text, max_examples=None):
        """Defined in :numref:`sec_machine_translation`"""
        src, tgt = [], []
        for i, line in enumerate(text.split('\n')):
            if max_examples and i > max_examples: break
            parts = line.split('\t')
            if len(parts) == 2:
                # Skip empty tokens
                src.append([t for t in f'{parts[0]} <eos>'.split(' ') if t])
                tgt.append([t for t in f'{parts[1]} <eos>'.split(' ') if t])
        return src, tgt

    def __init__(self, batch_size, num_steps=9, num_train=512, num_val=128):
        """Defined in :numref:`sec_machine_translation`"""
        super(MTFraEng, self).__init__()
        self.batch_size = batch_size
        self.num_steps = num_steps
        self.num_train = num_train
        self.num_val = num_val
        self.arrays, self.src_vocab, self.tgt_vocab = self._build_arrays(
            self._download())

    def _build_arrays(self, raw_text, src_vocab=None, tgt_vocab=None):
        """Defined in :numref:`subsec_loading-seq-fixed-len`"""
        def _build_array(sentences, vocab, is_tgt=False):
            pad_or_trim = lambda seq, t: (
                seq[:t] if len(seq) > t else seq + ['<pad>'] * (t - len(seq)))
            sentences = [pad_or_trim(s, self.num_steps) for s in sentences]
            if is_tgt:
                sentences = [['<bos>'] + s for s in sentences]
            if vocab is None:
                vocab = Vocab(sentences, min_freq=2)
            array = torch.tensor([vocab[s] for s in sentences])
            valid_len = reduce_sum(
                astype(array != vocab['<pad>'], torch.int32), 1)
            return array, vocab, valid_len
        src, tgt = self._tokenize(self._preprocess(raw_text),
                                  self.num_train + self.num_val)
        src_array, src_vocab, src_valid_len = _build_array(src, src_vocab)
        tgt_array, tgt_vocab, _ = _build_array(tgt, tgt_vocab, True)
        return ((src_array, tgt_array[:,:-1], src_valid_len, tgt_array[:,1:]),
                src_vocab, tgt_vocab)

    def get_dataloader(self, train):
        """Defined in :numref:`subsec_loading-seq-fixed-len`"""
        idx = slice(0, self.num_train) if train else slice(self.num_train, None)
        return self.get_tensorloader(self.arrays, train, idx)

    def build(self, src_sentences, tgt_sentences):
        """Defined in :numref:`subsec_loading-seq-fixed-len`"""
        raw_text = '\n'.join([src + '\t' + tgt for src, tgt in zip(
            src_sentences, tgt_sentences)])
        arrays, _, _ = self._build_arrays(
            raw_text, self.src_vocab, self.tgt_vocab)
        return arrays

In [40]:
# Machine Translation

#Reduce sum
reduce_sum = lambda x, axis: x.sum(axis=axis, keepdim=True)
#astype
astype = lambda x, y: x.type(y)

class MTFraEng(DataModule):
    def _download(self):
        with open('fra-eng/fra.txt', encoding='utf-8') as f:
            return f.read()

    def _preprocess(self, text):
        # Replace non-breaking space with space
        text = text.replace('\u202f', ' ').replace('\xa0', ' ')
        # Insert space between words and punctuation marks
        no_space = lambda char, prev_char: char in ',.!?' and prev_char != ' '
        out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char
               for i, char in enumerate(text.lower())]
        return ''.join(out)

    def _tokenize(self, text, max_examples=None):
        src, tgt = [], []
        for i, line in enumerate(text.split('\n')):
            if max_examples and i > max_examples: break
            parts = line.split('\t')
            if len(parts) == 2:
                # Skip empty tokens
                src.append([t for t in f'{parts[0]} <eos>'.split(' ') if t])
                tgt.append([t for t in f'{parts[1]} <eos>'.split(' ') if t])
        return src, tgt

    def __init__(self, batch_size, num_steps=9, num_train=512, num_val=128):
        super(MTFraEng, self).__init__()
        self.batch_size = batch_size
        self.num_steps = num_steps
        self.num_train = num_train
        self.num_val = num_val
        self.arrays, self.src_vocab, self.tgt_vocab = self._build_arrays(
            self._download())

    def _build_arrays(self, raw_text, src_vocab=None, tgt_vocab=None):
        def _build_array(sentences, vocab, is_tgt=False):
            pad_or_trim = lambda seq, t: (
                seq[:t] if len(seq) > t else seq + ['<pad>'] * (t - len(seq)))
            sentences = [pad_or_trim(s, self.num_steps) for s in sentences]
            if is_tgt:
                sentences = [['<bos>'] + s for s in sentences]
            if vocab is None:
                vocab = Vocab(sentences, min_freq=2)
            array = torch.tensor([vocab[s] for s in sentences])
            # Fix: Ensure array is always 2D
            if array.dim() == 1:
                array = array.unsqueeze(0)  # Add batch dimension
            valid_len = reduce_sum(
                astype(array != vocab['<pad>'], torch.int32), 1)
            return array, vocab, valid_len
        
        src, tgt = self._tokenize(self._preprocess(raw_text),
                                self.num_train + self.num_val)
        src_array, src_vocab, src_valid_len = _build_array(src, src_vocab)
        tgt_array, tgt_vocab, _ = _build_array(tgt, tgt_vocab, True)
        
        # Fix: Ensure both arrays are 2D before slicing
        if tgt_array.dim() == 1:
            tgt_array = tgt_array.unsqueeze(0)
        if src_array.dim() == 1:
            src_array = src_array.unsqueeze(0)
        
        return ((src_array, tgt_array[:,:-1], src_valid_len, tgt_array[:,1:]),
                src_vocab, tgt_vocab)
    
    def get_dataloader(self, train):
        idx = slice(0, self.num_train) if train else slice(self.num_train, None)
        return self.get_tensorloader(self.arrays, train, idx)

    def build(self, src_sentences, tgt_sentences):
        raw_text = '\n'.join([src + '\t' + tgt for src, tgt in zip(
            src_sentences, tgt_sentences)])
        arrays, _, _ = self._build_arrays(
            raw_text, self.src_vocab, self.tgt_vocab)
        return arrays

In [41]:
data = MTFraEng(batch_size=3, num_steps=9, num_train=100, num_val=20)

# Encoder-Decoder Architecture

## Encoder

In [42]:
class Encoder(nn.Module):  
    """The base encoder interface for the encoder--decoder architecture."""
    def __init__(self):
        super().__init__()

    def forward(self, X, *args):
        raise NotImplementedError

## Decoder

In [43]:
class Decoder(nn.Module): 
    """The base decoder interface for the encoder--decoder architecture."""
    def __init__(self):
        super().__init__()

    def init_state(self, enc_all_outputs, *args):
        raise NotImplementedError

    def forward(self, X, state):
        raise NotImplementedError

## Putting them together

In [44]:
class EncoderDecoder(nn.Module):
    """The base class for the encoder--decoder architecture."""
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, enc_X, dec_X, *args):
        enc_all_outputs = self.encoder(enc_X, *args)
        dec_state = self.decoder.init_state(enc_all_outputs, *args)
        # Return decoder output only
        return self.decoder(dec_X, dec_state)[0]

# Sequence-to-sequence Learning For Machine Translation

## Encoder

In [45]:
def init_seq2seq(module): 
    """Initialize weights for sequence-to-sequence learning."""
    if type(module) == nn.Linear:
         nn.init.xavier_uniform_(module.weight)
    if type(module) == nn.GRU:
        for param in module._flat_weights_names:
            if "weight" in param:
                nn.init.xavier_uniform_(module._parameters[param])

class Seq2SeqEncoder(Encoder): 
    """The RNN encoder for sequence-to-sequence learning."""
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, num_hiddens, num_layers, dropout=dropout)
        self.apply(init_seq2seq)

    def forward(self, X, *args):
        # X shape: (batch_size, num_steps)
        embs = self.embedding(X.t().type(torch.int64))
        # embs shape: (num_steps, batch_size, embed_size)
        outputs, state = self.rnn(embs)
        # outputs shape: (num_steps, batch_size, num_hiddens)
        # state shape: (num_layers, batch_size, num_hiddens)
        return outputs, state

In [46]:
vocab_size, embed_size, num_hiddens, num_layers = 10, 8, 16, 2
batch_size, num_steps = 4, 9
encoder = Seq2SeqEncoder(vocab_size, embed_size, num_hiddens, num_layers)
X = torch.zeros((batch_size, num_steps))
enc_outputs, enc_state = encoder(X)
print(f"Expected shape: ({num_steps}, {batch_size}, {num_hiddens})")
print(f"Actual shape: {enc_outputs.shape}")
print(f"Shape match: {enc_outputs.shape == (num_steps, batch_size, num_hiddens)}")

Expected shape: (9, 4, 16)
Actual shape: torch.Size([9, 4, 16])
Shape match: True


## Decoder

In [48]:
class Seq2SeqDecoder(Decoder):
    """The RNN decoder for sequence to sequence learning."""
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size+num_hiddens, num_hiddens,
                           num_layers, dropout=dropout)
        self.dense = nn.LazyLinear(vocab_size)
        self.apply(init_seq2seq)

    def init_state(self, enc_all_outputs, *args):
        return enc_all_outputs

    def forward(self, X, state):
        # X shape: (batch_size, num_steps)
        # embs shape: (num_steps, batch_size, embed_size)
        embs = self.embedding(X.t().type(torch.int32))
        enc_output, hidden_state = state
        # context shape: (batch_size, num_hiddens)
        context = enc_output[-1]
        # Broadcast context to (num_steps, batch_size, num_hiddens)
        context = context.repeat(embs.shape[0], 1, 1)
        # Concat at the feature dimension
        embs_and_context = torch.cat((embs, context), -1)
        outputs, hidden_state = self.rnn(embs_and_context, hidden_state)
        outputs = self.dense(outputs).swapaxes(0, 1)
        # outputs shape: (batch_size, num_steps, vocab_size)
        # hidden_state shape: (num_layers, batch_size, num_hiddens)
        return outputs, [enc_output, hidden_state]

In [49]:
decoder = Seq2SeqDecoder(vocab_size, embed_size, num_hiddens, num_layers)
state = decoder.init_state(encoder(X))
dec_outputs, state = decoder(X, state)
print(f"Expected shape: ({batch_size}, {num_steps}, {vocab_size})")
print(f"Actual shape: {dec_outputs.shape}")
print(f"Shape match: {dec_outputs.shape == (batch_size, num_steps, vocab_size)}")
print(f"Expected shape: ({num_layers}, {batch_size}, {num_hiddens})")
print(f"Actual shape: {state[1].shape}")
print(f"Shape match: {state[1].shape == (num_layers, batch_size, num_hiddens)}")


Expected shape: (4, 9, 10)
Actual shape: torch.Size([4, 9, 10])
Shape match: True
Expected shape: (2, 4, 16)
Actual shape: torch.Size([2, 4, 16])
Shape match: True


## Encoder-Decoder for Sequence-to-Sequence Learning

In [50]:
class Seq2Seq(EncoderDecoder): 
    """The RNN encoder--decoder for sequence to sequence learning."""
    def __init__(self, encoder, decoder, tgt_pad, lr):
        super().__init__(encoder, decoder)
        self.encoder = encoder
        self.decoder = decoder
        self.tgt_pad = tgt_pad
        self.lr = lr

    def validation_step(self, batch):
        Y_hat = self(*batch[:-1])
        self.plot('loss', self.loss(Y_hat, batch[-1]), train=False)

    def configure_optimizers(self):
        # Adam optimizer is used here
        return torch.optim.Adam(self.parameters(), lr=self.lr)

    def loss(self, Y_hat, Y):
        # Calculate cross-entropy loss directly
        l = nn.CrossEntropyLoss(reduction='none')(Y_hat.reshape(-1, Y_hat.shape[-1]), Y.reshape(-1))
        mask = (Y.reshape(-1) != self.tgt_pad).type(torch.float32)
        return (l * mask).sum() / mask.sum()

    def predict_step(self, batch, device, num_steps,
                 save_attention_weights=False):
        batch = [a.to(device) for a in batch]
        src, tgt, src_valid_len, _ = batch
        enc_all_outputs = self.encoder(src, src_valid_len)
        dec_state = self.decoder.init_state(enc_all_outputs, src_valid_len)
        outputs, attention_weights = [tgt[:, (0)].unsqueeze(1), ], []
        for _ in range(num_steps):
            Y, dec_state = self.decoder(outputs[-1], dec_state)
            outputs.append(Y.argmax(2))
            # Save attention weights (to be covered later)
            if save_attention_weights:
                attention_weights.append(self.decoder.attention_weights)
        return torch.cat(outputs[1:], 1), attention_weights

In [51]:
data = MTFraEng(batch_size=32, num_steps=9, num_train=100, num_val=20)
embed_size, num_hiddens, num_layers, dropout = 256, 256, 2, 0.2
encoder = Seq2SeqEncoder(
    len(data.src_vocab), embed_size, num_hiddens, num_layers, dropout)
decoder = Seq2SeqDecoder(
    len(data.tgt_vocab), embed_size, num_hiddens, num_layers, dropout)
model = Seq2Seq(encoder, decoder, tgt_pad=data.tgt_vocab['<pad>'],
                lr=0.005)

In [53]:
# Custom training loop for sequence-to-sequence model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on {device}")
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

# Define loss function
criterion = nn.CrossEntropyLoss(reduction='none')

# Get the data arrays directly
train_arrays = data.arrays[True]
val_arrays = data.arrays[False]

# Fix: Unpack if nested
if len(train_arrays) == 1 and isinstance(train_arrays[0], tuple):
    train_arrays = train_arrays[0]
    val_arrays = val_arrays[0]

print(f"Training samples: {len(train_arrays[0])}")
print(f"Validation samples: {len(val_arrays[0])}")

for epoch in range(30):
    model.train()
    total_loss = 0
    
    # Process training data in batches
    for i in range(0, len(train_arrays[0]), data.batch_size):
        # Get batch
        batch_src = train_arrays[0][i:i+data.batch_size]
        batch_tgt = train_arrays[1][i:i+data.batch_size]
        batch_src_valid_len = train_arrays[2][i:i+data.batch_size]
        batch_tgt_shifted = train_arrays[3][i:i+data.batch_size]
        
        # Move to device
        src = batch_src.to(device)
        tgt = batch_tgt.to(device)
        src_valid_len = batch_src_valid_len.to(device)
        tgt_shifted = batch_tgt_shifted.to(device)
        
        optimizer.zero_grad()
        outputs = model(src, tgt)
        
        # Calculate loss manually
        l = criterion(outputs.reshape(-1, outputs.shape[-1]), tgt_shifted.reshape(-1))
        mask = (tgt_shifted.reshape(-1) != model.tgt_pad).type(torch.float32)
        loss = (l * mask).sum() / mask.sum()
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        total_loss += loss.item()
    
    avg_loss = total_loss / (len(train_arrays[0]) // data.batch_size)

    # Validation
    avg_val_loss = 0.0
    if len(val_arrays[0]) > 0:
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for i in range(0, len(val_arrays[0]), data.batch_size):
                # Get validation batch
                batch_src_val = val_arrays[0][i:i+data.batch_size]
                batch_tgt_val = val_arrays[1][i:i+data.batch_size]
                batch_src_valid_len_val = val_arrays[2][i:i+data.batch_size]
                batch_tgt_shifted_val = val_arrays[3][i:i+data.batch_size]
                
                # Move to device
                src_val = batch_src_val.to(device)
                tgt_val = batch_tgt_val.to(device)
                src_valid_len_val = batch_src_valid_len_val.to(device)
                tgt_shifted_val = batch_tgt_shifted_val.to(device)
                
                val_outputs = model(src_val, tgt_val)
                
                # Calculate validation loss manually
                v_l = criterion(val_outputs.reshape(-1, val_outputs.shape[-1]), tgt_shifted_val.reshape(-1))
                v_mask = (tgt_shifted_val.reshape(-1) != model.tgt_pad).type(torch.float32)
                v_loss = (v_l * v_mask).sum() / v_mask.sum()
                
                val_loss += v_loss.item()
        
        avg_val_loss = val_loss / (len(val_arrays[0]) // data.batch_size)

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}, Train Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

Training on cuda
Training samples: 0
Validation samples: 0


ZeroDivisionError: division by zero

In [None]:
def bleu(pred_seq, label_seq, k):  #@save
    """Compute the BLEU."""
    pred_tokens, label_tokens = pred_seq.split(' '), label_seq.split(' ')
    len_pred, len_label = len(pred_tokens), len(label_tokens)
    score = math.exp(min(0, 1 - len_label / len_pred))
    for n in range(1, min(k, len_pred) + 1):
        num_matches, label_subs = 0, collections.defaultdict(int)
        for i in range(len_label - n + 1):
            label_subs[' '.join(label_tokens[i: i + n])] += 1
        for i in range(len_pred - n + 1):
            if label_subs[' '.join(pred_tokens[i: i + n])] > 0:
                num_matches += 1
                label_subs[' '.join(pred_tokens[i: i + n])] -= 1
        score *= math.pow(num_matches / (len_pred - n + 1), math.pow(0.5, n))
    return score

In [None]:
engs = ['go .', 'i lost .', 'he\'s calm .', 'i\'m home .']
fras = ['va !', 'j\'ai perdu .', 'il est calme .', 'je suis chez moi .']
preds, _ = model.predict_step(
    data.build(engs, fras), 'cuda', data.num_steps)
for en, fr, p in zip(engs, fras, preds):
    translation = []
    for token in data.tgt_vocab.to_tokens(p):
        if token == '<eos>':
            break
        translation.append(token)
    print(f'{en} => {translation}, bleu,'
          f'{bleu(" ".join(translation), fr, k=2):.3f}')

go . => ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>'], bleu,0.000
i lost . => ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>'], bleu,0.000
he's calm . => ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>'], bleu,0.000
i'm home . => ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>'], bleu,0.000
