# Imports

In [3]:
import torch
import torch.nn as nn
from torchvision import transforms
from matplotlib import pyplot as plt
import torchvision
from torch.nn import functional as F
import re
import collections

# LSTM

## Helper Functions

In [4]:

def preprocess(text):
    return re.sub('[^A-Za-z]+', ' ', text).lower()

text = ""
full_text = ""
with open('The Time Machine - Sample.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()
    full_text = preprocess(raw_text)

In [52]:
class Vocab:
    """Vocabulary for text."""
    def __init__(self, tokens=[], min_freq=0, reserved_tokens=[]):
        """Defined in :numref:`sec_text-sequence`"""
        # Flatten a 2D list if needed
        if tokens and isinstance(tokens[0], list):
            tokens = [token for line in tokens for token in line]
        # Count token frequencies
        counter = collections.Counter(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                  reverse=True)
        # The list of unique tokens
        self.idx_to_token = list(sorted(set(['<unk>'] + reserved_tokens + [
            token for token, freq in self.token_freqs if freq >= min_freq])))
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if hasattr(indices, '__len__') and len(indices) > 1:
            return [self.idx_to_token[int(index)] for index in indices]
        return self.idx_to_token[indices]

    @property
    def unk(self):  # Index for the unknown token
        return self.token_to_idx['<unk>']
    
def tokenize(text):
    return list(text)

def build(raw_text, vocab=None):
    tokens = tokenize(preprocess(raw_text))
    if vocab is None: vocab = Vocab(tokens)
    corpus = [vocab[token] for token in tokens]
    return corpus, vocab


In [6]:
class LanguageData(nn.Module):
    def _download(self):
        fname = "The Time Machine - Sample.txt"
        with open(fname, encoding='utf-8') as f:
            return f.read()

    def _preprocess(self, text):
        """Defined in :numref:`sec_text-sequence`"""
        return re.sub('[^A-Za-z]+', ' ', text).lower()

    def _tokenize(self, text):
        """Defined in :numref:`sec_text-sequence`"""
        return list(text)

    def build(self, raw_text, vocab=None):
        """Defined in :numref:`sec_text-sequence`"""
        tokens = self._tokenize(self._preprocess(raw_text))
        if vocab is None: vocab = Vocab(tokens)
        corpus = [vocab[token] for token in tokens]
        return corpus, vocab

    def __init__(self, batch_size, num_steps, num_train=10000, num_val=5000):
        """Defined in :numref:`sec_language-model`"""
        super().__init__()
        self.batch_size = batch_size
        self.num_steps = num_steps
        self.num_train = num_train
        self.num_val = num_val
        corpus, self.vocab = self.build(self._download())
        array = torch.tensor([corpus[i:i+num_steps+1]
                            for i in range(len(corpus)-num_steps)])
        self.X, self.Y = array[:,:-1], array[:,1:]

    def get_dataloader(self, train):
        """Defined in :numref:`subsec_partitioning-seqs`"""
        idx = slice(0, self.num_train) if train else slice(
            self.num_train, self.num_train + self.num_val)
        return self.get_tensorloader([self.X, self.Y], train, idx)
    
    def get_tensorloader(self, tensors, train, idx_slice):
        dataset = torch.utils.data.TensorDataset(
            tensors[0][idx_slice], tensors[1][idx_slice]
        )
        return torch.utils.data.DataLoader(
            dataset, batch_size=self.batch_size, shuffle=train
        )

In [7]:
class RNNScratch(nn.Module):
    """The RNN model implemented from scratch.

    Defined in :numref:`sec_rnn-scratch`"""
    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.sigma = sigma
        self.W_xh = nn.Parameter(
            torch.randn(num_inputs, num_hiddens) * sigma)
        self.W_hh = nn.Parameter(
            torch.randn(num_hiddens, num_hiddens) * sigma)
        self.b_h = nn.Parameter(torch.zeros(num_hiddens))

    def forward(self, inputs, state=None):
        """Defined in :numref:`sec_rnn-scratch`"""
        if state is None:
            # Initial state with shape: (batch_size, num_hiddens)
            state = torch.zeros((inputs.shape[1], self.num_hiddens),
                              device=inputs.device)
        else:
            state, = state
        outputs = []
        for X in inputs:  # Shape of inputs: (num_steps, batch_size, num_inputs)
            state = torch.tanh(torch.matmul(X, self.W_xh) +
                             torch.matmul(state, self.W_hh) + self.b_h)
            outputs.append(state)
        return outputs, state

In [8]:
class RNNLMScratch(nn.Module): 
    """The RNN-based language model implemented from scratch."""
    def __init__(self, rnn, vocab_size, lr=0.01):
        super().__init__()
        self.rnn = rnn
        self.vocab_size = vocab_size
        self.lr = lr
        self.init_params()

    def init_params(self):
        self.W_hq = nn.Parameter(
            torch.randn(
                self.rnn.num_hiddens, self.vocab_size) * self.rnn.sigma)
        self.b_q = nn.Parameter(torch.zeros(self.vocab_size))

    def training_step(self, batch):
        l = self.loss(self(*batch[:-1]), batch[-1])
        self.plot('ppl', torch.exp(l), train=True)
        return l

    def validation_step(self, batch):
        l = self.loss(self(*batch[:-1]), batch[-1])
        self.plot('ppl', torch.exp(l), train=False)
    
    def one_hot(self, X):
        # Output shape: (num_steps, batch_size, vocab_size)
        return F.one_hot(X.T, self.vocab_size).type(torch.float32)
    
    def output_layer(self, rnn_outputs):
        outputs = [torch.matmul(H, self.W_hq) + self.b_q for H in rnn_outputs]
        return torch.stack(outputs, 1)
    
    
    
    def clip_gradients(self, grad_clip_val, model):
        params = [p for p in model.parameters() if p.requires_grad]
        norm = torch.sqrt(sum(torch.sum((p.grad ** 2)) for p in params))
        if norm > grad_clip_val:
            for param in params:
                param.grad[:] *= grad_clip_val / norm

    def forward(self, X, state=None):
        """Defined in :numref:`sec_rnn-scratch`"""
        embs = self.one_hot(X)
        rnn_outputs, _ = self.rnn(embs, state)
        return self.output_layer(rnn_outputs)

    def predict(self, prefix, num_preds, vocab, device=None):
        """Defined in :numref:`sec_rnn-scratch`"""
        state, outputs = None, [vocab[prefix[0]]]
        for i in range(len(prefix) + num_preds - 1):
            X = torch.tensor([[outputs[-1]]], device=device)
            embs = self.one_hot(X)
            rnn_outputs, state = self.rnn(embs, state)
            if i < len(prefix) - 1:  # Warm-up period
                outputs.append(vocab[prefix[i + 1]])
            else:  # Predict num_preds steps
                Y = self.output_layer(rnn_outputs)
                outputs.append(int(torch.reshape(torch.argmax(Y, axis=2), (1,))))
        return ''.join([vocab.idx_to_token[i] for i in outputs])


In [9]:
class RNN(nn.Module):  #@save
    """The RNN model implemented with high-level APIs."""
    def __init__(self, num_inputs, num_hiddens):
        super().__init__()
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        # Initialize the RNN layer 
        self.rnn = nn.RNN(num_inputs, num_hiddens)

    def forward(self, inputs, H=None):
        return self.rnn(inputs, H)

In [10]:
class RNNLM(RNNLMScratch):  #@save
    """The RNN-based language model implemented with high-level APIs."""
    def init_params(self):
        self.linear = nn.LazyLinear(self.vocab_size)

    def output_layer(self, hiddens):
        return self.linear(hiddens).swapaxes(0, 1)


In [11]:
def train(model, data, criterion, optimizer, num_epochs=100, grad_clip=1):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for X, Y in data.get_dataloader(train=True):
            X, Y = X.to(device), Y.to(device)
            optimizer.zero_grad()
            outputs = model(X)
            loss = criterion(outputs.reshape(-1, outputs.shape[-1]), Y.reshape(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip)
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(data.get_dataloader(train=True))

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_val, Y_val in data.get_dataloader(train=False):
                X_val, Y_val = X_val.to(device), Y_val.to(device)
                val_outputs = model(X_val)
                v_loss = criterion(val_outputs.reshape(-1, val_outputs.shape[-1]), Y_val.reshape(-1))
                val_loss += v_loss.item()
        avg_val_loss = val_loss / len(data.get_dataloader(train=False))

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}, Train Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

In [60]:
class DataModule:
    def __init__(self, root='../data'):
        self.root = root

    def get_dataloader(self, train):
        raise NotImplementedError

    def train_dataloader(self):
        return self.get_dataloader(train=True)

    def val_dataloader(self):
        return self.get_dataloader(train=False)

    def get_tensorloader(self, tensors, train, indices=slice(0, None)):
        tensors = tuple(a[indices] for a in tensors)
        dataset = torch.utils.data.TensorDataset(*tensors)
        return torch.utils.data.DataLoader(dataset, self.batch_size,
                                           shuffle=train)

## Implementation From Scratch

In [12]:
class LSTMScratch(nn.Module):
    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens  
        self.sigma = sigma

        init_weight = lambda *shape: nn.Parameter(torch.randn(*shape) * sigma)
        triple = lambda: (init_weight(num_inputs, num_hiddens),
                          init_weight(num_hiddens, num_hiddens),
                          nn.Parameter(torch.zeros(num_hiddens)))
        self.W_xi, self.W_hi, self.b_i = triple()  # Input gate
        self.W_xf, self.W_hf, self.b_f = triple()  # Forget gate
        self.W_xo, self.W_ho, self.b_o = triple()  # Output gate
        self.W_xc, self.W_hc, self.b_c = triple()  # Input node

    def forward(self, inputs, H_C=None):
        if H_C is None:
            # Initial state with shape: (batch_size, num_hiddens)
            H = torch.zeros((inputs.shape[1], self.num_hiddens),
                        device=inputs.device)
            C = torch.zeros((inputs.shape[1], self.num_hiddens),
                        device=inputs.device)
        else:
            H, C = H_C
        outputs = []
        for X in inputs:
            I = torch.sigmoid(torch.matmul(X, self.W_xi) +
                            torch.matmul(H, self.W_hi) + self.b_i)
            F = torch.sigmoid(torch.matmul(X, self.W_xf) +
                            torch.matmul(H, self.W_hf) + self.b_f)
            O = torch.sigmoid(torch.matmul(X, self.W_xo) +
                            torch.matmul(H, self.W_ho) + self.b_o)
            C_tilde = torch.tanh(torch.matmul(X, self.W_xc) +
                            torch.matmul(H, self.W_hc) + self.b_c)
            C = F * C + I * C_tilde
            H = O * torch.tanh(C)
            outputs.append(H)
        return outputs, (H, C)

    

            

In [13]:
data = LanguageData(batch_size=1024, num_steps=32)
lstm = LSTMScratch(num_inputs=len(data.vocab), num_hiddens=32)
model = RNNLMScratch(lstm, vocab_size=len(data.vocab), lr=4)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)

Epoch 10, Train Loss: 2.7088, Val Loss: 2.6571
Epoch 20, Train Loss: 2.3641, Val Loss: 2.3168
Epoch 30, Train Loss: 2.1869, Val Loss: 2.2027
Epoch 40, Train Loss: 2.0476, Val Loss: 2.1019
Epoch 50, Train Loss: 1.9135, Val Loss: 2.0197
Epoch 60, Train Loss: 1.8205, Val Loss: 1.9956
Epoch 70, Train Loss: 1.7222, Val Loss: 1.9564
Epoch 80, Train Loss: 1.6446, Val Loss: 1.9523
Epoch 90, Train Loss: 1.5915, Val Loss: 1.9457
Epoch 100, Train Loss: 1.5285, Val Loss: 1.9630
Epoch 110, Train Loss: 1.4944, Val Loss: 1.9349
Epoch 120, Train Loss: 1.4629, Val Loss: 1.9636
Epoch 130, Train Loss: 1.4284, Val Loss: 1.9909
Epoch 140, Train Loss: 1.3971, Val Loss: 2.0054
Epoch 150, Train Loss: 1.3635, Val Loss: 2.0093
Epoch 160, Train Loss: 1.3501, Val Loss: 2.0311
Epoch 170, Train Loss: 1.3215, Val Loss: 2.0382
Epoch 180, Train Loss: 1.3067, Val Loss: 2.0505
Epoch 190, Train Loss: 1.2822, Val Loss: 2.0683
Epoch 200, Train Loss: 1.2731, Val Loss: 2.1052


In [14]:
device = next(model.parameters()).device
model.predict('it has', 20, data.vocab, device=device)

'it has a so in the german '

## Concise Implementation

In [15]:
class LSTM(RNN):
    def __init__(self, num_inputs, num_hiddens):
        nn.Module.__init__(self)
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.rnn = nn.LSTM(num_inputs, num_hiddens)

    def forward(self, inputs, H_C=None):
        return self.rnn(inputs, H_C)

lstm = LSTM(num_inputs=len(data.vocab), num_hiddens=32)

In [16]:
data = LanguageData(batch_size=1024, num_steps=32)
model = RNNLM(lstm, vocab_size=len(data.vocab), lr=4)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)
device = next(model.parameters()).device
model.predict('it has', 20, data.vocab, device=device)

Epoch 10, Train Loss: 2.4146, Val Loss: 2.3773
Epoch 20, Train Loss: 2.1465, Val Loss: 2.1725
Epoch 30, Train Loss: 1.9817, Val Loss: 2.0954
Epoch 40, Train Loss: 1.8703, Val Loss: 2.0158
Epoch 50, Train Loss: 1.7800, Val Loss: 1.9962
Epoch 60, Train Loss: 1.7051, Val Loss: 1.9677
Epoch 70, Train Loss: 1.6464, Val Loss: 1.9678
Epoch 80, Train Loss: 1.5794, Val Loss: 1.9685
Epoch 90, Train Loss: 1.5441, Val Loss: 1.9715
Epoch 100, Train Loss: 1.5053, Val Loss: 1.9587
Epoch 110, Train Loss: 1.4541, Val Loss: 1.9722
Epoch 120, Train Loss: 1.4376, Val Loss: 1.9987
Epoch 130, Train Loss: 1.4158, Val Loss: 2.0073
Epoch 140, Train Loss: 1.4006, Val Loss: 2.0105
Epoch 150, Train Loss: 1.3763, Val Loss: 2.0418
Epoch 160, Train Loss: 1.3647, Val Loss: 2.0605
Epoch 170, Train Loss: 1.3282, Val Loss: 2.0384
Epoch 180, Train Loss: 1.3246, Val Loss: 2.0884
Epoch 190, Train Loss: 1.3007, Val Loss: 2.0760
Epoch 200, Train Loss: 1.2944, Val Loss: 2.0749


'it has of a mather an and '

# Gated Recurrent Units (GRUs)

## Implementation From Scratch

In [17]:
class GRUScratch(nn.Module):
    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.sigma = sigma

        init_weight = lambda *shape: nn.Parameter(torch.randn(*shape) * sigma)
        triple = lambda: (init_weight(num_inputs, num_hiddens),
                          init_weight(num_hiddens, num_hiddens),
                          nn.Parameter(torch.zeros(num_hiddens)))
        self.W_xz, self.W_hz, self.b_z = triple()  # Update gate
        self.W_xr, self.W_hr, self.b_r = triple()  # Reset gate
        self.W_xh, self.W_hh, self.b_h = triple()  # Candidate hidden state

    def forward(self, inputs, H=None):
        if H is None:
            H = torch.zeros((inputs.shape[1], self.num_hiddens), device=inputs.device)
        outputs = []
        for X in inputs:
            Z = torch.sigmoid(torch.matmul(X, self.W_xz) +
                              torch.matmul(H, self.W_hz) + self.b_z)
            R = torch.sigmoid(torch.matmul(X, self.W_xr) +
                              torch.matmul(H, self.W_hr) + self.b_r)
            H_tilde = torch.tanh(torch.matmul(X, self.W_xh) +
                                 torch.matmul(R * H, self.W_hh) + self.b_h)
            H = Z * H + (1 - Z) * H_tilde
            outputs.append(H)
        outputs = torch.stack(outputs)  # Stack list into tensor
        return outputs, H

In [18]:
data = LanguageData(batch_size=1024, num_steps=32)
gruscratch = GRUScratch(num_inputs=len(data.vocab), num_hiddens=32)
model = RNNLM(gruscratch, vocab_size=len(data.vocab), lr=4)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)

Epoch 10, Train Loss: 2.2841, Val Loss: 2.2779
Epoch 20, Train Loss: 2.0408, Val Loss: 2.1516
Epoch 30, Train Loss: 1.9200, Val Loss: 2.0534
Epoch 40, Train Loss: 1.7732, Val Loss: 1.9736
Epoch 50, Train Loss: 1.6670, Val Loss: 1.9513
Epoch 60, Train Loss: 1.5774, Val Loss: 1.9577
Epoch 70, Train Loss: 1.5210, Val Loss: 2.0112
Epoch 80, Train Loss: 1.4570, Val Loss: 2.0196
Epoch 90, Train Loss: 1.4177, Val Loss: 2.0410
Epoch 100, Train Loss: 1.4048, Val Loss: 2.1077
Epoch 110, Train Loss: 1.3668, Val Loss: 2.0886
Epoch 120, Train Loss: 1.3337, Val Loss: 2.1117
Epoch 130, Train Loss: 1.3406, Val Loss: 2.1284
Epoch 140, Train Loss: 1.3075, Val Loss: 2.1511
Epoch 150, Train Loss: 1.2880, Val Loss: 2.1690
Epoch 160, Train Loss: 1.3116, Val Loss: 2.1951
Epoch 170, Train Loss: 1.2695, Val Loss: 2.1903
Epoch 180, Train Loss: 1.2634, Val Loss: 2.1893
Epoch 190, Train Loss: 1.2309, Val Loss: 2.2347
Epoch 200, Train Loss: 1.2331, Val Loss: 2.2156


In [19]:
device = next(model.parameters()).device
model.predict('it has', 20, data.vocab, device=device)

'it has a mite at sochight '

## Concise Implementation

In [20]:
class GRU(RNN):
    def __init__(self, num_inputs, num_hiddens):
        nn.Module.__init__(self)
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.rnn = nn.GRU(num_inputs, num_hiddens)
    
    

In [21]:
data = LanguageData(batch_size=1024, num_steps=32)
gru = GRU(num_inputs=len(data.vocab), num_hiddens=32)
model = RNNLM(gru, vocab_size=len(data.vocab), lr=4)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)

Epoch 10, Train Loss: 2.2376, Val Loss: 2.2204
Epoch 20, Train Loss: 2.0194, Val Loss: 2.1080
Epoch 30, Train Loss: 1.8511, Val Loss: 2.0052
Epoch 40, Train Loss: 1.7170, Val Loss: 1.9654
Epoch 50, Train Loss: 1.6247, Val Loss: 1.9545
Epoch 60, Train Loss: 1.5354, Val Loss: 1.9874
Epoch 70, Train Loss: 1.4950, Val Loss: 1.9949
Epoch 80, Train Loss: 1.4493, Val Loss: 2.0399
Epoch 90, Train Loss: 1.4365, Val Loss: 2.0811
Epoch 100, Train Loss: 1.3947, Val Loss: 2.0711
Epoch 110, Train Loss: 1.3799, Val Loss: 2.1144
Epoch 120, Train Loss: 1.3501, Val Loss: 2.0949
Epoch 130, Train Loss: 1.3411, Val Loss: 2.1111
Epoch 140, Train Loss: 1.3268, Val Loss: 2.1653
Epoch 150, Train Loss: 1.3219, Val Loss: 2.1604
Epoch 160, Train Loss: 1.3250, Val Loss: 2.1541
Epoch 170, Train Loss: 1.3151, Val Loss: 2.1527
Epoch 180, Train Loss: 1.2928, Val Loss: 2.1824
Epoch 190, Train Loss: 1.2841, Val Loss: 2.1603
Epoch 200, Train Loss: 1.2782, Val Loss: 2.2223


In [22]:
device = next(model.parameters()).device
model.predict('it has', 20, data.vocab, device=device)

'it has of course the prove'

# Deep Recurrent Neural Networks

# Implementation From Scratch

In [23]:
class StackedRNNScratch(nn.Module):
    def __init__(self, num_inputs, num_hiddens, num_layers, sigma=0.01):
        super().__init__()
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.sigma = sigma
        self.rnns = nn.Sequential(*[RNNScratch(
            num_inputs if i==0 else num_hiddens, num_hiddens, sigma)
                                    for i in range(num_layers)])
    
    def forward(self, inputs, Hs=None):
        outputs = inputs
        if Hs is None: Hs = [None] * self.num_layers
        for i in range(self.num_layers):
            outputs, Hs[i] = self.rnns[i](outputs, Hs[i])
            outputs = torch.stack(outputs, 0)
        return outputs, Hs

In [24]:
data = LanguageData(batch_size=1024, num_steps=32)
rnn_block = StackedRNNScratch(num_inputs=len(data.vocab),
                              num_hiddens=32, num_layers=2)
model = RNNLMScratch(rnn_block, vocab_size=len(data.vocab), lr=2)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)

Epoch 10, Train Loss: 2.8396, Val Loss: 2.8229
Epoch 20, Train Loss: 2.8276, Val Loss: 2.8007
Epoch 30, Train Loss: 2.5537, Val Loss: 2.4737
Epoch 40, Train Loss: 2.2566, Val Loss: 2.2551
Epoch 50, Train Loss: 2.0935, Val Loss: 2.1659
Epoch 60, Train Loss: 1.9976, Val Loss: 2.1216
Epoch 70, Train Loss: 1.8881, Val Loss: 2.0663
Epoch 80, Train Loss: 1.8163, Val Loss: 2.0127
Epoch 90, Train Loss: 1.7493, Val Loss: 2.0306
Epoch 100, Train Loss: 1.7006, Val Loss: 2.0811
Epoch 110, Train Loss: 1.6356, Val Loss: 2.0553
Epoch 120, Train Loss: 1.6043, Val Loss: 2.0981
Epoch 130, Train Loss: 1.5676, Val Loss: 2.0388
Epoch 140, Train Loss: 1.5728, Val Loss: 2.1009
Epoch 150, Train Loss: 1.5041, Val Loss: 2.0374
Epoch 160, Train Loss: 1.4739, Val Loss: 2.0400
Epoch 170, Train Loss: 1.4937, Val Loss: 2.1322
Epoch 180, Train Loss: 1.4868, Val Loss: 2.0960
Epoch 190, Train Loss: 1.4463, Val Loss: 2.0914
Epoch 200, Train Loss: 1.4511, Val Loss: 2.0952


In [25]:
device = next(model.parameters()).device
model.predict('it has', 20, data.vocab, device=device)

'it hasted the time travell'

# Concise Implementation

In [26]:
class GRU(RNN):  #@save
    """The multilayer GRU model."""
    def __init__(self, num_inputs, num_hiddens, num_layers, dropout=0):
        nn.Module.__init__(self)
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.dropout = dropout
        self.rnn = nn.GRU(num_inputs, num_hiddens, num_layers,
                          dropout=dropout)

In [27]:
data = LanguageData(batch_size=1024, num_steps=32)
gru = GRU(num_inputs=len(data.vocab), num_hiddens=32, num_layers=2)
model = RNNLM(gru, vocab_size=len(data.vocab), lr=2)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)

Epoch 10, Train Loss: 2.6674, Val Loss: 2.6077
Epoch 20, Train Loss: 2.2953, Val Loss: 2.2730
Epoch 30, Train Loss: 2.1201, Val Loss: 2.1842
Epoch 40, Train Loss: 2.0122, Val Loss: 2.0806
Epoch 50, Train Loss: 1.8669, Val Loss: 2.0612
Epoch 60, Train Loss: 1.7480, Val Loss: 1.9482
Epoch 70, Train Loss: 1.6693, Val Loss: 1.9288
Epoch 80, Train Loss: 1.5894, Val Loss: 1.9295
Epoch 90, Train Loss: 1.5115, Val Loss: 1.9230
Epoch 100, Train Loss: 1.4389, Val Loss: 1.9346
Epoch 110, Train Loss: 1.3887, Val Loss: 1.9505
Epoch 120, Train Loss: 1.3410, Val Loss: 1.9960
Epoch 130, Train Loss: 1.2757, Val Loss: 2.0441
Epoch 140, Train Loss: 1.2314, Val Loss: 2.0689
Epoch 150, Train Loss: 1.1773, Val Loss: 2.1012
Epoch 160, Train Loss: 1.1385, Val Loss: 2.1560
Epoch 170, Train Loss: 1.1079, Val Loss: 2.1809
Epoch 180, Train Loss: 1.1089, Val Loss: 2.2326
Epoch 190, Train Loss: 1.0608, Val Loss: 2.2697
Epoch 200, Train Loss: 1.0547, Val Loss: 2.3215


In [28]:
device = next(model.parameters()).device
model.predict('it has', 20, data.vocab, device=device)

'it has introduction is fou'

# Bidirectional Recurrent Neural Networks

## Implementation From Scratch

In [29]:
class BiRNNScratch(nn.Module):
    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.sigma = sigma
        self.f_rnn = RNNScratch(num_inputs, num_hiddens, sigma)
        self.b_rnn = RNNScratch(num_inputs, num_hiddens, sigma)
        self.num_hiddens *= 2  # The output dimension will be doubled
    
    def forward(self, inputs, Hs=None):
        f_H, b_H = Hs if Hs is not None else (None, None)
        f_outputs, f_H = self.f_rnn(inputs, f_H)
        b_outputs, b_H = self.b_rnn(reversed(inputs), b_H)
        outputs = [torch.cat((f, b), -1) for f, b in zip(
            f_outputs, reversed(b_outputs))]
        return outputs, (f_H, b_H)

In [33]:
data = LanguageData(batch_size=1024, num_steps=32)
birnn = BiRNNScratch(num_inputs=len(data.vocab), num_hiddens=32)
model = RNNLMScratch(birnn, vocab_size=len(data.vocab), lr=2)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)

Epoch 10, Train Loss: 1.5660, Val Loss: 1.0378
Epoch 20, Train Loss: 0.2039, Val Loss: 0.1926
Epoch 30, Train Loss: 0.1128, Val Loss: 0.1089
Epoch 40, Train Loss: 0.0932, Val Loss: 0.0914
Epoch 50, Train Loss: 0.0851, Val Loss: 0.0845
Epoch 60, Train Loss: 0.0806, Val Loss: 0.0809
Epoch 70, Train Loss: 0.0780, Val Loss: 0.0787
Epoch 80, Train Loss: 0.0762, Val Loss: 0.0772
Epoch 90, Train Loss: 0.0749, Val Loss: 0.0761
Epoch 100, Train Loss: 0.0739, Val Loss: 0.0753
Epoch 110, Train Loss: 0.0730, Val Loss: 0.0746
Epoch 120, Train Loss: 0.0725, Val Loss: 0.0741
Epoch 130, Train Loss: 0.0719, Val Loss: 0.0736
Epoch 140, Train Loss: 0.0715, Val Loss: 0.0732
Epoch 150, Train Loss: 0.0710, Val Loss: 0.0729
Epoch 160, Train Loss: 0.0707, Val Loss: 0.0726
Epoch 170, Train Loss: 0.0704, Val Loss: 0.0723
Epoch 180, Train Loss: 0.0700, Val Loss: 0.0721
Epoch 190, Train Loss: 0.0697, Val Loss: 0.0718
Epoch 200, Train Loss: 0.0695, Val Loss: 0.0717


In [34]:
model.predict('it has', 20, data.vocab, device=device)

'it hasasasasasasasasasasas'

## Consise Implementation

In [38]:
class BiGRU(RNN):
    def __init__(self, num_inputs, num_hiddens):
        super().__init__(num_inputs=num_inputs, num_hiddens=num_hiddens)
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.sigma = 0.01
        self.rnn = nn.GRU(num_inputs, num_hiddens, bidirectional=True)
        self.num_hiddens *= 2  # The output dimension will be doubled

    def forward(self, inputs, Hs=None):
        return self.rnn(inputs, Hs)

In [39]:
data = LanguageData(batch_size=1024, num_steps=32)
BiGRU = BiGRU(num_inputs=len(data.vocab), num_hiddens=32)
model = RNNLMScratch(BiGRU, vocab_size=len(data.vocab), lr=2)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)

Epoch 10, Train Loss: 0.9438, Val Loss: 0.7833
Epoch 20, Train Loss: 0.1979, Val Loss: 0.1891
Epoch 30, Train Loss: 0.1134, Val Loss: 0.1121
Epoch 40, Train Loss: 0.0934, Val Loss: 0.0937
Epoch 50, Train Loss: 0.0855, Val Loss: 0.0863
Epoch 60, Train Loss: 0.0811, Val Loss: 0.0823
Epoch 70, Train Loss: 0.0785, Val Loss: 0.0799
Epoch 80, Train Loss: 0.0766, Val Loss: 0.0782
Epoch 90, Train Loss: 0.0754, Val Loss: 0.0770
Epoch 100, Train Loss: 0.0744, Val Loss: 0.0761
Epoch 110, Train Loss: 0.0736, Val Loss: 0.0754
Epoch 120, Train Loss: 0.0728, Val Loss: 0.0748
Epoch 130, Train Loss: 0.0723, Val Loss: 0.0742
Epoch 140, Train Loss: 0.0718, Val Loss: 0.0738
Epoch 150, Train Loss: 0.0713, Val Loss: 0.0734
Epoch 160, Train Loss: 0.0709, Val Loss: 0.0731
Epoch 170, Train Loss: 0.0706, Val Loss: 0.0728
Epoch 180, Train Loss: 0.0703, Val Loss: 0.0725
Epoch 190, Train Loss: 0.0700, Val Loss: 0.0722
Epoch 200, Train Loss: 0.0697, Val Loss: 0.0720


In [40]:
model.predict('it has', 20, data.vocab)

'it hasasasasasasasasasasas'

# Machine Translation

In [101]:
reduce_sum = lambda x, *args, **kwargs: x.sum(*args, **kwargs)
astype = lambda x, *args, **kwargs: x.type(*args, **kwargs)

class MTFraEng(DataModule):
    def _download(self):
        with open('Fra Eng Bilingual Sentence Pairs/fra.txt', encoding='utf-8') as f:
            return f.read()

    def _preprocess(self, text):
        """Defined in :numref:`sec_machine_translation`"""
        # Replace non-breaking space with space
        text = text.replace('\u202f', ' ').replace('\xa0', ' ')
        # Insert space between words and punctuation marks
        no_space = lambda char, prev_char: char in ',.!?' and prev_char != ' '
        out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char
               for i, char in enumerate(text.lower())]
        return ''.join(out)

    def _tokenize(self, text, max_examples=None):
        """Defined in :numref:`sec_machine_translation`"""
        src, tgt = [], []
        for i, line in enumerate(text.split('\n')):
            if max_examples and i > max_examples: break
            parts = line.split('\t')
            if len(parts) == 2:
                # Skip empty tokens
                src.append([t for t in f'{parts[0]} <eos>'.split(' ') if t])
                tgt.append([t for t in f'{parts[1]} <eos>'.split(' ') if t])
        return src, tgt

    def __init__(self, batch_size, num_steps=9, num_train=512, num_val=128):
        """Defined in :numref:`sec_machine_translation`"""
        super(MTFraEng, self).__init__()
        self.batch_size = batch_size
        self.num_steps = num_steps
        self.num_train = num_train
        self.num_val = num_val
        self.arrays, self.src_vocab, self.tgt_vocab = self._build_arrays(
            self._download())

    def _build_arrays(self, raw_text, src_vocab=None, tgt_vocab=None):
        """Defined in :numref:`subsec_loading-seq-fixed-len`"""
        def _build_array(sentences, vocab, is_tgt=False):
            pad_or_trim = lambda seq, t: (
                seq[:t] if len(seq) > t else seq + ['<pad>'] * (t - len(seq)))
            sentences = [pad_or_trim(s, self.num_steps) for s in sentences]
            if is_tgt:
                sentences = [['<bos>'] + s for s in sentences]
            if vocab is None:
                vocab = Vocab(sentences, min_freq=2)
            array = torch.tensor([vocab[s] for s in sentences])
            valid_len = reduce_sum(
                astype(array != vocab['<pad>'], torch.int32), 1)
            return array, vocab, valid_len
        src, tgt = self._tokenize(self._preprocess(raw_text),
                                  self.num_train + self.num_val)
        src_array, src_vocab, src_valid_len = _build_array(src, src_vocab)
        tgt_array, tgt_vocab, _ = _build_array(tgt, tgt_vocab, True)
        return ((src_array, tgt_array[:,:-1], src_valid_len, tgt_array[:,1:]),
                src_vocab, tgt_vocab)

    def get_dataloader(self, train):
        """Defined in :numref:`subsec_loading-seq-fixed-len`"""
        idx = slice(0, self.num_train) if train else slice(self.num_train, None)
        return self.get_tensorloader(self.arrays, train, idx)

    def build(self, src_sentences, tgt_sentences):
        """Defined in :numref:`subsec_loading-seq-fixed-len`"""
        raw_text = '\n'.join([src + '\t' + tgt for src, tgt in zip(
            src_sentences, tgt_sentences)])
        arrays, _, _ = self._build_arrays(
            raw_text, self.src_vocab, self.tgt_vocab)
        return arrays

In [102]:
data = MTFraEng(batch_size=3)
src, tgt, src_valid_len, label = next(iter(data.train_dataloader()))
print('source:', src.type(torch.int32))
print('decoder input:', tgt.type(torch.int32))
print('source len excluding pad:', src_valid_len.type(torch.int32))
print('label:', label.type(torch.int32))

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)