# Imports

In [1]:
import torch
import torch.nn as nn
from torchvision import transforms
from matplotlib import pyplot as plt
import torchvision
from torch.nn import functional as F
import re
import collections
# Report whether CUDA is usable. The device name is only queried when a GPU is
# present, because torch.cuda.get_device_name raises on CPU-only machines and
# would otherwise abort a "Restart & Run All".
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce RTX 5070


# Long Short-Term Memory (LSTM)

## Helper Functions

In [2]:

def preprocess(text):
    """Normalise raw text for character-level modelling.

    Every run of characters that are not ASCII letters is collapsed into a
    single space, and the result is lower-cased.
    """
    letters_only = re.sub('[^A-Za-z]+', ' ', text)
    return letters_only.lower()

# Initialise the names up front so they exist even before the file is read.
# NOTE(review): `text` is not read by any of the visible cells — confirm it is
# still needed.
text = ""
full_text = ""
# Read the sample corpus: `raw_text` keeps the untouched file contents and
# `full_text` the normalised (letters-and-spaces, lower-case) version.
with open('The Time Machine - Sample.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()
    full_text = preprocess(raw_text)

In [3]:
class Vocab:
    """Bidirectional mapping between tokens and integer indices.

    Args:
        tokens: A flat list of tokens, or a list of token lists (flattened
            before counting). ``None`` is treated as an empty list.
        min_freq: Tokens occurring fewer than ``min_freq`` times are left out
            of the vocabulary (and therefore map to ``<unk>``).
        reserved_tokens: Extra tokens that are always included. ``None`` is
            treated as an empty list.

    Attributes:
        token_freqs: ``(token, count)`` pairs sorted by descending count.
        idx_to_token: Sorted list of the unique tokens (including ``<unk>``);
            the position of a token in this list is its index.
        token_to_idx: Inverse lookup of ``idx_to_token``.
    """
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        # None sentinels instead of mutable [] defaults, which would be shared
        # between calls.
        tokens = [] if tokens is None else tokens
        reserved_tokens = [] if reserved_tokens is None else list(reserved_tokens)
        # Flatten a 2D list if needed
        if tokens and isinstance(tokens[0], list):
            tokens = [token for line in tokens for token in line]
        # Count token frequencies
        counter = collections.Counter(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                  reverse=True)
        # The list of unique tokens; sorting makes the index assignment
        # independent of set iteration order.
        self.idx_to_token = sorted(set(['<unk>'] + reserved_tokens + [
            token for token, freq in self.token_freqs if freq >= min_freq]))
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}

    def __len__(self):
        """Return the number of distinct tokens, including ``<unk>``."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token (or a possibly nested list/tuple of tokens) to indices.

        Tokens that are not in the vocabulary map to the ``<unk>`` index.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a sequence of indices, back to token(s).

        A list or tuple always yields a list, including when it holds zero or
        one element (these previously raised ``TypeError``). Any other sized
        object with more than one element also yields a list; everything else
        is treated as a single index.
        """
        if isinstance(indices, (list, tuple)):
            return [self.idx_to_token[int(index)] for index in indices]
        if hasattr(indices, '__len__') and len(indices) > 1:
            return [self.idx_to_token[int(index)] for index in indices]
        return self.idx_to_token[indices]

    @property
    def unk(self):  # Index for the unknown token
        return self.token_to_idx['<unk>']
    
def tokenize(text):
    """Split ``text`` into a list of its individual characters."""
    return [char for char in text]

def build(raw_text, vocab=None):
    """Turn raw text into a list of token indices plus the vocabulary used.

    The text is normalised with ``preprocess`` and split with ``tokenize``.
    When ``vocab`` is None a new ``Vocab`` is built from those tokens;
    otherwise the supplied vocabulary is used for the lookup.

    Returns:
        ``(corpus, vocab)`` where ``corpus`` holds one index per token.
    """
    cleaned = preprocess(raw_text)
    chars = tokenize(cleaned)
    if vocab is None:
        vocab = Vocab(chars)
    corpus = []
    for char in chars:
        corpus.append(vocab[char])
    return corpus, vocab


In [4]:
class LanguageData(nn.Module):
    """Character-level language-modelling dataset built from a local text file.

    The corpus is cut into overlapping windows of ``num_steps + 1`` token
    indices (stride 1). ``X`` holds the first ``num_steps`` indices of every
    window and ``Y`` the same window shifted by one position, i.e. the
    next-character targets.

    Args:
        batch_size: Batch size used by the data loaders.
        num_steps: Length of each input sequence.
        num_train: Number of leading windows used for training.
        num_val: Number of windows, directly after the training ones, used for
            validation.

    Raises:
        ValueError: If the corpus is not longer than ``num_steps``, so that no
            complete window can be formed.
    """
    def _download(self):
        """Return the contents of the sample text file in the working directory."""
        fname = "The Time Machine - Sample.txt"
        with open(fname, encoding='utf-8') as f:
            return f.read()

    def _preprocess(self, text):
        """Replace every run of non-letters with one space and lower-case the text."""
        return re.sub('[^A-Za-z]+', ' ', text).lower()

    def _tokenize(self, text):
        """Split ``text`` into a list of single characters."""
        return list(text)

    def build(self, raw_text, vocab=None):
        """Return ``(corpus, vocab)`` for ``raw_text``.

        ``corpus`` is the list of token indices; a new ``Vocab`` is created
        from the tokens when ``vocab`` is None.
        """
        tokens = self._tokenize(self._preprocess(raw_text))
        if vocab is None: vocab = Vocab(tokens)
        corpus = [vocab[token] for token in tokens]
        return corpus, vocab

    def __init__(self, batch_size, num_steps, num_train=10000, num_val=5000):
        super().__init__()
        self.batch_size = batch_size
        self.num_steps = num_steps
        self.num_train = num_train
        self.num_val = num_val
        corpus, self.vocab = self.build(self._download())
        num_examples = len(corpus) - num_steps
        if num_examples <= 0:
            # Fail early with a clear message instead of an IndexError from
            # slicing an empty tensor below.
            raise ValueError(
                f"corpus has {len(corpus)} tokens, which is too short for "
                f"num_steps={num_steps}")
        array = torch.tensor([corpus[i:i+num_steps+1]
                            for i in range(num_examples)])
        self.X, self.Y = array[:,:-1], array[:,1:]

    def get_dataloader(self, train):
        """Return the training loader (``train`` truthy) or the validation loader.

        Training uses windows ``[0, num_train)``; validation uses
        ``[num_train, num_train + num_val)``. Slices that run past the
        available windows are silently truncated, so the validation loader can
        be empty for a small corpus.
        """
        idx = slice(0, self.num_train) if train else slice(
            self.num_train, self.num_train + self.num_val)
        return self.get_tensorloader([self.X, self.Y], train, idx)
    
    def get_tensorloader(self, tensors, train, idx_slice):
        """Wrap ``tensors[0]``/``tensors[1]``, restricted to ``idx_slice``, in a DataLoader.

        Batches are shuffled only when ``train`` is truthy.
        """
        dataset = torch.utils.data.TensorDataset(
            tensors[0][idx_slice], tensors[1][idx_slice]
        )
        return torch.utils.data.DataLoader(
            dataset, batch_size=self.batch_size, shuffle=train
        )

In [5]:
class RNNScratch(nn.Module):
    """A single-layer tanh RNN implemented from scratch.

    Args:
        num_inputs: Size of each input vector.
        num_hiddens: Size of the hidden state.
        sigma: Standard deviation of the normal distribution used to
            initialise the weight matrices (biases start at zero).
    """
    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.sigma = sigma
        self.W_xh = nn.Parameter(
            torch.randn(num_inputs, num_hiddens) * sigma)
        self.W_hh = nn.Parameter(
            torch.randn(num_hiddens, num_hiddens) * sigma)
        self.b_h = nn.Parameter(torch.zeros(num_hiddens))

    def forward(self, inputs, state=None):
        """Run the RNN over a whole sequence.

        Args:
            inputs: Tensor of shape (num_steps, batch_size, num_inputs).
            state: Initial hidden state of shape (batch_size, num_hiddens),
                given either as a bare tensor (the form this method returns)
                or wrapped in a 1-tuple/list. None starts from zeros.

        Returns:
            ``(outputs, state)`` where ``outputs`` is a list with the hidden
            state after each step and ``state`` is the final hidden state.
        """
        if state is None:
            # Initial state with shape: (batch_size, num_hiddens)
            state = torch.zeros((inputs.shape[1], self.num_hiddens),
                              device=inputs.device)
        elif isinstance(state, (tuple, list)):
            # Only unwrap real containers. Unpacking a bare tensor would
            # iterate over its batch dimension, which fails for batch > 1 and
            # made the returned state impossible to feed back in.
            state, = state
        outputs = []
        for X in inputs:  # Shape of inputs: (num_steps, batch_size, num_inputs)
            state = torch.tanh(torch.matmul(X, self.W_xh) +
                             torch.matmul(state, self.W_hh) + self.b_h)
            outputs.append(state)
        return outputs, state

In [6]:
class RNNLMScratch(nn.Module):
    """The RNN-based language model implemented from scratch.

    Wraps a recurrent layer ``rnn`` (which must expose ``num_hiddens`` and
    ``sigma``) and adds a linear output layer that maps hidden states to
    vocabulary logits.

    Args:
        rnn: Recurrent module called as ``rnn(inputs, state)`` and returning
            ``(outputs, state)``.
        vocab_size: Number of distinct tokens; also the one-hot input width.
        lr: Learning rate, stored on the model for the caller's optimizer.
    """
    def __init__(self, rnn, vocab_size, lr=0.01):
        super().__init__()
        self.rnn = rnn
        self.vocab_size = vocab_size
        self.lr = lr
        self.init_params()

    def init_params(self):
        """Create the output-layer weight (scaled by ``rnn.sigma``) and zero bias."""
        self.W_hq = nn.Parameter(
            torch.randn(
                self.rnn.num_hiddens, self.vocab_size) * self.rnn.sigma)
        self.b_q = nn.Parameter(torch.zeros(self.vocab_size))

    def loss(self, Y_hat, Y):
        """Mean cross-entropy between logits ``Y_hat`` (..., vocab) and targets ``Y``."""
        return F.cross_entropy(Y_hat.reshape(-1, Y_hat.shape[-1]),
                               Y.reshape(-1))

    def _report(self, name, value, train):
        """Forward a metric to ``self.plot`` when a subclass provides one.

        This class defines no ``plot`` method itself, so by default the metric
        is dropped instead of raising AttributeError.
        """
        plot = getattr(self, 'plot', None)
        if callable(plot):
            plot(name, value, train=train)

    def training_step(self, batch):
        """Return the loss for ``batch``; the last element of ``batch`` is the target."""
        l = self.loss(self(*batch[:-1]), batch[-1])
        self._report('ppl', torch.exp(l), train=True)
        return l

    def validation_step(self, batch):
        """Return the loss for a validation ``batch`` (same layout as in training)."""
        l = self.loss(self(*batch[:-1]), batch[-1])
        self._report('ppl', torch.exp(l), train=False)
        return l

    def one_hot(self, X):
        """One-hot encode indices ``X`` of shape (batch_size, num_steps)."""
        # Output shape: (num_steps, batch_size, vocab_size)
        return F.one_hot(X.T, self.vocab_size).type(torch.float32)

    def output_layer(self, rnn_outputs):
        """Map each per-step hidden state to logits and stack them along dim 1."""
        outputs = [torch.matmul(H, self.W_hq) + self.b_q for H in rnn_outputs]
        return torch.stack(outputs, 1)

    def clip_gradients(self, grad_clip_val, model):
        """Rescale the gradients of ``model`` in place so their global L2 norm
        does not exceed ``grad_clip_val``.

        Parameters without a gradient (for example ones not used in the last
        forward pass) are skipped.
        """
        grads = [p.grad for p in model.parameters()
                 if p.requires_grad and p.grad is not None]
        if not grads:
            return
        norm = torch.sqrt(sum(torch.sum(g ** 2) for g in grads))
        if norm > grad_clip_val:
            for g in grads:
                g.mul_(grad_clip_val / norm)

    def forward(self, X, state=None):
        """Return logits of shape (batch_size, num_steps, vocab_size) for indices ``X``."""
        embs = self.one_hot(X)
        rnn_outputs, _ = self.rnn(embs, state)
        return self.output_layer(rnn_outputs)

    def predict(self, prefix, num_preds, vocab, device=None):
        """Greedily generate ``num_preds`` tokens following ``prefix``.

        Args:
            prefix: Non-empty sequence of tokens used to warm up the state.
            num_preds: Number of tokens to generate after the prefix.
            vocab: Object supporting ``vocab[token]`` and ``vocab.idx_to_token``.
            device: Device on which the input tensors are created.

        Returns:
            The prefix followed by the generated tokens, joined into a string.

        Raises:
            ValueError: If ``prefix`` is empty.
        """
        if len(prefix) == 0:
            raise ValueError("prefix must contain at least one token")
        state, outputs = None, [vocab[prefix[0]]]
        # Generation never backpropagates, so skip building the autograd graph.
        with torch.no_grad():
            for i in range(len(prefix) + num_preds - 1):
                X = torch.tensor([[outputs[-1]]], device=device)
                embs = self.one_hot(X)
                rnn_outputs, state = self.rnn(embs, state)
                if i < len(prefix) - 1:  # Warm-up period
                    outputs.append(vocab[prefix[i + 1]])
                else:  # Predict num_preds steps
                    Y = self.output_layer(rnn_outputs)
                    outputs.append(
                        int(torch.reshape(torch.argmax(Y, axis=2), (1,))))
        return ''.join([vocab.idx_to_token[i] for i in outputs])


In [7]:
class RNN(nn.Module):  #@save
    """Thin wrapper around ``nn.RNN`` that remembers its layer sizes.

    ``num_inputs`` and ``num_hiddens`` are stored on the instance and passed
    to ``nn.RNN`` as the input and hidden sizes.
    """
    def __init__(self, num_inputs, num_hiddens):
        super().__init__()
        self.num_inputs, self.num_hiddens = num_inputs, num_hiddens
        # The wrapped recurrent layer
        self.rnn = nn.RNN(input_size=num_inputs, hidden_size=num_hiddens)

    def forward(self, inputs, H=None):
        """Apply the wrapped layer and return its ``(outputs, final_state)`` pair."""
        outputs, final_state = self.rnn(inputs, H)
        return outputs, final_state

In [8]:
class RNNLM(RNNLMScratch):  #@save
    """Language model whose output layer is an ``nn.LazyLinear`` module.

    Overrides the parameter initialisation and the output layer of
    ``RNNLMScratch``; everything else is inherited.
    """
    def init_params(self):
        """Create the output projection; its input width is inferred on first use."""
        self.linear = nn.LazyLinear(out_features=self.vocab_size)

    def output_layer(self, hiddens):
        """Project ``hiddens`` to logits and exchange the first two axes."""
        logits = self.linear(hiddens)
        return logits.transpose(0, 1)


In [9]:
def train(model, data, criterion, optimizer, num_epochs=100, grad_clip=1,
          device=None):
    """Train ``model`` and report train/validation loss every 10 epochs.

    This replaces two consecutive definitions of ``train``: the second one
    silently shadowed the first and dropped its device handling.

    Args:
        model: Module called as ``model(X)`` that returns logits whose last
            dimension is the number of classes.
        data: Object exposing ``get_dataloader(train=bool)`` that yields
            ``(X, Y)`` batches.
        criterion: Loss called as ``criterion(logits_2d, targets_1d)``.
        optimizer: Optimizer already constructed over ``model.parameters()``.
        num_epochs: Number of passes over the training loader.
        grad_clip: Maximum global gradient norm passed to ``clip_grad_norm_``.
        device: Device to train on. None (the default) keeps the model on the
            device its parameters already live on, as the previously
            effective definition did; pass e.g. ``"cuda"`` to move the model
            there first. Batches are always moved to the training device.

    Returns:
        None. Progress is printed every 10th epoch.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = (first_param.device if first_param is not None
                  else torch.device("cpu"))
    else:
        device = torch.device(device)
        model.to(device)

    # Build each loader once; the training loader reshuffles on every pass.
    train_loader = data.get_dataloader(train=True)
    val_loader = data.get_dataloader(train=False)

    for epoch in range(num_epochs):
        model.train()
        avg_loss = _epoch_loss(model, train_loader, criterion, device,
                               optimizer=optimizer, grad_clip=grad_clip)

        # Validation
        model.eval()
        with torch.no_grad():
            avg_val_loss = _epoch_loss(model, val_loader, criterion, device)

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}, Train Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}")


def _epoch_loss(model, loader, criterion, device, optimizer=None,
                grad_clip=None):
    """Return the mean per-batch loss over ``loader``.

    When ``optimizer`` is given, an optimisation step (with gradient-norm
    clipping at ``grad_clip``) is taken on every batch. An empty loader
    yields NaN instead of raising ZeroDivisionError.
    """
    total_loss, num_batches = 0.0, 0
    for X, Y in loader:
        X, Y = X.to(device), Y.to(device)
        if optimizer is not None:
            optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs.reshape(-1, outputs.shape[-1]), Y.reshape(-1))
        if optimizer is not None:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           max_norm=grad_clip)
            optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    return total_loss / num_batches if num_batches else float('nan')

## Implementation From Scratch

In [10]:
class LSTMScratch(nn.Module):
    """A single-layer LSTM implemented from scratch.

    Each of the input gate, forget gate, output gate and input node owns an
    input-to-hidden weight, a hidden-to-hidden weight and a bias. Weights are
    drawn from a normal distribution scaled by ``sigma``; biases start at zero.
    """
    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.sigma = sigma

        def gate_params():
            # One (W_x, W_h, b) triple; created in this order so the random
            # draws match the order in which the parameters are registered.
            w_x = nn.Parameter(torch.randn(num_inputs, num_hiddens) * sigma)
            w_h = nn.Parameter(torch.randn(num_hiddens, num_hiddens) * sigma)
            bias = nn.Parameter(torch.zeros(num_hiddens))
            return w_x, w_h, bias

        self.W_xi, self.W_hi, self.b_i = gate_params()  # Input gate
        self.W_xf, self.W_hf, self.b_f = gate_params()  # Forget gate
        self.W_xo, self.W_ho, self.b_o = gate_params()  # Output gate
        self.W_xc, self.W_hc, self.b_c = gate_params()  # Input node

    def forward(self, inputs, H_C=None):
        """Run the LSTM over ``inputs`` of shape (num_steps, batch_size, num_inputs).

        ``H_C`` is an optional ``(H, C)`` pair of initial hidden and cell
        states; both start at zero when it is None. Returns
        ``(outputs, (H, C))`` where ``outputs`` is the list of hidden states
        after each step.
        """
        if H_C is not None:
            hidden, cell = H_C
        else:
            # Initial states with shape: (batch_size, num_hiddens)
            batch_size = inputs.shape[1]
            hidden = torch.zeros((batch_size, self.num_hiddens),
                                 device=inputs.device)
            cell = torch.zeros((batch_size, self.num_hiddens),
                               device=inputs.device)
        outputs = []
        for step_input in inputs:
            input_gate = torch.sigmoid(
                step_input @ self.W_xi + hidden @ self.W_hi + self.b_i)
            forget_gate = torch.sigmoid(
                step_input @ self.W_xf + hidden @ self.W_hf + self.b_f)
            output_gate = torch.sigmoid(
                step_input @ self.W_xo + hidden @ self.W_ho + self.b_o)
            candidate = torch.tanh(
                step_input @ self.W_xc + hidden @ self.W_hc + self.b_c)
            cell = forget_gate * cell + input_gate * candidate
            hidden = output_gate * torch.tanh(cell)
            outputs.append(hidden)
        return outputs, (hidden, cell)

    

            

In [11]:
# Experiment: character-level language model on top of the from-scratch LSTM.
# The SGD learning rate is read back from `model.lr` (set to 4 here), and the
# model is trained for 200 epochs.
data = LanguageData(batch_size=1024, num_steps=32)
lstm = LSTMScratch(num_inputs=len(data.vocab), num_hiddens=32)
model = RNNLMScratch(lstm, vocab_size=len(data.vocab), lr=4)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)

Epoch 10, Train Loss: 2.7164, Val Loss: 2.6669
Epoch 20, Train Loss: 2.3676, Val Loss: 2.3153
Epoch 30, Train Loss: 2.1864, Val Loss: 2.2178
Epoch 40, Train Loss: 2.0686, Val Loss: 2.1159
Epoch 50, Train Loss: 1.9451, Val Loss: 2.0425
Epoch 60, Train Loss: 1.8471, Val Loss: 1.9886
Epoch 70, Train Loss: 1.7663, Val Loss: 1.9761
Epoch 80, Train Loss: 1.7000, Val Loss: 1.9568
Epoch 90, Train Loss: 1.6386, Val Loss: 1.9564
Epoch 100, Train Loss: 1.5799, Val Loss: 1.9633
Epoch 110, Train Loss: 1.5256, Val Loss: 1.9617
Epoch 120, Train Loss: 1.4861, Val Loss: 1.9840
Epoch 130, Train Loss: 1.4582, Val Loss: 2.0016
Epoch 140, Train Loss: 1.4220, Val Loss: 2.0171
Epoch 150, Train Loss: 1.4072, Val Loss: 2.0085
Epoch 160, Train Loss: 1.3780, Val Loss: 2.0310
Epoch 170, Train Loss: 1.3462, Val Loss: 2.0336
Epoch 180, Train Loss: 1.3206, Val Loss: 2.0814
Epoch 190, Train Loss: 1.3132, Val Loss: 2.0926
Epoch 200, Train Loss: 1.2892, Val Loss: 2.0804


In [13]:
# Generate 20 characters after the prefix 'it has'. The input tensors are
# created on whichever device the model's parameters currently live on.
device = next(model.parameters()).device
model.predict('it has', 20, data.vocab, device=device)

'it has of the time travell'

## Concise Implementation

In [14]:
class LSTM(RNN):
    """Single-layer LSTM backed by ``nn.LSTM``."""
    def __init__(self, num_inputs, num_hiddens):
        # Initialise nn.Module directly rather than via RNN.__init__, so that
        # `self.rnn` is only ever assigned the nn.LSTM layer below.
        nn.Module.__init__(self)
        self.num_inputs, self.num_hiddens = num_inputs, num_hiddens
        self.rnn = nn.LSTM(input_size=num_inputs, hidden_size=num_hiddens)

    def forward(self, inputs, H_C=None):
        """Apply the wrapped layer; ``H_C`` is its optional initial state."""
        outputs, state = self.rnn(inputs, H_C)
        return outputs, state

# Instantiate the nn.LSTM-backed layer. `data` is not defined in this cell, so
# a cell that creates it must have been run first.
lstm = LSTM(num_inputs=len(data.vocab), num_hiddens=32)

In [15]:
# Experiment: the same language-model setup, now with the nn.LSTM-backed
# `lstm` layer (not created in this cell) wrapped in RNNLM. After training,
# generate 20 characters after the prefix 'it has' on the model's device.
data = LanguageData(batch_size=1024, num_steps=32)
model = RNNLM(lstm, vocab_size=len(data.vocab), lr=4)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)
device = next(model.parameters()).device
model.predict('it has', 20, data.vocab, device=device)

Epoch 10, Train Loss: 2.3893, Val Loss: 2.3807
Epoch 20, Train Loss: 2.1294, Val Loss: 2.1925
Epoch 30, Train Loss: 1.9624, Val Loss: 2.0632
Epoch 40, Train Loss: 1.8592, Val Loss: 2.0350
Epoch 50, Train Loss: 1.7515, Val Loss: 1.9708
Epoch 60, Train Loss: 1.6922, Val Loss: 1.9643
Epoch 70, Train Loss: 1.6352, Val Loss: 1.9695
Epoch 80, Train Loss: 1.5637, Val Loss: 1.9927
Epoch 90, Train Loss: 1.5226, Val Loss: 1.9670
Epoch 100, Train Loss: 1.4815, Val Loss: 2.0110
Epoch 110, Train Loss: 1.4593, Val Loss: 2.0170
Epoch 120, Train Loss: 1.4348, Val Loss: 2.0462
Epoch 130, Train Loss: 1.4013, Val Loss: 2.0554
Epoch 140, Train Loss: 1.3745, Val Loss: 2.0369
Epoch 150, Train Loss: 1.3465, Val Loss: 2.0517
Epoch 160, Train Loss: 1.3358, Val Loss: 2.0900
Epoch 170, Train Loss: 1.2994, Val Loss: 2.1052
Epoch 180, Train Loss: 1.2951, Val Loss: 2.1229
Epoch 190, Train Loss: 1.2816, Val Loss: 2.1243
Epoch 200, Train Loss: 1.2655, Val Loss: 2.1604


'it has a moment another a '

# Gated Recurrent Units (GRUs)

## Implementation From Scratch

In [16]:
class GRUScratch(nn.Module):
    """A single-layer GRU implemented from scratch.

    The update gate, reset gate and candidate hidden state each own an
    input-to-hidden weight, a hidden-to-hidden weight and a bias. Weights are
    drawn from a normal distribution scaled by ``sigma``; biases start at zero.
    """
    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.sigma = sigma

        def gate_params():
            # One (W_x, W_h, b) triple; created in this order so the random
            # draws match the order in which the parameters are registered.
            w_x = nn.Parameter(torch.randn(num_inputs, num_hiddens) * sigma)
            w_h = nn.Parameter(torch.randn(num_hiddens, num_hiddens) * sigma)
            bias = nn.Parameter(torch.zeros(num_hiddens))
            return w_x, w_h, bias

        self.W_xz, self.W_hz, self.b_z = gate_params()  # Update gate
        self.W_xr, self.W_hr, self.b_r = gate_params()  # Reset gate
        self.W_xh, self.W_hh, self.b_h = gate_params()  # Candidate hidden state

    def forward(self, inputs, H=None):
        """Run the GRU over ``inputs`` of shape (num_steps, batch_size, num_inputs).

        ``H`` is the optional initial hidden state (zeros when None). Returns
        ``(outputs, H)`` where ``outputs`` is a tensor stacking the hidden
        state after every step along dim 0 and ``H`` is the final state.
        """
        if H is None:
            batch_size = inputs.shape[1]
            H = torch.zeros((batch_size, self.num_hiddens), device=inputs.device)
        step_states = []
        for step_input in inputs:
            update_gate = torch.sigmoid(
                step_input @ self.W_xz + H @ self.W_hz + self.b_z)
            reset_gate = torch.sigmoid(
                step_input @ self.W_xr + H @ self.W_hr + self.b_r)
            candidate = torch.tanh(
                step_input @ self.W_xh + (reset_gate * H) @ self.W_hh + self.b_h)
            H = update_gate * H + (1 - update_gate) * candidate
            step_states.append(H)
        # Stack list into tensor
        return torch.stack(step_states), H

In [17]:
# Experiment: from-scratch GRU. It is wrapped in RNNLM rather than
# RNNLMScratch: GRUScratch returns its per-step outputs as one stacked tensor,
# and RNNLM's output layer applies a linear module to that tensor directly.
data = LanguageData(batch_size=1024, num_steps=32)
gruscratch = GRUScratch(num_inputs=len(data.vocab), num_hiddens=32)
model = RNNLM(gruscratch, vocab_size=len(data.vocab), lr=4)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)

Epoch 10, Train Loss: 2.2921, Val Loss: 2.2610
Epoch 20, Train Loss: 2.0540, Val Loss: 2.1085
Epoch 30, Train Loss: 1.9041, Val Loss: 2.0376
Epoch 40, Train Loss: 1.7825, Val Loss: 1.9793
Epoch 50, Train Loss: 1.6834, Val Loss: 1.9564
Epoch 60, Train Loss: 1.5717, Val Loss: 1.9509
Epoch 70, Train Loss: 1.5304, Val Loss: 2.0264
Epoch 80, Train Loss: 1.4759, Val Loss: 2.0018
Epoch 90, Train Loss: 1.4211, Val Loss: 2.0265
Epoch 100, Train Loss: 1.4012, Val Loss: 2.0878
Epoch 110, Train Loss: 1.3970, Val Loss: 2.0825
Epoch 120, Train Loss: 1.3642, Val Loss: 2.0705
Epoch 130, Train Loss: 1.3380, Val Loss: 2.0991
Epoch 140, Train Loss: 1.3145, Val Loss: 2.1121
Epoch 150, Train Loss: 1.3068, Val Loss: 2.1344
Epoch 160, Train Loss: 1.3072, Val Loss: 2.1521
Epoch 170, Train Loss: 1.2845, Val Loss: 2.1829
Epoch 180, Train Loss: 1.2893, Val Loss: 2.1811
Epoch 190, Train Loss: 1.2634, Val Loss: 2.1888
Epoch 200, Train Loss: 1.2666, Val Loss: 2.1820


In [18]:
# Generate 20 characters after the prefix 'it has' with the model trained in
# the previous cell, on the device its parameters live on.
device = next(model.parameters()).device
model.predict('it has', 20, data.vocab, device=device)

'it has all have a recentio'

## Concise Implementation

In [19]:
class GRU(RNN):
    """Single-layer GRU backed by ``nn.GRU``; ``forward`` is inherited from ``RNN``."""
    def __init__(self, num_inputs, num_hiddens):
        # Initialise nn.Module directly rather than via RNN.__init__, so that
        # `self.rnn` is only ever assigned the nn.GRU layer below.
        nn.Module.__init__(self)
        self.num_inputs, self.num_hiddens = num_inputs, num_hiddens
        self.rnn = nn.GRU(input_size=num_inputs, hidden_size=num_hiddens)
    
    

In [20]:
# Experiment: nn.GRU-backed layer (32 hidden units) wrapped in RNNLM, trained
# with SGD at the learning rate stored on the model (4) for 200 epochs.
data = LanguageData(batch_size=1024, num_steps=32)
gru = GRU(num_inputs=len(data.vocab), num_hiddens=32)
model = RNNLM(gru, vocab_size=len(data.vocab), lr=4)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)

Epoch 10, Train Loss: 2.2388, Val Loss: 2.2328
Epoch 20, Train Loss: 2.0230, Val Loss: 2.0880
Epoch 30, Train Loss: 1.8420, Val Loss: 1.9910
Epoch 40, Train Loss: 1.7375, Val Loss: 1.9595
Epoch 50, Train Loss: 1.6353, Val Loss: 1.9155
Epoch 60, Train Loss: 1.5533, Val Loss: 1.9177
Epoch 70, Train Loss: 1.5008, Val Loss: 1.9366
Epoch 80, Train Loss: 1.4671, Val Loss: 1.9608
Epoch 90, Train Loss: 1.4233, Val Loss: 1.9787
Epoch 100, Train Loss: 1.4138, Val Loss: 2.0101
Epoch 110, Train Loss: 1.3929, Val Loss: 2.0419
Epoch 120, Train Loss: 1.3887, Val Loss: 2.0872
Epoch 130, Train Loss: 1.3722, Val Loss: 2.0425
Epoch 140, Train Loss: 1.3333, Val Loss: 2.1189
Epoch 150, Train Loss: 1.3217, Val Loss: 2.0908
Epoch 160, Train Loss: 1.3113, Val Loss: 2.0896
Epoch 170, Train Loss: 1.3023, Val Loss: 2.1237
Epoch 180, Train Loss: 1.2957, Val Loss: 2.1437
Epoch 190, Train Loss: 1.2814, Val Loss: 2.0928
Epoch 200, Train Loss: 1.2770, Val Loss: 2.1259


In [21]:
# Generate 20 characters after the prefix 'it has' with the model trained in
# the previous cell, on the device its parameters live on.
device = next(model.parameters()).device
model.predict('it has', 20, data.vocab, device=device)

'it has experimental getter'

# Deep Recurrent Neural Networks

## Implementation From Scratch

In [22]:
class StackedRNNScratch(nn.Module):
    """A stack of ``num_layers`` from-scratch RNN layers.

    The first layer consumes vectors of size ``num_inputs``; every further
    layer consumes the ``num_hiddens``-sized outputs of the layer below it.

    Args:
        num_inputs: Size of each input vector.
        num_hiddens: Hidden size of every layer.
        num_layers: Number of stacked layers.
        sigma: Weight-initialisation scale passed on to each layer.
    """
    def __init__(self, num_inputs, num_hiddens, num_layers, sigma=0.01):
        super().__init__()
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.sigma = sigma
        # nn.Sequential is used only as an indexable container that registers
        # the layers; the stack is never called as a whole.
        self.rnns = nn.Sequential(*[RNNScratch(
            num_inputs if i==0 else num_hiddens, num_hiddens, sigma)
                                    for i in range(num_layers)])

    def forward(self, inputs, Hs=None):
        """Run ``inputs`` through every layer in turn.

        Args:
            inputs: Sequence tensor fed to the first layer.
            Hs: Optional sequence with one initial state per layer; None
                starts every layer from its default state. The passed
                sequence is not modified.

        Returns:
            ``(outputs, Hs)`` where ``outputs`` is the top layer's per-step
            output stacked along dim 0 and ``Hs`` is a new list holding the
            final state of each layer.
        """
        outputs = inputs
        # Work on a copy so the caller's list is never mutated in place.
        Hs = [None] * self.num_layers if Hs is None else list(Hs)
        for i in range(self.num_layers):
            outputs, Hs[i] = self.rnns[i](outputs, Hs[i])
            outputs = torch.stack(outputs, 0)
        return outputs, Hs

In [23]:
# Experiment: two stacked from-scratch RNN layers (32 hidden units each)
# wrapped in RNNLMScratch. This run uses a learning rate of 2.
data = LanguageData(batch_size=1024, num_steps=32)
rnn_block = StackedRNNScratch(num_inputs=len(data.vocab),
                              num_hiddens=32, num_layers=2)
model = RNNLMScratch(rnn_block, vocab_size=len(data.vocab), lr=2)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)

Epoch 10, Train Loss: 2.8409, Val Loss: 2.8241
Epoch 20, Train Loss: 2.8343, Val Loss: 2.8146
Epoch 30, Train Loss: 2.5086, Val Loss: 2.4841
Epoch 40, Train Loss: 2.2571, Val Loss: 2.2452
Epoch 50, Train Loss: 2.1329, Val Loss: 2.2005
Epoch 60, Train Loss: 1.9893, Val Loss: 2.0914
Epoch 70, Train Loss: 1.9039, Val Loss: 2.0356
Epoch 80, Train Loss: 1.8207, Val Loss: 2.0229
Epoch 90, Train Loss: 1.7252, Val Loss: 2.0115
Epoch 100, Train Loss: 1.6746, Val Loss: 2.0035
Epoch 110, Train Loss: 1.6355, Val Loss: 2.0068
Epoch 120, Train Loss: 1.5823, Val Loss: 2.0303
Epoch 130, Train Loss: 1.5441, Val Loss: 2.0137
Epoch 140, Train Loss: 1.5182, Val Loss: 2.0255
Epoch 150, Train Loss: 1.5154, Val Loss: 2.0200
Epoch 160, Train Loss: 1.4800, Val Loss: 2.0565
Epoch 170, Train Loss: 1.4756, Val Loss: 2.0670
Epoch 180, Train Loss: 1.4749, Val Loss: 2.0468
Epoch 190, Train Loss: 1.4484, Val Loss: 2.0729
Epoch 200, Train Loss: 1.4666, Val Loss: 2.0859


In [24]:
# Generate 20 characters after the prefix 'it has' with the model trained in
# the previous cell, on the device its parameters live on.
device = next(model.parameters()).device
model.predict('it has', 20, data.vocab, device=device)

'it has expect a sigs and a'

## Concise Implementation

In [25]:
class GRU(RNN):  #@save
    """The multilayer GRU model, backed by ``nn.GRU``.

    This definition reuses the name of an earlier single-layer ``GRU`` class
    in this notebook. ``num_layers`` therefore defaults to 1, so the earlier
    two-argument call ``GRU(num_inputs, num_hiddens)`` keeps working after
    this cell has been run.

    Args:
        num_inputs: Size of each input vector.
        num_hiddens: Hidden size of every layer.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout value forwarded to ``nn.GRU``.
    """
    def __init__(self, num_inputs, num_hiddens, num_layers=1, dropout=0):
        # Initialise nn.Module directly rather than via RNN.__init__, so that
        # `self.rnn` is only ever assigned the nn.GRU layer below.
        nn.Module.__init__(self)
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.dropout = dropout
        self.rnn = nn.GRU(num_inputs, num_hiddens, num_layers,
                          dropout=dropout)

In [26]:
# Experiment: two-layer nn.GRU (32 hidden units per layer) wrapped in RNNLM,
# trained with SGD at learning rate 2 for 200 epochs.
data = LanguageData(batch_size=1024, num_steps=32)
gru = GRU(num_inputs=len(data.vocab), num_hiddens=32, num_layers=2)
model = RNNLM(gru, vocab_size=len(data.vocab), lr=2)
train(model, data, nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=model.lr), num_epochs=200)

Epoch 10, Train Loss: 2.6632, Val Loss: 2.6238
Epoch 20, Train Loss: 2.2923, Val Loss: 2.2718
Epoch 30, Train Loss: 2.1271, Val Loss: 2.1554
Epoch 40, Train Loss: 1.9892, Val Loss: 2.0818
Epoch 50, Train Loss: 1.8747, Val Loss: 1.9976
Epoch 60, Train Loss: 1.7470, Val Loss: 1.9572
Epoch 70, Train Loss: 1.6434, Val Loss: 1.9385
Epoch 80, Train Loss: 1.5625, Val Loss: 1.9301
Epoch 90, Train Loss: 1.4876, Val Loss: 1.9630
Epoch 100, Train Loss: 1.4128, Val Loss: 1.9737
Epoch 110, Train Loss: 1.3538, Val Loss: 1.9884
Epoch 120, Train Loss: 1.2944, Val Loss: 2.0166
Epoch 130, Train Loss: 1.2428, Val Loss: 2.0621
Epoch 140, Train Loss: 1.1949, Val Loss: 2.0937
Epoch 150, Train Loss: 1.1767, Val Loss: 2.1665
Epoch 160, Train Loss: 1.1318, Val Loss: 2.1602
Epoch 170, Train Loss: 1.0935, Val Loss: 2.2200
Epoch 180, Train Loss: 1.0633, Val Loss: 2.2401
Epoch 190, Train Loss: 1.0553, Val Loss: 2.2827
Epoch 200, Train Loss: 1.0355, Val Loss: 2.3040


In [27]:
# Generate 20 characters after the prefix 'it has' with the model trained in
# the previous cell, on the device its parameters live on.
device = next(model.parameters()).device
model.predict('it has', 20, data.vocab, device=device)

'it has it there is howessi'