In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import nltk
import numpy as np
import requests

In [3]:
from nltk.corpus import treebank, brown, conll2000
from sklearn.model_selection import train_test_split

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset

In [5]:
from tensorflow import keras

In [6]:
from tqdm import tqdm

# Part-of-Speech Tagging with a Bidirectional LSTM

In [7]:
nltk.download('treebank')
nltk.download('brown')
nltk.download('conll2000')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.


True

In [8]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [9]:
# Download all PoS-tagged sentences and place them in one list.
tagged_sentences = treebank.tagged_sents(tagset='universal') +\
                   brown.tagged_sents(tagset='universal') +\
                   conll2000.tagged_sents(tagset='universal')

print(tagged_sentences[0])
print(f"Dataset size: {len(tagged_sentences)}")

[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')]
Dataset size: 72202


In [10]:
sentences, sentence_tags = [], []

for s in tagged_sentences:
  sentence, tags = zip(*s)
  sentences.append(list(sentence))
  sentence_tags.append(list(tags))

The sentences and their respective tags are now in separate lists.

In [11]:
print(sentences[0])
print(sentence_tags[0])

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
['NOUN', 'NOUN', '.', 'NUM', 'NOUN', 'ADJ', '.', 'VERB', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'NOUN', 'NUM', '.']


In [12]:
print(len(sentences), len(sentence_tags))

72202 72202


In [13]:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

x_train, x_test, y_train, y_test = train_test_split(sentences, sentence_tags,
                                                    test_size=1 - train_ratio,
                                                    random_state=1)

x_val, x_test, y_val, y_test = train_test_split(x_test, y_test,
                                                test_size=test_ratio/(test_ratio + validation_ratio),
                                                random_state=1)

In [14]:
print(len(x_train), len(y_train))
print(len(x_val), len(y_val))
print(len(x_test), len(y_test))

54151 54151
10830 10830
7221 7221


In [15]:
sentence_tokenizer = keras.preprocessing.text.Tokenizer(oov_token='<OOV>')

In [16]:
sentence_tokenizer.fit_on_texts(x_train)

In [17]:
print(f"Vocabulary size: {len(sentence_tokenizer.word_index)}")

Vocabulary size: 52041


In [18]:
tag_tokenizer = keras.preprocessing.text.Tokenizer(oov_token='<OOV>')
tag_tokenizer.fit_on_texts(y_train)

In [19]:
print(f"Number of PoS tags: {len(tag_tokenizer.word_index)}\n")
tag_tokenizer.get_config()

Number of PoS tags: 13



{'num_words': None,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'lower': True,
 'split': ' ',
 'char_level': False,
 'oov_token': '<OOV>',
 'document_count': 54151,
 'word_counts': '{"det": 126968, "verb": 174593, "adj": 80523, "adp": 136453, "noun": 286676, "adv": 51205, ".": 142935, "pron": 44684, "conj": 35060, "num": 21461, "prt": 31229, "x": 6090}',
 'word_docs': '{"verb": 50837, "noun": 51171, "adv": 29531, "adj": 36344, ".": 53332, "det": 44747, "adp": 43855, "conj": 24383, "pron": 26965, "num": 11964, "prt": 21777, "x": 2682}',
 'index_docs': '{"3": 50837, "2": 51171, "8": 29531, "7": 36344, "4": 53332, "6": 44747, "5": 43855, "10": 24383, "9": 26965, "12": 11964, "11": 21777, "13": 2682}',
 'index_word': '{"1": "<OOV>", "2": "noun", "3": "verb", "4": ".", "5": "adp", "6": "det", "7": "adj", "8": "adv", "9": "pron", "10": "conj", "11": "prt", "12": "num", "13": "x"}',
 'word_index': '{"<OOV>": 1, "noun": 2, "verb": 3, ".": 4, "adp": 5, "det": 6, "adj": 7, "adv": 8, "pr

In [20]:
# The set of universal PoS tags.
tag_tokenizer.word_index

{'<OOV>': 1,
 'noun': 2,
 'verb': 3,
 '.': 4,
 'adp': 5,
 'det': 6,
 'adj': 7,
 'adv': 8,
 'pron': 9,
 'conj': 10,
 'prt': 11,
 'num': 12,
 'x': 13}

In [21]:
x_train_seqs = sentence_tokenizer.texts_to_sequences(x_train)

In [22]:
print(x_train_seqs[0])

[27, 86, 21, 479, 7, 2, 920, 10903, 20547, 3327, 5644, 337, 4]


In [23]:
print(f"Original: {x_train[0]}")
print(f"Reconstructed: {sentence_tokenizer.sequences_to_texts([x_train_seqs[0]])}")

Original: ['This', 'may', 'be', 'due', 'to', 'the', 'heavy', 'interlobular', 'connective', 'tissue', 'barriers', 'present', '.']
Reconstructed: ['this may be due to the heavy interlobular connective tissue barriers present .']


Next, we'll vectorize the labels (i.e. sequences of PoS tags) using its respective tokenizer.

In [24]:
y_train_seqs = tag_tokenizer.texts_to_sequences(y_train)

In [25]:
y_train_seqs[0]

[6, 3, 3, 7, 5, 6, 7, 7, 7, 2, 2, 8, 4]

In [26]:
tag_tokenizer.sequences_to_texts([y_train_seqs[0]])

['det verb verb adj adp det adj adj adj noun noun adv .']

Finally, we'll do the same with the validation inputs and labels.

In [27]:
x_val_seqs = sentence_tokenizer.texts_to_sequences(x_val)
y_val_seqs = tag_tokenizer.texts_to_sequences(y_val)

In [28]:
MAX_LENGTH = len(max(x_train_seqs, key=len))
print(f"Length of longest input sequence: {MAX_LENGTH}")

Length of longest input sequence: 161


In [29]:
x_train_padded = keras.preprocessing.sequence.pad_sequences(x_train_seqs, padding='post',
                                                            maxlen=MAX_LENGTH)

In [30]:
print(x_train_padded[0])

[   27    86    21   479     7     2   920 10903 20547  3327  5644   337
     4     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0]


We'll do the same with the training label (PoS sequences)...

In [31]:
y_train_padded = keras.preprocessing.sequence.pad_sequences(y_train_seqs, padding='post',
                                                            maxlen=MAX_LENGTH)

...and the validation dataset.

In [32]:
x_val_padded = keras.preprocessing.sequence.pad_sequences(x_val_seqs, padding='post', maxlen=MAX_LENGTH)
y_val_padded = keras.preprocessing.sequence.pad_sequences(y_val_seqs, padding='post', maxlen=MAX_LENGTH)

In [33]:
# Convert to PyTorch tensors
x_train_padded = torch.tensor(x_train_padded, dtype=torch.long)
y_train_padded = torch.tensor(y_train_padded, dtype=torch.long)

x_val_padded = torch.tensor(x_val_padded, dtype=torch.long)
y_val_padded = torch.tensor(y_val_padded, dtype=torch.long)

In [34]:
class TextPosDataset(Dataset):
    def __init__(self, texts, pos_tags):
        self.texts = texts
        self.pos_tags = pos_tags

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.pos_tags[idx]

In [35]:
# Create Dataset
train_dataset = TextPosDataset(x_train_padded, y_train_padded)
val_dataset = TextPosDataset(x_val_padded, y_val_padded)

In [36]:
# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)

In [37]:
# For the embedding layer. "+ 1" to account for the padding token.
num_tokens = len(sentence_tokenizer.word_index) + 1
embedding_dim = 128
hidden_dim = 128

# For the output layer. The number of classes corresponds to the
# number of possible tags.
num_classes = len(tag_tokenizer.word_index) + 1

In [38]:
# Set random seeds for reproducibility
torch.manual_seed(0)
np.random.seed(0)

In [39]:
class BidirectionalLSTMModel(nn.Module):
    def __init__(self, num_tokens, embedding_dim, hidden_dim, num_classes):
        super(BidirectionalLSTMModel, self).__init__()
        self.embedding = nn.Embedding(num_tokens, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.fc(x)
        return x

# Instantiate the model
model = BidirectionalLSTMModel(num_tokens, embedding_dim, hidden_dim, num_classes)


In [40]:
model

BidirectionalLSTMModel(
  (embedding): Embedding(52042, 128, padding_idx=0)
  (lstm): LSTM(128, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=14, bias=True)
)

A few notes about the model summary:<br>

The embedding layer **output** has three dimensions:
- Batch size (it's showing as "None" because we didn't specify it upfront. We'll do it when we call *model.fit*).
- Sequence length (the sequences are all the same length now after our padding step).
- Embedding dimension.
<br><br>

The LSTM outputs a vector *twice* the size of what we specified because it's bidirectional. Recall from the slides that the outputs from the two LSTMs will be concatenated before going to the output layer.
<br><br>

The final layer's **output** also has three dimensions:
- Batch size
- Sequence length
- Output dimension (the number of possible tags).

The output will be a **sequence of probability distributions** for each input sequence. One probability distribution per tag.



In [41]:
# Define the loss function and the optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters())

In [42]:
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BidirectionalLSTMModel(
  (embedding): Embedding(52042, 128, padding_idx=0)
  (lstm): LSTM(128, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=14, bias=True)
)

In [43]:
# Training and Validation Loop
num_epochs = 10

for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0
    train_loader_tqdm = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs} Training')
    for inputs, targets in train_loader_tqdm:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        outputs = outputs.view(-1, num_classes)  # Flatten for loss computation
        targets = targets.view(-1)  # Flatten for loss computation
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * inputs.size(0)
        train_loader_tqdm.set_postfix(loss=train_loss / len(train_loader.dataset))
    train_loss /= len(train_loader.dataset)

    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        val_loader_tqdm = tqdm(val_loader, desc=f'Epoch {epoch+1}/{num_epochs} Validation')
        for inputs, targets in val_loader_tqdm:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            outputs = outputs.view(-1, num_classes)
            targets = targets.view(-1)
            loss = criterion(outputs, targets)
            val_loss += loss.item() * inputs.size(0)

            _, predicted = torch.max(outputs, 1)
            mask = targets != 0  # Ignore padding tokens
            correct += (predicted[mask] == targets[mask]).sum().item()
            total += mask.sum().item()
            val_loader_tqdm.set_postfix(loss=val_loss / len(val_loader.dataset))
    val_loss /= len(val_loader.dataset)
    val_accuracy = correct / total

    # Print training and validation loss and accuracy
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}\n')


Epoch 1/10 Training: 100%|██████████| 212/212 [00:11<00:00, 18.50it/s, loss=0.781]
Epoch 1/10 Validation: 100%|██████████| 43/43 [00:01<00:00, 36.68it/s, loss=0.334]


Epoch 1/10, Training Loss: 0.7812, Validation Loss: 0.3343, Validation Accuracy: 0.8925



Epoch 2/10 Training: 100%|██████████| 212/212 [00:08<00:00, 23.80it/s, loss=0.26]
Epoch 2/10 Validation: 100%|██████████| 43/43 [00:00<00:00, 59.95it/s, loss=0.208]


Epoch 2/10, Training Loss: 0.2603, Validation Loss: 0.2084, Validation Accuracy: 0.9336



Epoch 3/10 Training: 100%|██████████| 212/212 [00:08<00:00, 24.80it/s, loss=0.175]
Epoch 3/10 Validation: 100%|██████████| 43/43 [00:00<00:00, 59.92it/s, loss=0.161]


Epoch 3/10, Training Loss: 0.1747, Validation Loss: 0.1605, Validation Accuracy: 0.9482



Epoch 4/10 Training: 100%|██████████| 212/212 [00:08<00:00, 26.26it/s, loss=0.132]
Epoch 4/10 Validation: 100%|██████████| 43/43 [00:00<00:00, 59.35it/s, loss=0.137]


Epoch 4/10, Training Loss: 0.1322, Validation Loss: 0.1370, Validation Accuracy: 0.9558



Epoch 5/10 Training: 100%|██████████| 212/212 [00:08<00:00, 25.19it/s, loss=0.105]
Epoch 5/10 Validation: 100%|██████████| 43/43 [00:00<00:00, 59.84it/s, loss=0.123]


Epoch 5/10, Training Loss: 0.1051, Validation Loss: 0.1225, Validation Accuracy: 0.9600



Epoch 6/10 Training: 100%|██████████| 212/212 [00:08<00:00, 25.33it/s, loss=0.0852]
Epoch 6/10 Validation: 100%|██████████| 43/43 [00:00<00:00, 52.85it/s, loss=0.115]


Epoch 6/10, Training Loss: 0.0852, Validation Loss: 0.1146, Validation Accuracy: 0.9628



Epoch 7/10 Training: 100%|██████████| 212/212 [00:08<00:00, 24.87it/s, loss=0.0696]
Epoch 7/10 Validation: 100%|██████████| 43/43 [00:00<00:00, 58.46it/s, loss=0.109]


Epoch 7/10, Training Loss: 0.0696, Validation Loss: 0.1093, Validation Accuracy: 0.9649



Epoch 8/10 Training: 100%|██████████| 212/212 [00:08<00:00, 24.71it/s, loss=0.057]
Epoch 8/10 Validation: 100%|██████████| 43/43 [00:00<00:00, 57.91it/s, loss=0.108]


Epoch 8/10, Training Loss: 0.0570, Validation Loss: 0.1076, Validation Accuracy: 0.9656



Epoch 9/10 Training: 100%|██████████| 212/212 [00:08<00:00, 23.83it/s, loss=0.0468]
Epoch 9/10 Validation: 100%|██████████| 43/43 [00:00<00:00, 54.15it/s, loss=0.107]


Epoch 9/10, Training Loss: 0.0468, Validation Loss: 0.1070, Validation Accuracy: 0.9662



Epoch 10/10 Training: 100%|██████████| 212/212 [00:08<00:00, 23.60it/s, loss=0.0383]
Epoch 10/10 Validation: 100%|██████████| 43/43 [00:00<00:00, 57.86it/s, loss=0.109]


Epoch 10/10, Training Loss: 0.0383, Validation Loss: 0.1095, Validation Accuracy: 0.9663



In [44]:
# Save the model
torch.save(model.state_dict(), '/content/drive/MyDrive/RNN model/bidirectional_lstm.pth')

### Evaluation

In [45]:
# Preprocess the test data and test the model.
x_test_seqs = sentence_tokenizer.texts_to_sequences(x_test)
x_test_padded = keras.preprocessing.sequence.pad_sequences(x_test_seqs, padding='post', maxlen=MAX_LENGTH)

y_test_seqs = tag_tokenizer.texts_to_sequences(y_test)
y_test_padded = keras.preprocessing.sequence.pad_sequences(y_test_seqs, padding='post', maxlen=MAX_LENGTH)

In [46]:
# Convert to Pytorch tensor
x_test_padded = torch.tensor(x_test_padded, dtype=torch.long)
y_test_padded = torch.tensor(y_test_padded, dtype=torch.long)

In [47]:
# Create Dataset
test_dataset = TextPosDataset(x_test_padded, y_test_padded)

In [48]:
# Create DataLoader
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

In [49]:
# Set the model to evaluation mode
model.eval()

test_loss = 0
correct = 0
total = 0

with torch.no_grad():
    for inputs, targets in tqdm(test_loader, desc='Evaluating'):
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        outputs = outputs.view(-1, num_classes)
        targets = targets.view(-1)

        loss = criterion(outputs, targets)
        test_loss += loss.item() * inputs.size(0)

        _, predicted = torch.max(outputs, 1)
        mask = targets != 0  # Ignore padding tokens
        correct += (predicted[mask] == targets[mask]).sum().item()
        total += mask.sum().item()

test_loss /= len(test_loader.dataset)
test_accuracy = correct / total

print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')


Evaluating: 100%|██████████| 29/29 [00:00<00:00, 48.44it/s]

Test Loss: 0.1130, Test Accuracy: 0.9660





We can now use our model to tag sentences.

In [50]:
samples = [
    "Brown refused to testify.",
    "Brown sofas are on sale.",
]

The function below takes a list of strings, tokenizes and pads them, then has the model tag them. Note that if a sentence is longer than MAX_LENGTH, it'll be truncated.

In [51]:
def tag_sentences(sentences):
  sentences_seqs = sentence_tokenizer.texts_to_sequences(sentences)
  sentences_padded = keras.preprocessing.sequence.pad_sequences(sentences_seqs,
                                                                maxlen=MAX_LENGTH,
                                                                padding='post')

  # Convert to PyTorch tensor
  sentences_padded_tensor = torch.tensor(sentences_padded, dtype=torch.long).to(device)

  # Set the model to evaluation mode
  model.eval()

  # Get the model predictions
  with torch.no_grad():
      tag_preds = model(sentences_padded_tensor)

  # Convert predictions to POS tags
  sentence_tags = []
  for i, preds in enumerate(tag_preds):
    preds = preds.cpu().numpy()  # Move to CPU and convert to numpy array

    # Extract the most probable tag from EACH probability distribution.
    # Note how we're extracting tags for only the non-padding tokens.
    tags_seq = [np.argmax(p) for p in preds[:len(sentences_seqs[i])]]

    # Convert the sentence and tag sequences back to their token counterparts.
    words = [sentence_tokenizer.index_word[w] for w in sentences_seqs[i]]
    tags = [tag_tokenizer.index_word[t] for t in tags_seq]
    sentence_tags.append(list(zip(words, tags)))

  return sentence_tags


In [52]:
tagged_sample_sentences = tag_sentences(samples)

In [53]:
print(tagged_sample_sentences[0])

[('brown', 'noun'), ('refused', 'verb'), ('to', 'prt'), ('testify', 'verb')]


In [54]:
print(tagged_sample_sentences[1])

[('brown', 'adj'), ('sofas', 'noun'), ('are', 'verb'), ('on', 'adp'), ('sale', 'noun')]


# Language Modelling With Stacked LSTMs

We'll build a language model trained on the *Art of War* by Sun Tzu.

In [7]:
# Open the file in read mode
with open('/content/drive/MyDrive/RNN model/art_of_war.txt', 'r') as file:
    # Read the entire content of the file
    art_of_war = file.read()
    print(art_of_war)


1. Sun Tzŭ said: The art of war is of vital importance to the State.

2. It is a matter of life and death, a road either to safety or to
ruin. Hence it is a subject of inquiry which can on no account be
neglected.

3. The art of war, then, is governed by five constant factors, to be
taken into account in one’s deliberations, when seeking to determine
the conditions obtaining in the field.

4. These are: (1) The Moral Law; (2) Heaven; (3) Earth; (4) The
Commander; (5) Method and discipline.

5, 6. _The Moral Law_ causes the people to be in complete accord with
their ruler, so that they will follow him regardless of their lives,
undismayed by any danger.

7. Heaven signifies night and day, cold and heat, times and seasons.

8. Earth comprises distances, great and small; danger and security;
open ground and narrow passes; the chances of life and death.

9. The Commander stands for the virtues of wisdom, sincerity,
benevolence, courage and strictness.

10. By _Method and discipline_ are to

In [8]:
art_of_war[:300]

'1. Sun Tzŭ said: The art of war is of vital importance to the State.\n\n2. It is a matter of life and death, a road either to safety or to\nruin. Hence it is a subject of inquiry which can on no account be\nneglected.\n\n3. The art of war, then, is governed by five constant factors, to be\ntaken into accou'

In [9]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)

In [10]:
tokenizer.fit_on_texts([art_of_war])

The tokenizer's internal dictionary now maps characters rather than words...

In [11]:
tokenizer.get_config()

{'num_words': None,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'lower': True,
 'split': ' ',
 'char_level': True,
 'oov_token': None,
 'document_count': 1,
 'word_counts': '{"1": 179, ".": 896, " ": 9794, "s": 3081, "u": 1467, "n": 3565, "t": 4398, "z": 20, "\\u016d": 13, "a": 3475, "i": 3573, "d": 1681, ":": 48, "h": 2558, "e": 5837, "r": 2776, "o": 3548, "f": 1238, "w": 981, "v": 478, "l": 1722, "m": 1201, "p": 769, "c": 1390, "\\n": 1443, "2": 127, ",": 634, "y": 1055, "b": 708, "j": 23, "q": 55, "g": 1007, "3": 87, "k": 345, "\\u2019": 57, "4": 66, "(": 59, ")": 59, ";": 168, "5": 58, "6": 51, "_": 62, "7": 39, "8": 36, "9": 34, "0": 38, "x": 49, "\\u2014": 16, "?": 8, "!": 8, "-": 57, "\\u201c": 3, "\\u201d": 3, "\\u0153": 7, "\\u00fc": 3, "\\u2018": 1}',
 'word_docs': '{".": 1, "i": 1, "8": 1, "3": 1, "b": 1, "1": 1, "h": 1, "7": 1, "c": 1, "t": 1, "5": 1, "x": 1, "\\u00fc": 1, "-": 1, "s": 1, "!": 1, "6": 1, ";": 1, "_": 1, "z": 1, "r": 1, " ": 1, "y": 1, "\\u2019": 1,

...and the resulting possibility space is much smaller.

In [12]:
print(f"Tokenizer \"Vocabulary\" size: {len(tokenizer.word_index)}")

Tokenizer "Vocabulary" size: 56


As we did when building the PoS tagger, we'll vectorize the book's characters into a sequence of integers, each integer mapping to a particular character.

In [13]:
seq = tokenizer.texts_to_sequences([art_of_war])[0]

In [14]:
print(f"Text length: {len(seq)}")

Text length: 61054


In [15]:
# Sanity check.
tokenizer.sequences_to_texts([seq[:10]])

['1 .   s u n   t z ŭ']

In [16]:
num_tokens = len(tokenizer.word_index) + 1
num_tokens

57

In [17]:
# Create input sequences
input_timesteps = 100
window_size = input_timesteps + 1

Here, we're creating windows of `input_timesteps + 1`. The *input_timesteps* represents our training example length. The *+1* is there to help us create the target/label for each training example. This will be clarified further below.<br><br>
In addition, we're setting *shift* to 1. This means we'll get overlapping windows shifted by 1. e.g. if the input is [1, 2, 3, 4, ...]. The first window will contain [1, 2, 3, ...], the second window will contain [2, 3, 4, ...] and so on. This is so we can have more training examples.<br><br>
Finally, we're setting *drop_remainder* to True which ensures ALL windows contain exactly N elements. i.e. once the input contains fewer than N elements, they are ignored.

In [18]:
# Create windows
windows = []
for i in range(len(seq) - window_size):
    windows.append(seq[i:i + window_size])

# Convert windows to tensor
windows = torch.tensor(windows, dtype=torch.long)

Looking at the first few windows, we can see they're all the same length and that each subsequent window is shifted over by 1. Our corpus has now been divided into segments of length `input_timesteps + 1`.

In [19]:
for i in range(3):
    window = windows[i]
    print(len(window), window.tolist())

101 [27, 21, 1, 8, 13, 5, 1, 3, 47, 49, 1, 8, 7, 4, 12, 41, 1, 3, 10, 2, 1, 7, 9, 3, 1, 6, 16, 1, 20, 7, 9, 1, 4, 8, 1, 6, 16, 1, 25, 4, 3, 7, 11, 1, 4, 17, 22, 6, 9, 3, 7, 5, 15, 2, 1, 3, 6, 1, 3, 10, 2, 1, 8, 3, 7, 3, 2, 21, 14, 14, 29, 21, 1, 4, 3, 1, 4, 8, 1, 7, 1, 17, 7, 3, 3, 2, 9, 1, 6, 16, 1, 11, 4, 16, 2, 1, 7, 5, 12, 1, 12]
101 [21, 1, 8, 13, 5, 1, 3, 47, 49, 1, 8, 7, 4, 12, 41, 1, 3, 10, 2, 1, 7, 9, 3, 1, 6, 16, 1, 20, 7, 9, 1, 4, 8, 1, 6, 16, 1, 25, 4, 3, 7, 11, 1, 4, 17, 22, 6, 9, 3, 7, 5, 15, 2, 1, 3, 6, 1, 3, 10, 2, 1, 8, 3, 7, 3, 2, 21, 14, 14, 29, 21, 1, 4, 3, 1, 4, 8, 1, 7, 1, 17, 7, 3, 3, 2, 9, 1, 6, 16, 1, 11, 4, 16, 2, 1, 7, 5, 12, 1, 12, 2]
101 [1, 8, 13, 5, 1, 3, 47, 49, 1, 8, 7, 4, 12, 41, 1, 3, 10, 2, 1, 7, 9, 3, 1, 6, 16, 1, 20, 7, 9, 1, 4, 8, 1, 6, 16, 1, 25, 4, 3, 7, 11, 1, 4, 17, 22, 6, 9, 3, 7, 5, 15, 2, 1, 3, 6, 1, 3, 10, 2, 1, 8, 3, 7, 3, 2, 21, 14, 14, 29, 21, 1, 4, 3, 1, 4, 8, 1, 7, 1, 17, 7, 3, 3, 2, 9, 1, 6, 16, 1, 11, 4, 16, 2, 1, 7, 5, 12, 1, 12, 2

The *window* method returns a nested dataset of datasets (i.e. each window is a dataset containing a tensor).

In [20]:
print(windows, '\n')

tensor([[27, 21,  1,  ..., 12,  1, 12],
        [21,  1,  8,  ...,  1, 12,  2],
        [ 1,  8, 13,  ..., 12,  2,  7],
        ...,
        [ 1,  9,  2,  ..., 17,  6, 25],
        [ 9,  2,  8,  ...,  6, 25,  2],
        [ 2,  8, 13,  ..., 25,  2, 21]]) 



We now have a single dataset of tensors, where each tensor is `input_timesteps+1` long and shifted by 1.

We can now separate each example into an input sequence(x) and a corresponding label/target sequence(y).<br><br>

**Teacher Forcing** where:<br>
1. At each timestep during training, the output is compared to a label.
2. At the next timestep, rather than feeding the model the previous output, we feed it the next character of the input sequence (i.e. what the model should've outputted).
<br><br>

This is why each sequence is of size *input_timesteps + 1*. Each sequence is now going to be separated into TWO sequences. The first sequence will be the training input and will be of length *input_timesteps* (i.e. everything but the LAST character). The second sequence will be the label/target and will consist of all the sequence elements shifted by 1 (i.e. everything but the FIRST character).<br><br>

So if a sequence is "she swam in the lake", then:
- The input will be "she swam in the lak" (drop the last character)
- The target/label will be "he swam in the lake" (drop the first character)

In [21]:
class TextDataset(Dataset):
    def __init__(self, windows):
        self.windows = windows

    def __len__(self):
        return len(self.windows)

    def __getitem__(self, idx):
        window = self.windows[idx]
        return window[:-1], window[1:]

In [22]:
# Create the dataset
text_dataset = TextDataset(windows)

In [23]:
# Create the DataLoader
batch_size = 32
train_loader = DataLoader(text_dataset, batch_size=batch_size, shuffle=True)

In [24]:
# Check the dataset
for inputs, targets in train_loader:
    print(inputs.shape, targets.shape)
    break

torch.Size([32, 100]) torch.Size([32, 100])


Each batch now consists of a set of input sequences and a corresponding set of label/target sequences, with the labels/targets shifted by 1.

In [25]:
for inputs, targets in train_loader:
    print(inputs)
    print(targets)
    break

tensor([[ 6, 13,  9,  ...,  8,  3,  9],
        [19,  2,  5,  ..., 14, 30, 35],
        [ 5,  3,  2,  ...,  3,  2,  1],
        ...,
        [ 1,  4,  3,  ...,  2,  8,  1],
        [ 7,  5,  4,  ..., 25,  4, 15],
        [27, 29, 21,  ..., 12,  4,  5]])
tensor([[13,  9,  8,  ...,  3,  9,  4],
        [ 2,  5,  2,  ..., 30, 35, 21],
        [ 3,  2,  9,  ...,  2,  1, 19],
        ...,
        [ 4,  3, 14,  ...,  8,  1,  6],
        [ 5,  4,  3,  ...,  4, 15,  3],
        [29, 21,  1,  ...,  4,  5, 19]])


The last step before we can build our model is to one-hot encode the **inputs**. We're doing this because:
1. We're not using embeddings for the input. We can, but since this is a character model with just a few dozen possible choices, we can get away with one-hot encoding. There's also no reason to think a particular letter should be closer to another in vector space as we would want in a word-level model.

2. Since we're not using embeddings and our input is categorical, we need to one-hot encode.

Note that despite our labels ALSO being categorical, we are NOT one-hot encoding them this time. This is because we'll be using a loss function that can help us skip that step (more below).

In [26]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [27]:
class CharLSTMModel(nn.Module):
    def __init__(self, num_tokens, hidden_dim):
        super(CharLSTMModel, self).__init__()
        self.lstm1 = nn.LSTM(num_tokens, hidden_dim, batch_first=True, dropout=0.2)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, dropout=0.2)
        self.fc = nn.Linear(hidden_dim, num_tokens)
        self.softmax = nn.LogSoftmax(dim=2)  # Use LogSoftmax for stability with NLLLoss

    def forward(self, x):
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        x = self.fc(x)
        x = self.softmax(x)
        return x

num_tokens = len(tokenizer.word_index) + 1
hidden_dim = 128

# Initialize model
model = CharLSTMModel(num_tokens, hidden_dim).to(device)



In [28]:
model

CharLSTMModel(
  (lstm1): LSTM(57, 128, batch_first=True, dropout=0.2)
  (lstm2): LSTM(128, 128, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=128, out_features=57, bias=True)
  (softmax): LogSoftmax(dim=2)
)

In [29]:
# Initialize criterion and optimizer
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters())

In [30]:
# Training loop
num_epochs = 50

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    train_loader_tqdm = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs} Training')
    for inputs, targets in train_loader_tqdm:
        inputs, targets = inputs.to(device), targets.to(device)

        # One-hot encode inputs
        inputs_one_hot = nn.functional.one_hot(inputs, num_classes=num_tokens).float()

        optimizer.zero_grad()
        outputs = model(inputs_one_hot)
        outputs = outputs.view(-1, num_tokens)  # Flatten for loss computation
        targets = targets.view(-1)  # Flatten for loss computation
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * inputs.size(0)
        train_loader_tqdm.set_postfix(loss=train_loss / len(train_loader.dataset))

    train_loss /= len(train_loader.dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss:.4f}\n')

Epoch 1/50 Training: 100%|██████████| 1905/1905 [00:14<00:00, 127.43it/s, loss=2.2]


Epoch 1/50, Training Loss: 2.2037



Epoch 2/50 Training: 100%|██████████| 1905/1905 [00:14<00:00, 135.87it/s, loss=1.63]


Epoch 2/50, Training Loss: 1.6274



Epoch 3/50 Training: 100%|██████████| 1905/1905 [00:14<00:00, 127.62it/s, loss=1.35]


Epoch 3/50, Training Loss: 1.3472



Epoch 4/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 137.35it/s, loss=1.1]


Epoch 4/50, Training Loss: 1.1042



Epoch 5/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 137.21it/s, loss=0.862]


Epoch 5/50, Training Loss: 0.8621



Epoch 6/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 137.16it/s, loss=0.645]


Epoch 6/50, Training Loss: 0.6446



Epoch 7/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 137.57it/s, loss=0.482]


Epoch 7/50, Training Loss: 0.4821



Epoch 8/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 138.11it/s, loss=0.375]


Epoch 8/50, Training Loss: 0.3751



Epoch 9/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 142.20it/s, loss=0.309]


Epoch 9/50, Training Loss: 0.3088



Epoch 10/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 142.05it/s, loss=0.269]


Epoch 10/50, Training Loss: 0.2689



Epoch 11/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 139.39it/s, loss=0.244]


Epoch 11/50, Training Loss: 0.2438



Epoch 12/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 138.47it/s, loss=0.227]


Epoch 12/50, Training Loss: 0.2268



Epoch 13/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 137.31it/s, loss=0.215]


Epoch 13/50, Training Loss: 0.2148



Epoch 14/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 138.70it/s, loss=0.205]


Epoch 14/50, Training Loss: 0.2050



Epoch 15/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 138.91it/s, loss=0.198]


Epoch 15/50, Training Loss: 0.1976



Epoch 16/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 138.87it/s, loss=0.191]


Epoch 16/50, Training Loss: 0.1914



Epoch 17/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 136.69it/s, loss=0.186]


Epoch 17/50, Training Loss: 0.1859



Epoch 18/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 141.36it/s, loss=0.181]


Epoch 18/50, Training Loss: 0.1814



Epoch 19/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 139.86it/s, loss=0.177]


Epoch 19/50, Training Loss: 0.1774



Epoch 20/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 138.90it/s, loss=0.174]


Epoch 20/50, Training Loss: 0.1737



Epoch 21/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 141.16it/s, loss=0.171]


Epoch 21/50, Training Loss: 0.1705



Epoch 22/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 137.56it/s, loss=0.167]


Epoch 22/50, Training Loss: 0.1674



Epoch 23/50 Training: 100%|██████████| 1905/1905 [00:14<00:00, 135.30it/s, loss=0.165]


Epoch 23/50, Training Loss: 0.1652



Epoch 24/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 137.26it/s, loss=0.163]


Epoch 24/50, Training Loss: 0.1626



Epoch 25/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 138.52it/s, loss=0.161]


Epoch 25/50, Training Loss: 0.1606



Epoch 26/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 136.72it/s, loss=0.158]


Epoch 26/50, Training Loss: 0.1583



Epoch 27/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 138.33it/s, loss=0.157]


Epoch 27/50, Training Loss: 0.1567



Epoch 28/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 140.41it/s, loss=0.155]


Epoch 28/50, Training Loss: 0.1547



Epoch 29/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 140.12it/s, loss=0.153]


Epoch 29/50, Training Loss: 0.1533



Epoch 30/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 141.12it/s, loss=0.152]


Epoch 30/50, Training Loss: 0.1516



Epoch 31/50 Training: 100%|██████████| 1905/1905 [00:14<00:00, 135.68it/s, loss=0.15]


Epoch 31/50, Training Loss: 0.1504



Epoch 32/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 138.34it/s, loss=0.149]


Epoch 32/50, Training Loss: 0.1490



Epoch 33/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 137.84it/s, loss=0.148]


Epoch 33/50, Training Loss: 0.1476



Epoch 34/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 136.49it/s, loss=0.146]


Epoch 34/50, Training Loss: 0.1464



Epoch 35/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 138.21it/s, loss=0.145]


Epoch 35/50, Training Loss: 0.1455



Epoch 36/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 136.27it/s, loss=0.144]


Epoch 36/50, Training Loss: 0.1445



Epoch 37/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 138.91it/s, loss=0.143]


Epoch 37/50, Training Loss: 0.1433



Epoch 38/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 140.45it/s, loss=0.142]


Epoch 38/50, Training Loss: 0.1422



Epoch 39/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 141.49it/s, loss=0.142]


Epoch 39/50, Training Loss: 0.1415



Epoch 40/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 137.80it/s, loss=0.141]


Epoch 40/50, Training Loss: 0.1405



Epoch 41/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 136.95it/s, loss=0.14]


Epoch 41/50, Training Loss: 0.1397



Epoch 42/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 136.78it/s, loss=0.139]


Epoch 42/50, Training Loss: 0.1389



Epoch 43/50 Training: 100%|██████████| 1905/1905 [00:14<00:00, 134.89it/s, loss=0.138]


Epoch 43/50, Training Loss: 0.1381



Epoch 44/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 136.38it/s, loss=0.137]


Epoch 44/50, Training Loss: 0.1374



Epoch 45/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 137.78it/s, loss=0.137]


Epoch 45/50, Training Loss: 0.1366



Epoch 46/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 137.99it/s, loss=0.136]


Epoch 46/50, Training Loss: 0.1360



Epoch 47/50 Training: 100%|██████████| 1905/1905 [00:13<00:00, 140.06it/s, loss=0.135]


Epoch 47/50, Training Loss: 0.1353



Epoch 48/50 Training: 100%|██████████| 1905/1905 [00:14<00:00, 135.46it/s, loss=0.135]


Epoch 48/50, Training Loss: 0.1347



Epoch 49/50 Training: 100%|██████████| 1905/1905 [00:14<00:00, 135.85it/s, loss=0.134]


Epoch 49/50, Training Loss: 0.1341



Epoch 50/50 Training: 100%|██████████| 1905/1905 [00:14<00:00, 135.05it/s, loss=0.134]

Epoch 50/50, Training Loss: 0.1336






In [31]:
# Save the model
torch.save(model.state_dict(), '/content/drive/MyDrive/RNN model/language_modelling_lstm.pth')

Now that we have a trained model, let's generate some text.<br><br>
The function below takes some seed text and uses that to generate a certain number of characters. For each character, it uses the generated text so far as the input. It's not the most efficient function but it'll work here.<br><br>
There's also a *temperature* parameter. The next character is picked from a probability distribution. By dividing the log of this distribution by *temperature*, we can influence the randomness of the output.<br><br>
When the temperature is low (< 1), the probability distribution sharpens and the model will be more strict in recreating the original text. As we raise the temperature, the distribution flattens and there's a higher chance the model picks something unexpected, resulting in greater surprise in the output. In practice, a high enough temperature will result in nonsense.

In [32]:
def generate_text(model, tokenizer, seed_text, num_chars=200, temperature=1.0):
    model.eval()  # Set the model to evaluation mode
    text = seed_text

    for _ in range(num_chars):
        # Take the last *input_timesteps* number of characters in the text so far
        input_text = text[-input_timesteps:]
        input_seq = tokenizer.texts_to_sequences([input_text])
        input_tensor = torch.tensor(input_seq, dtype=torch.long).to(device)

        # One-hot encode the input tensor
        input_tensor = torch.nn.functional.one_hot(input_tensor, num_classes=num_tokens).float().to(device)

        # Create probability distribution for next character adjusted by temperature
        with torch.no_grad():
            preds = model(input_tensor)  # Forward pass
            preds = preds[0, -1, :]  # Get predictions for the last timestep
            preds = preds / temperature  # Apply temperature scaling
            preds = torch.nn.functional.softmax(preds, dim=-1)  # Apply softmax to get probabilities

        # Sample next character
        next_char_idx = torch.multinomial(preds, 1).item()

        # Convert index to character using tokenizer
        index_word = tokenizer.index_word
        next_char = index_word.get(next_char_idx, '')  # Use default empty string if index is not found

        # Add next character to running text
        text += next_char

    return text

In [33]:
%%time
print(generate_text(model, tokenizer, "Banana peels on the battlefield can", num_chars=300, temperature=0.2))

Banana peels on the battlefield can melies, efteld in their flacks for over them foch revelless.

3. the art of war, then, is governed by five constant factors, to be
taken into account in one’s deliberations, when seeking to determine the
military conditions, let them be made the basis of a comparison, in
this wise:—

13. (1) which 
CPU times: user 638 ms, sys: 50.1 ms, total: 688 ms
Wall time: 1.16 s


In [34]:
print(generate_text(model, tokenizer, "It's time to release the Kraken when", num_chars=300, temperature=0.5))

It's time to release the Kraken when on desiss of the enemy.

11. if we wish to fight, the enemy can be forced to an engagement even
though he be sheltered behind a high rampart and a deep ditch. all we
need do is attack some other place that he will be obliged to relieve.

12. if we do not wish to fight, we can prevent the enemy from


In [35]:
print(generate_text(model, tokenizer, "Crush your enemies, see them driven before you, and", num_chars=300,
                    temperature=1))

Crush your enemies, see them driven before you, and spies_ man, the result things to the effect of excellence.

9. neither is it the acme of excellence if you fight and conquer and
the whole empire seight, but
let your methods be regulated by the infinite variety of circumstances.

29. military tactics more see for his iarsengly; your own men are no


In [36]:
print(generate_text(model, tokenizer, "What is best in life?", num_chars=300, temperature=2))

What is best in life?

14. battl soldrally lashs; and the dire
the _shuai-jan_, iverdle ard cities the common solity fle whom there are five principates
red! in in
an a
crofffy with the moral lyhand oppants; simulated weakness posture them, will have to crefratifyellangr
(or undis
hid. 
31. buse simulated weakness to wi
