In [1]:
import torch
import os
import torch.nn as nn
import numpy as np
from torch.nn.utils import clip_grad_norm

In [19]:
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1
            
    def __len__(self):
        return len(self.word2idx)

In [32]:
class TextProcess(object):
    
    def __init__(self):
        self.dictionary = Dictionary()

    def get_data(self, path, batch_size=20):
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                words = line.split() + ['<eos>']
                tokens += len(words)
                for word in words: 
                    self.dictionary.add_word(word)  
        #Create a 1-D tensor that contains the index of all the words in the file
        rep_tensor = torch.LongTensor(tokens)
        index = 0
        with open(path, 'r') as f:
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    rep_tensor[index] = self.dictionary.word2idx[word]
                    index += 1
        #Find out how many batches we need            
        num_batches = rep_tensor.shape[0] // batch_size     
        print("rep_tensor.shape[0]: ", rep_tensor.shape[0]) #gives the total number of elements (tokens) in the tensor. This represents the total number of words (or tokens) in the dataset.
        print("rep_tensor ", rep_tensor)
        print("batch_size: ", batch_size)
        print("num_batches: ", num_batches)
        #Remove the remainder (Filter out the ones that don't fit)
        rep_tensor = rep_tensor[:num_batches*batch_size]
        # return (batch_size,num_batches)
        rep_tensor = rep_tensor.view(batch_size, -1)
        return rep_tensor

In [33]:
embed_size = 128 # Input features to the LSTM
hidden_size = 1024 # Number of hidden units in the LSTM
num_layers = 1 # Number of layers in the LSTM
num_epochs = 20 # Number of epochs
batch_size = 20 # Number of samples in a batch
timesteps = 30 # Check 30 previous words to predict the next word
learning_rate = 0.002

In [34]:
corpus = TextProcess()

In [35]:
rep_tensor = corpus.get_data('alice.txt', batch_size)
# rep_tensor is the tensor that contains th eindex of all the words. Each row contains 1659 words
print("rep_tensor: ", rep_tensor.shape)

rep_tensor.shape[0]:  29686
rep_tensor  tensor([   0,    1,    2,  ...,  878, 5289,    5])
batch_size:  20
num_batches:  1484
rep_tensor:  torch.Size([20, 1484])


In [37]:
""" 
corpus.dictionary is a dictionary that maps words to their unique indices (e.g., word2idx in your earlier example).
len(corpus.dictionary) gives the total number of unique words (or tokens) in the vocabulary. This is referred to as the vocabulary size.
"""
vocab_size = len(corpus.dictionary)
print("Vocab size: ", vocab_size)
# The total number of full batches that can be created from the dataset, considering the sequence length (rep_tensor.shape[1]) and the specified timesteps.
num_batches = rep_tensor.shape[1] // timesteps
print("Number of batches: ", num_batches)

Vocab size:  5290
Number of batches:  49


In [38]:
# Example Visual
""" 
rep_tensor.shape = (2, 10):
2 rows: Two sequences (Batch 1 and Batch 2).
10 columns: Ten words in each sequence.
timesteps = 5: Divide each sequence into chunks of 5 words.
num_batches = 10 // 5 = 2: Two chunks (batches) of 5 words can be created from each sequence.
Tensor Visualization (rep_tensor):

Word Index	W1	W2	W3	W4	W5	W6	W7	W8	W9	W10
Sequence 1	"The"	"quick"	"brown"	"fox"	"jumps"	"over"	"the"	"lazy"	"dog"	"again"
Sequence 2	"A"	"cat"	"sat"	"on"	"the"	"mat"	"with"	"a"	"hat"	"too"

Divide into Chunks of timesteps = 5
First Chunk (timesteps = 5):

We take the first 5 words (columns W1 to W5) from each sequence:

Word Index	W1	W2	W3	W4	W5
Sequence 1	"The"	"quick"	"brown"	"fox"	"jumps"
Sequence 2	"A"	"cat"	"sat"	"on"	"the"
Second Chunk (timesteps = 5):

We take the next 5 words (columns W6 to W10) from each sequence:

Word Index	W6	W7	W8	W9	W10
Sequence 1	"over"	"the"	"lazy"	"dog"	"again"
Sequence 2	"mat"	"with"	"a"	"hat"	"too"
"""

In [None]:
""" EMBEDDING
Tensor Visualization:

Word Index	W1	W2	W3	W4	W5	W6	W7	W8	W9	W10
Sequence 1	"The"	"quick"	"brown"	"fox"	"jumps"	"over"	"the"	"lazy"	"dog"	"again"
Sequence 2 "A"	"cat"	"sat"	"on"	"the"	"mat"	"with"	"a"	"hat"	"too"

vocab_size = 12: The vocabulary contains the words: ["<unk>", "The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog", "again", "cat", "sat", "on", "mat", "with", "a", "hat", "too"].
embed_size = 4: Each word will be represented as a 4-dimensional vector.
hidden_size = 6: The LSTM’s hidden state will have 6 dimensions.
num_layers = 1: We will use a single-layer LSTM.

Embedding Layer
Input: A tensor of word indices, e.g.:
x = [[1, 2, 3, 4, 5],  # Sequence 1: "The quick brown fox jumps"
     [6, 7, 8, 9, 10]] # Sequence 2: "over the lazy dog again"
Shape: (batch_size=2, seq_length=5).

[[[0.1, 0.2, 0.3, 0.4],  # Embedding for "The"
  [0.5, 0.6, 0.7, 0.8],  # Embedding for "quick"
  ...],
 [[0.9, 0.1, 0.2, 0.3],  # Embedding for "over"
  [0.4, 0.5, 0.6, 0.7],  # Embedding for "the"
  ...]] 
  
  LSTM

Input: Embeddings from the previous layer ((2, 5, 4)).
LSTM processes the sequence step by step and generates hidden states:
Output: A tensor of hidden states for each time step:
Shape: (2, 5, 6)  # batch_size, seq_length, hidden_size

Example (simplified hidden states):
[[[0.1, 0.3, 0.5, 0.2, 0.4, 0.6],  # Hidden state at step 1 for Sequence 1
  [0.2, 0.4, 0.6, 0.3, 0.5, 0.7],  # Hidden state at step 2 for Sequence 1
  ...],
 [[0.5, 0.7, 0.9, 0.6, 0.8, 0.1],  # Hidden state at step 1 for Sequence 2
  ...]]

Reshape for Linear Layer

The output from the LSTM is reshaped to combine the batch_size and seq_length dimensions:
Reshaped Output Shape: (2 * 5, 6) = (10, 6)
Example:
[[0.1, 0.3, 0.5, 0.2, 0.4, 0.6],  # Flattened hidden state
 [0.2, 0.4, 0.6, 0.3, 0.5, 0.7],
 ...]

 Linear Layer

Input: Flattened LSTM output ((10, 6)).
Output: Predicted scores for the next word in the vocabulary:
Shape: (10, vocab_size=12)  # One score per word in the vocabulary
Example (simplified scores for 3 words):
[[0.2, 0.1, 0.05, 0.8, 0.3, 0.4, ...],  # Scores for all 12 words
 [0.3, 0.2, 0.1, 0.7, 0.4, 0.2, ...],
 ...]
  """

In [39]:
class TextGenerator(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(TextGenerator, self).__init__()
        # If your embedding_size is 200, then each word would be represented by a dense vector of 200 values.
        self.embed = nn.Embedding(vocab_size, embed_size) # EMBEDDING LAYER is used to convert the word indices into word vectors.
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True) # LSTM LAYER is used to process the word vectors and generate the hidden states.
        self.linear = nn.Linear(hidden_size, vocab_size) # LINEAR LAYER is used to convert the hidden states into word predictions.
        
    def forward(self, x, h):
        # Perform word embedding
        x = self.embed(x)
        # Reshape the input tensor
        #x = x.view(batch_size, timesteps, embed_size)
        out, (h, c) = self.lstm(x, h)
        # Reshape the output from (samples, timesteps, output_features) to a shape appropriate for the FC layer (samples*timesteps, output_features)
        out = out.reshape(out.size(0)*out.size(1), out.size(2))
        # Decode hidden states of all time steps
        out = self.linear(out)
        return out, (h, c)

In [40]:
model = TextGenerator(vocab_size, embed_size, hidden_size, num_layers)

loss_fun = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [42]:
for epoch in range(num_epochs):
    # Set initial hidden and cell states
    states = (torch.zeros(num_layers, batch_size, hidden_size),
              torch.zeros(num_layers, batch_size, hidden_size))
    
    for i in range(0, rep_tensor.size(1) - timesteps, timesteps):
        # Get mini-batch inputs and targets
        inputs = rep_tensor[:, i:i+timesteps] # --> (:, 0:0+timestep) Outputs --> (:, 1:1+timestep) and so on
        targets = rep_tensor[:, (i+1):(i+1)+timesteps]
        #String: "Black hourse is here"
        #Input: "Black hourse" Output " lack hourse i". Our output is actually a delay of our input. So it's not including the B. So it's a delay by one element.
        outputs, _ = model(inputs, states)
        loss = loss_fun(outputs, targets.reshape(-1)) #target should be one dimension so we reshape it

        # Backpropagation and Weight Update
        model.zero_grad()
        loss.backward()
        # Perform Gradient Clipping .clip_value (float or int) – The maximum value of the elements of the input tensor.
        # The gradients are clipped in the range [-clip_value, clip_value]. This is to prevent the exploding gradient problem.
        clip_grad_norm(model.parameters(), 0.5)
        optimizer.step()

        step = (i+1) // timesteps
        if step % 100 == 0:
            print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))   
                   


  clip_grad_norm(model.parameters(), 0.5)


Epoch [1/20], Loss: 8.5761
Epoch [2/20], Loss: 6.0195
Epoch [3/20], Loss: 5.2330
Epoch [4/20], Loss: 4.7457
Epoch [5/20], Loss: 4.1741
Epoch [6/20], Loss: 3.7214
Epoch [7/20], Loss: 3.3206
Epoch [8/20], Loss: 2.9692
Epoch [9/20], Loss: 2.5691
Epoch [10/20], Loss: 2.2565
Epoch [11/20], Loss: 1.9041
Epoch [12/20], Loss: 1.5915
Epoch [13/20], Loss: 1.3059
Epoch [14/20], Loss: 1.0712
Epoch [15/20], Loss: 0.8878
Epoch [16/20], Loss: 0.6262
Epoch [17/20], Loss: 0.3834
Epoch [18/20], Loss: 0.2881
Epoch [19/20], Loss: 0.1945
Epoch [20/20], Loss: 0.1182


In [43]:
#Test Model
with torch.no_grad():
    with open('output.txt', 'w') as f:
        # Set initial hidden ane cell states
        state = (torch.zeros(num_layers, 1, hidden_size),
                 torch.zeros(num_layers, 1, hidden_size)) #batch_size = 1 because we testing not training 
        
        # Select one word id randomly and convert it to shape (1, 1)
        input = torch.randint(0, vocab_size, (1,)).long().unsqueeze(1)
        
        for i in range(500):
            # Forward propagate RNN
            output, _ = model(input, state)
            print(output.shape)
            # Sample a word id from the exponential of the output. This is the probability distribution of the next word.
            prob = output.exp()
            word_id = torch.multinomial(prob, num_samples=1).item()
            print(word_id)
            # Fill input with sampled word id for the next time step
            input.fill_(word_id)
            
            # File write
            word = corpus.dictionary.idx2word[word_id]
            word = '\n' if word == '<eos>' else word + ' '
            f.write(word)
            
            if (i+1) % 100 == 0:
                print('Sampled [{}/{}] words and save to {}'.format(i+1, 500, 'output.txt'))

torch.Size([1, 5290])
4099
torch.Size([1, 5290])
20
torch.Size([1, 5290])
27
torch.Size([1, 5290])
28
torch.Size([1, 5290])
262
torch.Size([1, 5290])
95
torch.Size([1, 5290])
13
torch.Size([1, 5290])
44
torch.Size([1, 5290])
4441
torch.Size([1, 5290])
320
torch.Size([1, 5290])
74
torch.Size([1, 5290])
110
torch.Size([1, 5290])
3
torch.Size([1, 5290])
3968
torch.Size([1, 5290])
38
torch.Size([1, 5290])
16
torch.Size([1, 5290])
390
torch.Size([1, 5290])
13
torch.Size([1, 5290])
44
torch.Size([1, 5290])
5
torch.Size([1, 5290])
6
torch.Size([1, 5290])
262
torch.Size([1, 5290])
95
torch.Size([1, 5290])
5
torch.Size([1, 5290])
2204
torch.Size([1, 5290])
3
torch.Size([1, 5290])
858
torch.Size([1, 5290])
20
torch.Size([1, 5290])
6
torch.Size([1, 5290])
333
torch.Size([1, 5290])
5
torch.Size([1, 5290])
5
torch.Size([1, 5290])
1284
torch.Size([1, 5290])
1091
torch.Size([1, 5290])
2628
torch.Size([1, 5290])
112
torch.Size([1, 5290])
22
torch.Size([1, 5290])
28
torch.Size([1, 5290])
262
torch.Size