## Task 0. Import Necessary Libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext, datasets, math
from tqdm import tqdm

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## Task 1. Dataset Acquisition

TinyStories Dataset Overview

- Description: A Dataset containing synthetically generated (by GPT-3.5 and GPT-4) short stories that only use a small vocabulary.

- Described in the following paper: https://arxiv.org/abs/2305.07759.

- Size: ~50k stories.

- Number of rows: 2,141,709

In [4]:
# Load the TinyStories dataset
dataset = datasets.load_dataset('roneneldan/TinyStories', trust_remote_code=True)

In [5]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})


In [6]:
# Split the validation set into validation and test sets
validation_test_split = dataset['validation'].train_test_split(test_size=0.5, seed=123)

# Create a new DatasetDict with train, validation, and test splits
dataset = datasets.DatasetDict({
    'train': dataset['train'],
    'validation': validation_test_split['train'],
    'test': validation_test_split['test']
})

# Print the new dataset structure
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 10995
    })
    test: Dataset({
        features: ['text'],
        num_rows: 10995
    })
})


In [7]:
print(dataset['train'].shape)

(2119719, 1)


## Preprocessing

Detail the steps taken to preprocess the text data. (1 points)

1. Load the Dataset<br><br>

2. Define the Tokenizer
- Use `torchtext.data.utils.get_tokenizer` to create a tokenizer.<br><br>

3. Tokenize the Text
- Define a function to tokenize each example in the dataset.
- Use the `map` method to apply the tokenization function to the entire dataset.<br><br>

4. Remove Unnecessary Columns
- Remove the original `'text'` column after tokenization.<br><br>

5. Build Vocabulary and Numericalize Data
- Build a vocabulary using `torchtext.vocab.build_vocab_from_iterator`.
  - Set a minimum frequency threshold (e.g., `min_freq=3`) to filter out rare tokens.
- Add special tokens (e.g., `'<unk>'` and `'<eos>'`) if they don't already exist in the vocabulary. <br><br>

6. Inspect the Results
- Verify that the tokenization was applied correctly by inspecting the tokenized dataset.

### Tokenizing

Tokenize the sentences to text tokens.

In [8]:
# create a tokenizer
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

# define a lambda function to tokenize each example in the dataset:
tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example['text'])}

# Apply Tokenization to the Datase
tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'], fn_kwargs={'tokenizer': tokenizer})

In [9]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 10995
    })
    test: Dataset({
        features: ['text'],
        num_rows: 10995
    })
})


In [10]:
print(tokenized_dataset['train'][223]['tokens'])

['once', 'upon', 'a', 'time', ',', 'there', 'was', 'a', 'little', 'boy', 'named', 'tom', '.', 'tom', 'loved', 'to', 'look', 'up', 'at', 'the', 'sky', 'every', 'night', '.', 'he', 'would', 'often', 'see', 'a', 'comet', 'speeding', 'by', 'and', 'it', 'would', 'always', 'make', 'him', 'happy', '.', 'one', 'night', ',', 'tom', 'was', 'a', 'bit', 'worried', 'because', 'he', 'couldn', "'", 't', 'see', 'the', 'comet', '.', 'he', 'asked', 'his', 'mom', ',', 'mom', ',', 'where', 'is', 'the', 'comet', '?', 'why', 'can', "'", 't', 'i', 'see', 'it', '?', 'his', 'mom', 'smiled', 'warmly', 'and', 'said', ',', 'don', "'", 't', 'worry', ',', 'tom', '.', 'it', 'will', 'rise', 'again', 'soon', '.', 'tom', 'was', 'confused', '.', 'he', 'asked', ',', 'what', 'does', 'that', 'mean', '?', 'his', 'mom', 'explained', ',', 'it', 'means', 'that', 'it', 'will', 'appear', 'again', '.', 'everyone', 'and', 'everything', 'has', 'their', 'ups', 'and', 'downs', ',', 'just', 'like', 'the', 'comet', '.', 'just', 'be', '

### Numericalizing

We will tell torchtext to add any word that has occurred at least three times in the dataset to the vocabulary because otherwise it would be too big.  Also we shall make sure to add `unk` and `eos`.

In [11]:
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_dataset['train']['tokens'], min_freq=3)
vocab.insert_token('<unk>', 0)
vocab.insert_token('<eos>', 1)
vocab.set_default_index(vocab['<unk>'])

In [12]:
print(len(vocab))

34614


In [13]:
print(vocab.get_itos()[:10])

['<unk>', '<eos>', '.', 'the', 'and', ',', 'to', 'a', 'was', 'he']


Prepare the batch loader

### Prepare data

Given "Chaky loves eating at AIT", and "I really love deep learning", and given batch size = 3, we will get three batches of data "Chaky loves eating at", "AIT `<eos>` I really", "love deep learning `<eos>`".  

In [14]:
def get_data(dataset, vocab, batch_size):
    data = []
    for example in dataset:
        if example['tokens']:
            tokens = example['tokens'].append('<eos>')
            tokens = [vocab[token] for token in example['tokens']]
            data.extend(tokens)
    data = torch.LongTensor(data)
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size]
    data = data.view(batch_size, num_batches) #view vs. reshape (whether data is contiguous)
    return data #[batch size, seq len]

In [15]:
batch_size = 128
train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
test_data  = get_data(tokenized_dataset['test'],  vocab, batch_size)

In [16]:
train_data.shape

torch.Size([128, 3446557])

## Task 2. Model Training

Describe the model architecture and the training process. (1 points)

**Model Architecture**

The model is an **LSTM-based Language Model** implemented in PyTorch. It consists of the following key components:

1. **Embedding Layer**:
   - Converts token indices into dense vectors of size `emb_dim`.
   - Input: `[batch_size, seq_len]`
   - Output: `[batch_size, seq_len, emb_dim]`

2. **LSTM Layer**:
   - Processes the sequence of embeddings to capture contextual information.
   - Input: `[batch_size, seq_len, emb_dim]`
   - Output:
     - `output`: `[batch_size, seq_len, hid_dim]` (hidden states for each time step)
     - `hidden`: `[num_layers, batch_size, hid_dim]` (final hidden state and cell state)

3. **Dropout Layer**:
   - Applied after the embedding and LSTM layers to prevent overfitting.

4. **Fully Connected Layer**:
   - Maps the LSTM output to the vocabulary size (`vocab_size`).
   - Input: `[batch_size, seq_len, hid_dim]`
   - Output: `[batch_size, seq_len, vocab_size]`

5. **Initialization**:
   - Weights are initialized uniformly to small values for stable training.


**Training Process**

### Data Preparation
1. **Tokenization and Numericalization**:
   - Text data is tokenized and converted into integer indices using a vocabulary.
   - Special tokens like `<eos>` (end of sequence) are added.

2. **Batching**:
   - Numericalized data is split into batches of size `batch_size`.
   - Each batch is divided into sequences of length `seq_len`.

### Training Loop
1. **Initialization**:
   - The LSTM hidden state is initialized to zeros at the start of each epoch.

2. **Forward Pass**:
   - The model predicts the next token for each position in the sequence.
   - The output is reshaped for the loss function.

3. **Loss Calculation**:
   - **CrossEntropyLoss** computes the difference between predicted and actual tokens.
   - Loss is averaged over the sequence length.

4. **Backpropagation**:
   - Gradients are computed and clipped to prevent exploding gradients.
   - The optimizer updates the model parameters.

5. **Evaluation**:
   - The model is evaluated on the validation set after each epoch.
   - The learning rate is adjusted using a scheduler if the validation loss plateaus.

6. **Checkpointing**:
   - The model with the best validation loss is saved.


## Hyperparameters
- **`vocab_size`**: Size of the vocabulary.(9879)
- **`emb_dim`**: Dimension of the embedding layer (1024).
- **`hid_dim`**: Dimension of the LSTM hidden state (1024).
- **`num_layers`**: Number of LSTM layers (2).
- **`dropout_rate`**: Dropout probability (0.65).
- **`batch_size`**: Number of sequences per batch (128).
- **`seq_len`**: Length of each sequence (50).
- **`clip`**: Gradient clipping threshold (0.25).
- **`lr`**: Learning rate (1e-3).
- **`n_epochs`**: Number of training epochs (50).

---

## Metrics
- **Perplexity**:
  - Used to evaluate the model's performance.
  - Defined as `exp(loss)`.
  - Lower perplexity indicates better performance.

---

## Training Output
During training, the following metrics are printed for each epoch:
- **Train Perplexity**: Perplexity on the training set.
- **Valid Perplexity**: Perplexity on the validation set.


In [17]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim    = hid_dim
        self.emb_dim    = emb_dim
        
        self.embedding  = nn.Embedding(vocab_size, emb_dim)
        self.lstm       = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, dropout=dropout_rate, batch_first=True)
        self.dropout    = nn.Dropout(dropout_rate)
        self.fc         = nn.Linear(hid_dim, vocab_size)
        
        self.init_weights()
    
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_other)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                self.hid_dim).uniform_(-init_range_other, init_range_other) #We
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim,   
                self.hid_dim).uniform_(-init_range_other, init_range_other) #Wh
    
    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell
        
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach() #not to be used for gradient computation
        cell   = cell.detach()
        return hidden, cell
        
    def forward(self, src, hidden):
        #src: [batch_size, seq len]
        embedding = self.dropout(self.embedding(src)) #harry potter is
        #embedding: [batch-size, seq len, emb dim]
        output, hidden = self.lstm(embedding, hidden)
        #ouput: [batch size, seq len, hid dim]
        #hidden: [num_layers * direction, seq len, hid_dim]
        output = self.dropout(output)
        prediction =self.fc(output)
        #prediction: [batch_size, seq_len, vocab_size]
        return prediction, hidden

## Training 

Follows very basic procedure.  One note is that some of the sequences that will be fed to the model may involve parts from different sequences in the original dataset or be a subset of one (depending on the decoding length). For this reason we will reset the hidden state every epoch, this is like assuming that the next batch of sequences is probably always a follow up on the previous in the original dataset.

In [18]:
vocab_size = len(vocab)
emb_dim = 1024                # 400 in the paper
hid_dim = 1024                # 1150 in the paper
num_layers = 2                # 3 in the paper
dropout_rate = 0.65              
lr = 1e-3                     

In [19]:
model      = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer  = optim.Adam(model.parameters(), lr=lr)
criterion  = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 87,717,686 trainable parameters


In [20]:
def get_batch(data, seq_len, idx):
    #data #[batch size, bunch of tokens]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [21]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    # data #[batch size, seq len]
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]  #we need to -1 because we start at 0
    num_batches = data.shape[-1]
    
    #reset the hidden every epoch
    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        
        #hidden does not need to be in the computational graph for efficiency
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        #need to reshape because criterion expects pred to be 2d and target to be 1d
        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [22]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

Here we will be using a `ReduceLROnPlateau` learning scheduler which decreases the learning rate by a factor, if the loss don't improve by a certain epoch.

In [24]:
n_epochs = 1  # Train for only 1 epoch
seq_len  = 50 #<----decoding length
clip    = 0.25

# Learning rate scheduler
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

# Track the best validation loss
best_valid_loss = float('inf')

# Training loop
for epoch in range(n_epochs):
    
    # Train the model for 1 epoch
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    
    # Evaluate the model on the validation set
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    # Adjust the learning rate based on validation loss
    lr_scheduler.step(valid_loss)

    # Save the model if it achieves the best validation loss
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-val-lstm_lm_v2.pt')

    # Print training and validation perplexity
    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                                                                   

	Train Perplexity: 12.258
	Valid Perplexity: 7.973


Training took a whooping 4 Hours and 14 mins for 1 epoch.

## Testing

In [27]:
model.load_state_dict(torch.load('best-val-lstm_lm_v2.pt',  map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 8.008


## Real-world inference

Here we take the prompt, tokenize, encode and feed it into the model to get the predictions.  We then apply softmax while specifying that we want the output due to the last word in the sequence which represents the prediction for the next word.  We divide the logits by a temperature value to alter the model’s confidence by adjusting the softmax probability distribution.

Once we have the Softmax distribution, we randomly sample it to make our prediction on the next word. If we get <unk> then we give that another try.  Once we get <eos> we stop predicting.
    
We decode the prediction back to strings last lines.

In [28]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [29]:
prompt = 'Harry Potter is '
max_seq_len = 30
seed = 0

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
harry potter is a good animal . â€ he thanked him and hopped away . the end .

0.7
harry potter is a great animal . and he has lots of new friends .

0.75
harry potter is a great animal . and he has lots of new friends .

0.8
harry potter is a great animal . yeah , we are wealthy ! said his brother , giving the rabbit a big hug . bobby was so proud of his new discovery .

1.0
harry potter is on the other side of the field . would guide us to the top of the hill ? peace are the fastest . every year , letâ€™s go and explore

