<a href="https://colab.research.google.com/github/calmrocks/master-machine-learning-engineer/blob/main/GenAI/BasicLLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training a Small Language Model in Google Colab

## Import Libraries
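First, import the required libraries. `torchtext` and `datasets` may need to be installed beforehand, and `torchtext` only imports cleanly alongside a compatible `torchdata` release: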

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from torch.utils.data import DataLoader
from torchtext.data.functional import to_map_style_dataset
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import IMDB
from torchtext.vocab import build_vocab_from_iterator

ModuleNotFoundError: No module named 'torchdata.datapipes'

## Set Device (CPU/GPU)

In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


## Define Hyperparameters

In [6]:
# --- Model architecture ---
# Width of the token embeddings (and of every transformer layer).
embed_dim = 128
# Attention heads per transformer layer.
num_heads = 4
# Number of stacked transformer layers.
num_layers = 2
# Hidden width of the feed-forward network inside each layer.
ffn_hidden_dim = 512

# --- Data and training ---
# Maximum sequence length, in tokens.
seq_length = 20
# Samples per training batch.
batch_size = 32
# Full passes over the training set.
num_epochs = 10

## Load and Prepare Dataset

In [8]:
# Tokenizer
tokenizer = get_tokenizer("basic_english")


def yield_tokens(data_iter):
    """Lazily produce the token list for the text of every sample.

    `data_iter` must yield two-item samples; the first item is ignored and
    the second one (the text) is passed through `tokenizer`.
    """
    yield from (tokenizer(review) for _label, review in data_iter)

# Load IMDB dataset
# The raw splits are iterable-style datasets. Converting them to map-style
# datasets lets them be iterated more than once (vocabulary building below,
# then every training epoch), shuffled by the DataLoader, and measured with
# len().
train_iter = to_map_style_dataset(IMDB(split='train'))
test_iter = to_map_style_dataset(IMDB(split='test'))

# Build vocabulary
# "<pad>" is registered as its own special token. Without it, vocab["<pad>"]
# falls back to the default index, so padding would be indistinguishable
# from "<unk>".
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>", "<pad>"])
# Out-of-vocabulary tokens map to "<unk>".
vocab.set_default_index(vocab["<unk>"])

# Text processing function
def collate_batch(batch):
    """Turn a list of (label, text) samples into fixed-size tensors.

    Each text is tokenized, mapped to vocabulary ids, truncated to
    `seq_length` tokens and right-padded with the id of "<pad>". Every batch
    therefore has exactly `seq_length` columns, which is what a positional
    encoding of shape (1, seq_length, embed_dim) needs in order to be added
    to the embedded batch.

    Args:
        batch: list of (label, text) pairs.

    Returns:
        labels: float32 tensor of shape (batch,).
        texts: int64 tensor of shape (batch, seq_length), batch-first.
    """
    pad_idx = vocab["<pad>"]
    labels = []
    # Start from an all-padding matrix and overwrite the leading positions of
    # each row with that sample's real token ids.
    texts = torch.full((len(batch), seq_length), pad_idx, dtype=torch.long)
    for row, (label, txt) in enumerate(batch):
        labels.append(label)
        # Truncate before the vocabulary lookup so long reviews stay cheap.
        token_ids = [vocab[token] for token in tokenizer(txt)[:seq_length]]
        texts[row, :len(token_ids)] = torch.tensor(token_ids, dtype=torch.long)
    labels = torch.tensor(labels, dtype=torch.float32)
    return labels, texts

# DataLoader
# Both loaders share the same batching, shuffling and collation settings.
loader_options = dict(batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
train_dataloader = DataLoader(train_iter, **loader_options)
test_dataloader = DataLoader(test_iter, **loader_options)

# The vocabulary size fixes the embedding table and output layer dimensions.
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

NameError: name 'get_tokenizer' is not defined

## Define the Transformer Model

In [None]:
class SmallTransformer(nn.Module):
    """Small causal Transformer language model.

    Token ids are embedded, summed with a learned positional encoding, passed
    through a stack of self-attention layers and projected to one logit per
    vocabulary entry at every position.

    Args:
        vocab_size: number of vocabulary entries (embedding rows and output logits).
        embed_dim: width of the token embeddings and of every encoder layer.
        num_heads: attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
        ffn_hidden_dim: hidden width of each layer's feed-forward network.
        seq_length: longest sequence the positional encoding can cover.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, ffn_hidden_dim, seq_length):
        super(SmallTransformer, self).__init__()
        self.seq_length = seq_length
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Learned positional encoding: one vector per position, broadcast over the batch axis.
        self.positional_encoding = nn.Parameter(torch.zeros(1, seq_length, embed_dim))
        # batch_first=True makes the layers read inputs as (batch, seq, embed),
        # matching the layout of the positional encoding. With the default
        # (sequence-first) layout, attention would run across the batch axis.
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=ffn_hidden_dim,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)
        self.fc_out = nn.Linear(embed_dim, vocab_size)

    def forward(self, src):
        """Compute per-position vocabulary logits.

        Args:
            src: int64 tensor of token ids with shape (batch, seq), where
                seq <= seq_length.

        Returns:
            Float tensor of shape (batch, seq, vocab_size).

        Raises:
            ValueError: if `src` is longer than the positional encoding.
        """
        num_tokens = src.size(1)
        if num_tokens > self.seq_length:
            raise ValueError(
                f"Input has {num_tokens} tokens but the model supports at most {self.seq_length}"
            )
        # Slice the positional encoding so inputs shorter than seq_length work too.
        embedded = self.embedding(src) + self.positional_encoding[:, :num_tokens]
        # Causal mask: -inf above the diagonal stops position t from attending
        # to positions after t, so the logits at t depend only on tokens <= t.
        causal_mask = torch.triu(
            torch.full((num_tokens, num_tokens), float("-inf"), device=src.device),
            diagonal=1,
        )
        encoded = self.transformer_encoder(embedded, mask=causal_mask)
        output = self.fc_out(encoded)
        return output

# Build the network from the hyperparameters and move it to the selected device.
model = SmallTransformer(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_heads=num_heads,
    num_layers=num_layers,
    ffn_hidden_dim=ffn_hidden_dim,
    seq_length=seq_length,
)
model = model.to(device)
print(model)

## Define Loss Function and Optimizer

In [None]:
# Token-level classification loss over the vocabulary logits.
criterion = nn.CrossEntropyLoss()
# Adam updates every trainable parameter of the model.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)



Starting training...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdenny-wang-au[0m ([33mdenny-wang-au-personal[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


TypeError: 'int' object is not subscriptable

## Training Loop

In [None]:
# Train with a next-token objective: the logits produced at position t are
# scored against the token at position t + 1. Scoring them against the token
# at position t itself would only teach the model to copy its input.
# NOTE(review): this is only a genuine language-modelling objective if
# `model` keeps position t from attending to later positions - confirm.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    # Count batches explicitly; len(dataloader) is not defined for every dataset type.
    num_batches = 0

    # The sentiment labels are not used for language modelling.
    for _, texts in train_dataloader:
        texts = texts.to(device)  # assumed to be (batch, seq) token ids

        optimizer.zero_grad()

        logits = model(texts)
        # Drop the last position's logits (there is no following token to
        # score them against) and the first token (nothing predicts it).
        predictions = logits[:, :-1, :].reshape(-1, vocab_size)
        targets = texts[:, 1:].reshape(-1)

        loss = criterion(predictions, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    avg_loss = total_loss / num_batches
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

## Evaluation (Optional)

In [None]:
# Evaluate with the next-token objective: the logits at position t are scored
# against the token at position t + 1, averaged over all test batches.
model.eval()
total_eval_loss = 0.0
# Count batches explicitly; len(dataloader) is not defined for every dataset type.
num_eval_batches = 0

with torch.no_grad():
    # The sentiment labels are not used for language modelling.
    for _, texts in test_dataloader:
        texts = texts.to(device)  # assumed to be (batch, seq) token ids

        logits = model(texts)
        # Drop the last position's logits (there is no following token to
        # score them against) and the first token (nothing predicts it).
        predictions = logits[:, :-1, :].reshape(-1, vocab_size)
        targets = texts[:, 1:].reshape(-1)

        loss = criterion(predictions, targets)
        total_eval_loss += loss.item()
        num_eval_batches += 1

avg_eval_loss = total_eval_loss / num_eval_batches
print(f'Evaluation Loss: {avg_eval_loss:.4f}')

## Important Notes and Best Practices:

1. **Memory Management**:
   - Monitor GPU memory usage in Colab
   - Use smaller batch sizes if running out of memory
   - Consider gradient checkpointing for larger models

2. **Training Time**:
   - Even small models can take significant time to train
   - Start with small datasets for testing
   - Increase dataset size gradually

3. **Model Size**:
   - For comparison, GPT-2 small already has 124M parameters
   - Larger models need more GPU memory and training time
   - Consider using quantization for larger models

4. **Hyperparameter Tuning**:
   - Learning rate is crucial for stable training
   - Adjust batch size based on available memory
   - Monitor loss to detect training issues