<a href="https://colab.research.google.com/github/calmrocks/master-machine-learning-engineer/blob/main/GenAI/BasicLLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training a Small Language Model in Google Colab

## Import Libraries
First, we need to install the required libraries:

In [3]:
!pip install datasets

Collecting datasets
  Using cached datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import DataLoader
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Set Device (CPU/GPU)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


## Define Hyperparameters

In [6]:
embed_dim = 128    # Embedding dimension
num_heads = 4      # Number of attention heads
num_layers = 2     # Number of transformer layers
ffn_hidden_dim = 512  # Feed-forward network hidden dimension
seq_length = 20    # Maximum sequence length
batch_size = 32    # Batch size for training
num_epochs = 10    # Number of training epochs

## Load and Prepare Dataset

In [7]:
# Tokenizer
tokenizer = nltk.word_tokenize

# Function to yield data, tokenized
def yield_tokens(data_iter):
    for item in data_iter:
        yield tokenizer(item['text'])

# Load AG_NEWS dataset
dataset = load_dataset('ag_news')

# Build vocabulary
texts = [" ".join(tokenizer(item['text'])) for item in dataset['train']]
vectorizer = CountVectorizer()
vectorizer.fit(texts)
vocab = vectorizer.vocabulary_
vocab_size = len(vocab) + 1  # +1 for <unk> token
print(f"Vocabulary size: {vocab_size}")

# Text processing function
def text_to_indices(text):
    tokens = tokenizer(text)
    indices = [vocab.get(token, vocab_size - 1) for token in tokens]  # vocab_size - 1 is the index for <unk>
    return indices

def collate_batch(batch):
    labels, texts = [], []
    for item in batch:
        labels.append(item['label'])
        processed_text = text_to_indices(item['text'])
        texts.append(processed_text)
    labels = torch.tensor(labels, dtype=torch.long)
    texts = nn.utils.rnn.pad_sequence([torch.tensor(text, dtype=torch.long) for text in texts], padding_value=vocab_size - 1)
    return labels, texts

# DataLoader
train_dataloader = DataLoader(dataset['train'], batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(dataset['test'], batch_size=batch_size, shuffle=True, collate_fn=collate_batch)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Vocabulary size: 65004


## Define the Transformer Model

In [8]:
class SmallTransformer(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, ffn_hidden_dim, seq_length):
        super(SmallTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # positional_encoding is no longer a Parameter, but is created dynamically in forward
        # self.positional_encoding = nn.Parameter(torch.zeros(1, seq_length, embed_dim))
        encoder_layers = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=ffn_hidden_dim)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)
        self.fc_out = nn.Linear(embed_dim, vocab_size)

    def forward(self, src):
        # Create positional encoding dynamically based on the input sequence length
        # src.size(1) gives the actual sequence length of the input
        positional_encoding = torch.zeros(1, src.size(1), self.embedding.embedding_dim, device=src.device)
        # use register_buffer to make it a non-trainable parameter, moved to the correct device

        embedded = self.embedding(src) + positional_encoding
        encoded = self.transformer_encoder(embedded)
        output = self.fc_out(encoded)
        return output

model = SmallTransformer(vocab_size, embed_dim, num_heads, num_layers, ffn_hidden_dim, seq_length).to(device)
print(model)



SmallTransformer(
  (embedding): Embedding(65004, 128)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc_out): Linear(in_features=128, out_features=65004, bias=True)
)


## Define Loss Function and Optimizer

In [9]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

## Training Loop

In [None]:
from IPython.display import clear_output, display

train_losses = []

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    batch_count = 0

    for labels, texts in train_dataloader:
        labels = labels.to(device)
        texts = texts.to(device)

        optimizer.zero_grad()

        output = model(texts)
        output = output.view(-1, vocab_size)
        texts = texts.view(-1)

        loss = criterion(output, texts)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        batch_count += 1

        # Print progress within the epoch
        progress_msg = f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_count}/{len(train_dataloader)}], Loss: {loss.item():.4f}'
        clear_output(wait=True)
        display(progress_msg)

    avg_loss = total_loss / len(train_dataloader)
    print(f'\nEpoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')
    train_losses.append(avg_loss)

'Epoch [6/10], Batch [3061/3750], Loss: 0.0000'

## Evaluation (Optional)

In [1]:
model.eval()
total_eval_loss = 0

with torch.no_grad():
    for labels, texts in test_dataloader:
        labels = labels.to(device)
        texts = texts.to(device)

        output = model(texts)
        output = output.view(-1, vocab_size)
        texts = texts.view(-1)

        loss = criterion(output, texts)
        total_eval_loss += loss.item()

avg_eval_loss = total_eval_loss / len(test_dataloader)
print(f'Evaluation Loss: {avg_eval_loss:.4f}')

NameError: name 'model' is not defined

## Plotting Training Loss

In [None]:
plt.plot(train_losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss Over Epochs')
plt.show()