## Setup environment

In [1]:
# Check which python3 the shell resolves and print its version.
# NOTE(review): `!python3` is looked up on the shell PATH; presumably it is the
# same interpreter the kernel runs — confirm (e.g. against sys.executable).
!which python3
!python3 -V

/home/dev-vm/anaconda3/envs/hoangph3/bin/python3
Python 3.9.19


In [2]:
# Install lib (left commented out; uncomment on an environment that lacks it).
# NOTE(review): inside a notebook `%pip install` targets the kernel's own
# environment, whereas `!python3 -m pip` uses whatever python3 the shell finds.
# !python3 -m pip install -q accelerate

In [3]:
# Validate lib versions: `pip show` prints each package's name, version,
# install location and dependency lists.
# NOTE(review): this queries the python3 found on the shell PATH — confirm it is
# the same environment the kernel imports from.
!python3 -m pip show accelerate
!python3 -m pip show torch
!python3 -m pip show transformers

Name: accelerate
Version: 0.31.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: /home/dev-vm/anaconda3/envs/hoangph3/lib/python3.9/site-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 
Name: torch
Version: 2.3.0+cu118
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /home/dev-vm/.local/lib/python3.9/site-packages
Requires: filelock, fsspec, jinja2, networkx, nvidia-cublas-cu11, nvidia-cuda-cupti-cu11, nvidia-cuda-nvrtc-cu11, nvidia-cuda-runtime-cu11, nvidia-cudnn-cu11, nvidia-cufft-cu11, nvidia-curand-cu11, nvidia-cusolver-cu11, nvidia-cusparse-cu11, nvidia-nccl-cu11, nvidia-nvtx-cu11, sympy, triton, typing-extensions
Required-by: accelerate, deepspeed, sentence-tran

In [4]:
# Check the NVIDIA driver, the visible GPU(s) and their current memory usage
!nvidia-smi

Sun Jul  7 16:04:21 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.14              Driver Version: 550.54.14      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4080        Off |   00000000:01:00.0 Off |                  N/A |
|  0%   56C    P8             16W /  340W |     179MiB /  16376MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
import os
from types import SimpleNamespace

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    get_cosine_schedule_with_warmup
)

# Turn off the Hugging Face `tokenizers` library's internal parallelism.
# The variable that library reads is TOKENIZERS_PARALLELISM (plural); the
# singular spelling is not recognized, so setting it has no effect.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Config

In [6]:
# Model / training config.
# Built directly as a SimpleNamespace so settings are read as attributes
# (cfg.batch_size, cfg.device, ...) and `cfg` never changes type.
cfg = SimpleNamespace(
    model_id='h2oai/h2o-danube-1.8b-base',
    context_length=512,
    batch_size=2,
    num_epochs=10,
    learning_rate=0.00004,
    weight_decay=0.01,
    seed=252,
    logging_steps=1,
    # is_available must be *called*: the bare function object is always truthy,
    # which would select 'cuda' even on a machine without a usable GPU.
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

# Load data

In [7]:
# Read the whole training text into memory as a single UTF-8 string
data_file_path = "./the-verdict.txt"
with open(data_file_path, encoding="utf-8") as corpus_file:
    text_data = corpus_file.read()

In [8]:
# Preview the start of the text (the slice [:99] yields the first 99 characters)
print(text_data[:99])

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


# Tokenizer

In [9]:
# Load the tokenizer published under the same model id as the model itself;
# "../cache" is passed as the Hugging Face cache directory.
tokenizer = AutoTokenizer.from_pretrained(
    cfg.model_id,
    cache_dir="../cache",
)

# Dataset & DataLoader

In [10]:
class GPTDataset(Dataset):
    """
    Next-token-prediction dataset built from one raw text.

    The text is tokenized once and cut into windows of ``max_length`` tokens
    whose start positions are ``stride`` tokens apart (windows overlap when
    ``stride < max_length``). Each sample is an ``(input, target)`` pair of
    tensors in which the target is the input window shifted one token to the
    right. Trailing tokens that cannot fill a complete window are dropped.
    """
    def __init__(self, txt, tokenizer, max_length, stride):
        # Encode the whole text in a single call
        ids = tokenizer.encode(txt)

        # Window start offsets; stopping before len(ids) - max_length guarantees
        # the shifted target window is always full length
        starts = range(0, len(ids) - max_length, stride)

        # Query windows and their one-token-shifted responses
        self.input_ids = [torch.tensor(ids[s:s + max_length]) for s in starts]
        self.target_ids = [torch.tensor(ids[s + 1:s + 1 + max_length]) for s in starts]

    def __len__(self):
        # One sample per window
        return len(self.input_ids)

    def __getitem__(self, idx):
        # (input tokens, next-token targets) for window `idx`
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader(txt, tokenizer, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    """
    Build a DataLoader of (input, target) token windows over `txt`.

    Args:
        txt: Raw text to tokenize and window.
        tokenizer: Object handed to GPTDataset to encode `txt`.
        batch_size: Number of windows per batch.
        max_length: Number of tokens in each window.
        stride: Distance in tokens between consecutive window starts.
        shuffle: Whether the loader shuffles the windows.
        drop_last: Whether a final, smaller-than-batch_size batch is dropped.
        num_workers: Number of DataLoader worker processes (0 loads in the
            calling process).

    Returns:
        A torch DataLoader over a GPTDataset built from `txt`.
    """
    # Create dataset
    dataset = GPTDataset(txt, tokenizer, max_length, stride)

    # Create dataloader; num_workers is forwarded from the caller rather than
    # being pinned to 0, so the argument actually takes effect
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [11]:
# Seed torch's global RNG before the loader is built
# (presumably so the shuffled batch order is repeatable — confirm)
torch.manual_seed(cfg.seed)

# stride == max_length, so consecutive windows do not overlap
train_dataloader = create_dataloader(
    text_data,
    tokenizer,
    batch_size=cfg.batch_size,
    max_length=cfg.context_length,
    stride=cfg.context_length,
    num_workers=1,
    shuffle=True,
    drop_last=True,
)

In [12]:
# Size of the corpus, measured in tokenizer tokens and in raw characters
total_tokens = len(tokenizer.encode(text_data))
total_characters = len(text_data)

print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 20479
Tokens: 5562


In [13]:
# Sanity check: decode every batch back to text and print it next to its shape
# (uncomment `break` to stop after the first batch)
print("Train loader:")
for query_ids, response_ids in train_dataloader:
    query_text = tokenizer.batch_decode(query_ids)
    response_text = tokenizer.batch_decode(response_ids)
    print("Query:", query_ids.shape, query_text)
    print("Response:", response_ids.shape, response_text)
    print()
    # break

Train loader:
Query: torch.Size([2, 512]) ['go under, but he was high above the current--on everlasting foundations, as you say.\n\n"Well, I went off to the house in my most egregious mood--rather moved, Lord forgive me, at the pathos of poor Stroud\'s career of failure being crowned by the glory of my painting him! Of course I meant to do the picture for nothing--I told Mrs. Stroud so when she began to stammer something about her poverty. I remember getting off a prodigious phrase about the honour being _mine_--oh, I was princely, my dear Rickham! I was posing to myself like one of my own sitters.\n\n"Then I was taken up and left alone with him. I had sent all my traps in advance, and I had only to set up the easel and get to work. He had been dead only twenty-four hours, and he died suddenly, of heart disease, so that there had been no preliminary work of destruction--his face was clear and untouched. I had met him once or twice, years before, and thought him insignificant and dingy.

In [14]:
# Total number of input token positions yielded by one pass over the loader
train_tokens = sum(input_batch.numel() for input_batch, _ in train_dataloader)

print("Training tokens:", train_tokens)

Training tokens: 5120


# Model, Optimizer, and Scheduler

In [15]:
# Re-seed before the model is created
torch.manual_seed(cfg.seed)

# Pretrained causal LM, loaded with bfloat16 weights and placed on cfg.device.
# NOTE(review): the training cell in this notebook is recorded failing with a
# CUDA out-of-memory error; presumably weights + gradients + AdamW state for
# every parameter exceed the GPU's memory — confirm before re-running.
model = AutoModelForCausalLM.from_pretrained(
    cfg.model_id,
    cache_dir="../cache",
    device_map=cfg.device,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)

# AdamW over all model parameters
optimizer = torch.optim.AdamW(
    model.parameters(),
    weight_decay=cfg.weight_decay,
    lr=cfg.learning_rate,
)

# Cosine schedule with no warmup, spanning one scheduler step per batch
# across all epochs
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_training_steps=len(train_dataloader) * cfg.num_epochs,
    num_warmup_steps=0,
)

In [16]:
# Show the module tree, then count parameters by whether they require gradients
print(model)

trainable_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
frozen_count = sum(p.numel() for p in model.parameters() if not p.requires_grad)
print('Trainable parameters:', trainable_count)
print('Non-trainable parameters:', frozen_count)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 2560)
    (layers): ModuleList(
      (0-23): 24 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (k_proj): Linear(in_features=2560, out_features=640, bias=False)
          (v_proj): Linear(in_features=2560, out_features=640, bias=False)
          (o_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=2560, out_features=6912, bias=False)
          (up_proj): Linear(in_features=2560, out_features=6912, bias=False)
          (down_proj): Linear(in_features=6912, out_features=2560, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm()
  )

# Pretraining

In [18]:
# Switch to training mode once up front; nothing inside the loop changes the
# mode, so repeating the call for every batch was redundant.
model.train()

# The number of batches per epoch is constant, so compute it once for logging
num_batches = len(train_dataloader)

for epoch in range(cfg.num_epochs):
    for batch_idx, (input_batch, target_batch) in enumerate(train_dataloader):
        input_batch = input_batch.to(cfg.device)
        target_batch = target_batch.to(cfg.device)
        logits = model(input_batch).logits

        # Merge the first two logits dimensions and flatten the targets so
        # cross_entropy receives one row of scores per target token
        loss = F.cross_entropy(
            logits.flatten(0, 1),
            target_batch.flatten()
        )
        loss.backward()
        optimizer.step()
        # One scheduler step per optimizer step
        scheduler.step()
        # Clear gradients so the next batch starts from zero
        optimizer.zero_grad()

        # Log every cfg.logging_steps batches (always including the first)
        if batch_idx % cfg.logging_steps == 0:
            print(
                f'Epoch: {epoch+1}/{cfg.num_epochs}'
                f' | Batch {batch_idx+1}/{num_batches}'
                f' | Loss: {loss.item():.4f}'
            )

OutOfMemoryError: CUDA out of memory. Tried to allocate 10.00 MiB. GPU 

In [19]:
# Write the model's state dict (not the whole model object) to my_llm.pth
torch.save(model.state_dict(), "my_llm.pth")