In [1]:
import os
import subprocess
import sys
from pathlib import Path
import site

import tiktoken
import torch
from google.colab import drive

import os
import subprocess
import sys
from pathlib import Path

import tiktoken
import torch
from google.colab import drive

def _run_uv_command_quietly(command: list[str], cwd: Path) -> None:
    """
    runs `uv <command...>` inside `cwd` with its output captured instead of streamed.

    the child process inherits the current environment, except that
    UV_CONSTRAINT and UV_BUILD_CONSTRAINT are blanked and UV_PRERELEASE is set
    to "if-necessary-or-explicit".

    args:
        command: the uv sub-command and its arguments, e.g. ["sync"].
        cwd: the directory the command is executed in.

    raises:
        subprocess.CalledProcessError: when uv exits with a non-zero status;
            the captured stdout/stderr are printed before the error propagates.
    """
    argv = ["uv", *command]

    child_env = dict(os.environ)
    child_env["UV_CONSTRAINT"] = ""
    child_env["UV_BUILD_CONSTRAINT"] = ""
    child_env["UV_PRERELEASE"] = "if-necessary-or-explicit"

    try:
        completed = subprocess.run(
            argv,
            cwd=cwd,
            check=True,
            capture_output=True,
            text=True,
            env=child_env,
        )
    except subprocess.CalledProcessError as e:
        print(f"error: uv command '{' '.join(argv)}' failed.")
        print(f"stdout: {e.stdout.strip()}")
        print(f"stderr: {e.stderr.strip()}")
        raise # re-raise the exception after printing details
    else:
        # anything uv wrote to stderr on a successful run is surfaced as a warning line
        if completed.stderr:
            print(f"warning (uv {' '.join(command)}): {completed.stderr.strip()}")

def setup_colab_environment(project_root: str, package_name: str) -> None:
    """
    sets up the python environment in google colab for a project.
    this includes installing dependencies with uv, checking that the project
    package can be imported, and mounting google drive when needed.
    minimal output is produced. outside of colab this function does nothing.

    args:
        project_root: the absolute path to your project's root directory.
                      e.g., '/content/llm_e2e/'
        package_name: the top-level name of your python package that should be importable.
                      e.g., 'llm_e2e' if you do 'import llm_e2e'

    raises:
        FileNotFoundError: if `project_root` does not contain a 'pyproject.toml'.
        subprocess.CalledProcessError: if `uv sync` fails.
    """
    # imported locally so the block does not depend on a new top-of-notebook import
    import importlib

    if 'google.colab' not in sys.modules:
        return

    print("setting up colab environment...")
    proj_path = Path(project_root)

    if not (proj_path / "pyproject.toml").is_file():
        raise FileNotFoundError(f"'pyproject.toml' not found in {proj_path}. cannot proceed.")

    # make the project root importable for this kernel session
    if str(proj_path) not in sys.path:
        sys.path.insert(0, str(proj_path))

    print("installing dependencies with uv...")
    _run_uv_command_quietly(["sync"], cwd=proj_path)

    # verify the promise made in the docstring: the package must be importable.
    # the import system caches directory listings, so refresh them first because
    # uv may just have written new files.
    importlib.invalidate_caches()
    try:
        importlib.import_module(package_name)
    except ImportError as exc:
        # stay non-fatal: earlier versions never failed here, so only report it
        print(f"warning: could not import '{package_name}' after setup: {exc}")
    else:
        print(f"'{package_name}' package imported successfully.")

    # the project folder on drive doubles as the "is drive mounted?" probe
    if not os.path.exists('/content/drive/MyDrive/llm_e2e/'):
        print("mounting google drive...")
        drive.mount('/content/drive')
    else:
        print("google drive already mounted.")


def setup_cuda(cfg):
    """
    configures float32 matmul precision when a CUDA device is present.

    does nothing when CUDA is unavailable. otherwise it asserts that
    `cfg.device` is 'cuda', prints the CUDA version, and for devices whose
    compute-capability major version is at least 7 sets the float32 matmul
    precision to "high".

    args:
        cfg: configuration object exposing a `device` attribute.
    """
    if torch.cuda.is_available():
        assert cfg.device == 'cuda', "cfg.device must be 'cuda' if CUDA is available."
        print(f"cuda version: {torch.version.cuda}")

        major, _minor = torch.cuda.get_device_capability()
        if major < 7:
            print("tensor cores not supported on this gpu.")
            return

        torch.set_float32_matmul_precision("high")
        print("uses tensor cores")


# location of the project checkout; a later cell also builds the config file path from it
project_root = '/content/llm_e2e/'
package_name = 'llm_e2e' # adjust this to your actual top-level package name
# returns immediately when not running inside google colab
setup_colab_environment(project_root, package_name)


setting up colab environment...
installing dependencies with uv...
Audited 178 packages in 0.31ms
Resolved 174 packages in 243ms
   Building llm-e2e @ file:///content/llm_e2e
      Built llm-e2e @ file:///content/llm_e2e
Prepared 1 package in 849ms
Uninstalled 1 package in 0.58ms
Installed 1 package in 1ms
 ~ llm-e2e==0.1.0 (from file:///content/llm_e2e)
'llm_e2e' package imported successfully.
google drive already mounted.


In [2]:
from llm_e2e import GPT2Config, GPT2Model, StreamingDatasetGenerator
# plain string: there is nothing to interpolate in the config file name
config_yaml = "gpt2_bert_corpus_gpu.yaml"
# os.path.join copes with the trailing slash in project_root, so the path no
# longer contains a doubled '//' separator
cfg = GPT2Config.from_yaml(os.path.join(project_root, "config", config_yaml))
encoding = tiktoken.get_encoding(cfg.encoding_name)
setup_cuda(cfg)

# NOTE(review): both generators are constructed with identical arguments, so
# unless StreamingDatasetGenerator splits the data internally the "validation"
# stream is the same data as the training stream -- confirm and pass the
# appropriate split option if the class offers one.
train_dataset = StreamingDatasetGenerator(cfg, encoding=encoding)
val_dataset = StreamingDatasetGenerator(cfg, encoding=encoding)

loaded config from: /content/llm_e2e//config/gpt2_bert_corpus_gpu.yaml


README.md:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/39 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/39 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/39 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/39 [00:00<?, ?it/s]

In [3]:
import torch
import itertools
from datetime import datetime

def generate_text(model, tokenizer, prompt: str, max_tokens=20) -> str:
    """
    Generate a completion for `prompt` and return the decoded text.

    The model is switched to eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards, so calling this from
    inside a training loop does not silently leave the model in eval mode.

    Args:
      model: module exposing `parameters()` and `generate(ids, max_tokens)`
      tokenizer: object exposing `encode(str)` and `decode(list_of_ids)`
      prompt: text to condition the generation on
      max_tokens: value forwarded to `model.generate`

    Returns:
      the text decoded from the first sequence returned by `model.generate`
    """
    was_training = model.training
    model.eval()
    try:
        # run on whatever device the model's parameters live on
        device = next(model.parameters()).device

        encoded = tokenizer.encode(prompt)
        encoded_ids = torch.tensor([encoded], dtype=torch.long).to(device)

        # Model inference
        with torch.no_grad():
            output_token_ids = model.generate(encoded_ids, max_tokens)
    finally:
        # put the model back in the mode the caller had it in
        model.train(was_training)

    decoded_ids_list = output_token_ids[0].cpu().tolist()
    return tokenizer.decode(decoded_ids_list)

def estimate_loss(model, loader, device, eval_iters):
    """
    Average the model's loss over at most `eval_iters` batches from `loader`.

    The mean is taken over the batches that were actually produced, so a
    loader that yields fewer than `eval_iters` batches does not drag the
    estimate toward zero. Gradients are disabled during the pass and the
    model's previous train/eval mode is restored afterwards.

    Args:
      model: callable as `model(X, Y)` returning `(logits, loss)`
      loader: iterable yielding `(X, Y)` tensor pairs
      device: device the batches are moved to
      eval_iters: maximum number of batches to pull from `loader`

    Returns:
      0-dim float tensor with the mean loss (NaN if no batch was produced)
    """
    was_training = model.training
    model.eval()
    batch_losses = []
    try:
        with torch.no_grad():
            for X, Y in itertools.islice(loader, eval_iters):
                X, Y = X.to(device), Y.to(device)
                _, loss = model(X, Y)
                batch_losses.append(loss.item())
    finally:
        model.train(was_training)

    if not batch_losses:
        # nothing was evaluated; NaN makes that visible instead of reporting 0
        return torch.tensor(float("nan"))
    return torch.tensor(batch_losses).mean()

# called form of the decorator: works on PyTorch releases that do not accept
# the bare `@torch.no_grad` spelling
@torch.no_grad()
def evaluate_model(model, train_loader, val_loader, device, eval_iters):
    """
    Estimate the loss on the training and validation loaders without gradients.

    Args:
      model: to evaluate
      train_loader: training dataset iterator
      val_loader: validation dataset iterator
      device: device the batches are moved to before the forward pass
      eval_iters: the number of iterations to pull from the loaders

    Returns:
      dict with 'train' and 'val' loss
    """
    # NOTE(review): when train_loader is the same object the training loop is
    # currently iterating, pulling from it here may consume or restart that
    # stream depending on how the loader implements iteration -- confirm.
    train_loss = estimate_loss(model, train_loader, device, eval_iters)
    val_loss = estimate_loss(model, val_loader, device, eval_iters)
    return {'train': train_loss, 'val': val_loss}

def print_gpu_memory_stats(checkpoint_name, device):
    """
    Print current and peak CUDA memory figures (in MB) for `device`.

    Nothing is printed unless CUDA is available and `device.type` is 'cuda'.

    Args:
      checkpoint_name: label shown in the report header
      device: object with a `type` attribute, passed on to the torch.cuda memory queries
    """
    if not (torch.cuda.is_available() and device.type == 'cuda'):
        return

    bytes_per_mb = 1024**2
    allocated = torch.cuda.memory_allocated(device) / bytes_per_mb
    reserved = torch.cuda.memory_reserved(device) / bytes_per_mb
    max_allocated = torch.cuda.max_memory_allocated(device) / bytes_per_mb
    max_reserved = torch.cuda.max_memory_reserved(device) / bytes_per_mb

    report_lines = (
        f"--- GPU Memory Stats at: {checkpoint_name} ({device}) ---",
        f"  Current Allocated: {allocated:.2f} MB",
        f"  Current Reserved:  {reserved:.2f} MB",
        f"  Peak Allocated:    {max_allocated:.2f} MB",
        f"  Peak Reserved:     {max_reserved:.2f} MB",
        "----------------------------------------------------",
    )
    for line in report_lines:
        print(line)

def train_model(model, train_loader, val_loader, optimizer, gen_f, cfg):
    """
    Train `model` for `cfg.num_epochs` epochs with periodic logging, evaluation and checkpointing.

    Every `cfg.log_interval` steps the mean running loss is printed. Every
    `cfg.eval_interval` steps the train/val loss is estimated, a sample
    completion is printed, GPU memory stats are reported and the model's
    state_dict is written to `cfg.save_filename`.

    Args:
      model: callable as `model(X, Y)` returning `(logits, loss)`; may or may
             not be wrapped by `torch.compile`
      train_loader: iterable yielding `(X, Y)` training batches
      val_loader: iterable yielding `(X, Y)` validation batches
      optimizer: optimizer over the model's parameters
      gen_f: callable taking the model and returning a text completion
      cfg: config exposing device, n_params, save_filename, num_epochs,
           log_interval, eval_interval and eval_iters
    """
    device = torch.device(cfg.device) # Ensure device object

    if device.type == 'cuda':
        print("Starting training on CUDA device. Initializing memory stats.")
        # Reset peak stats at the beginning of training if you want to track peaks per training run
        torch.cuda.reset_peak_memory_stats(device)
        print_gpu_memory_stats("Start of training_model", device)

    print(f"started training model with {cfg.n_params:_} parameters. model parameters file: {cfg.save_filename}")

    for epoch in range(cfg.num_epochs):
        model.train()
        running_loss = 0.0
        print(f"[{epoch + 1} / {cfg.num_epochs}]: starting at {datetime.now()}, will show running loss every {cfg.log_interval} steps, will eval every {cfg.eval_interval} steps")
        if device.type == 'cuda':
            print_gpu_memory_stats(f"Start of Epoch {epoch + 1}", device)

        for i, (X, Y) in enumerate(train_loader):
            # use the resolved device object consistently
            X, Y = X.to(device), Y.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            logits, loss = model(X, Y)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if (i + 1) % cfg.log_interval == 0:
                print(f"[{epoch + 1}  {i + 1:5d}]: running loss {running_loss / cfg.log_interval:.3f}")
                running_loss = 0.0

            if (i + 1) % cfg.eval_interval == 0:
                losses = evaluate_model(model, train_loader, val_loader, device, eval_iters=cfg.eval_iters)
                print(f"[{epoch + 1}  {i + 1:5d}]: train loss: {losses['train']:.4f}, val loss: {losses['val']:.4f}, eval_iters: {cfg.eval_iters}")
                completion = gen_f(model)
                print(f"[{epoch + 1}  {i + 1:5d}]: {completion}")
                print_gpu_memory_stats(f"[{epoch + 1}  {i + 1:5d}]", device)

                # torch.compile wraps the module and exposes the original as
                # `_orig_mod`; fall back to the model itself when it was not
                # compiled so checkpointing works in both cases
                module_to_save = getattr(model, "_orig_mod", model)
                torch.save(module_to_save.state_dict(), cfg.save_filename)

                # evaluation / generation helpers may have switched the model
                # to eval mode; make sure the remaining steps train normally
                model.train()


In [None]:
model = GPT2Model(cfg)

load_weights = True
if load_weights and os.path.exists(cfg.save_filename):
    # the checkpoint file holds a state_dict, so load it INTO the freshly
    # built model instead of rebinding `model` to the loaded dictionary
    state_dict = torch.load(cfg.save_filename, map_location=cfg.device, weights_only=True)
    model.load_state_dict(state_dict)
    print(f"loaded model weights: {cfg.save_filename}")

if cfg.device == 'cuda':
    # cast the parameters to bfloat16 when running on CUDA
    model.to(torch.bfloat16)

model.to(cfg.device)

if cfg.compile_model:
    model = torch.compile(model)


def gen_f(m):
    """Produce a short sample completion for a fixed prompt with model `m`."""
    return generate_text(m, encoding, "Paris is")


optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.learning_rate, weight_decay=cfg.weight_decay)
train_model(model, train_dataset, val_dataset, optimizer, gen_f, cfg=cfg)

# save a state_dict of the underlying (uncompiled) module so the file matches
# the checkpoints written during training and can be read back with
# weights_only=True by the loading code above
torch.save(getattr(model, "_orig_mod", model).state_dict(), cfg.save_filename)

print('Finished Training')

started training model with 30_023_808 parameters. model parameters file: shahrukhx01_wikipedia-bookscorpus-en-preprocessed.30023808.pt
[1 / 20]: starting at 2025-06-10 21:06:18.352641, will show running loss every 100 steps, will eval every 500 steps


No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


In [7]:
import gc
import torch

def safe_delete(var_names: list[str]) -> None:
    """
    removes the given names from the global namespace this function was
    defined in (the notebook's top-level scope when defined in a cell), so the
    objects they referred to can be garbage collected.

    names that are not currently defined are skipped silently.

    args:
        var_names: names of the global variables to unbind.
    """
    # `locals()` inside a function is only a throwaway view of this function's
    # own variables, so deleting from it can never unbind a notebook variable;
    # operate on the globals dictionary directly.
    namespace = globals()
    for name in var_names:
        namespace.pop(name, None)

# drop the notebook-level references first: the datasets are bound to
# `train_dataset` / `val_dataset` in this notebook (there are no
# `train_loader` / `val_loader` globals), then collect garbage so the memory
# is actually released before asking CUDA to return its cached blocks
safe_delete(['model', 'optimizer', 'train_dataset', 'val_dataset'])
gc.collect()

if torch.cuda.is_available():
    torch.cuda.empty_cache()