# Production QLoRA Training — Llama-2-13B on H100

Enterprise-grade fine-tuning pipeline optimized for NVIDIA H100 80GB HBM3.

**Target**: 5,000 steps in ~52–70 minutes at ~1.2–1.6 it/s.

In [None]:
import os
# Process-wide environment switches. They are set in the first cell so they
# are already in place when the later cells import torch and transformers.
os.environ.update({
    'PYDEVD_DISABLE_FILE_VALIDATION': '1',
    'TOKENIZERS_PARALLELISM': 'false',
    'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True',
})

In [None]:
!pip install -q -U bitsandbytes accelerate peft transformers datasets tqdm faiss-cpu sentence-transformers flash-attn --no-build-isolation

try:
    import bitsandbytes
    print(f'[OK] bitsandbytes {bitsandbytes.__version__}')
except ImportError:
    print('[FATAL] bitsandbytes not found. Restarting runtime...')
    import os; os.kill(os.getpid(), 9)

In [None]:
# Mount Google Drive at /content/drive. Later cells write checkpoints, the
# final adapter and the RAG index under /content/drive/MyDrive, so this must
# run first. Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

## 0. Hardware Diagnostics

In [None]:
import torch
import time

# === H100 Hardware Verification ===
# Raise explicitly instead of using `assert`, so the check still runs when
# Python is started with -O (which strips assert statements).
if not torch.cuda.is_available():
    raise RuntimeError('FATAL: No CUDA device found')

gpu_name = torch.cuda.get_device_name(0)
# Total device memory converted from bytes to GiB.
vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3

print('=' * 60)
print('HARDWARE DIAGNOSTICS')
print('=' * 60)
print(f'  GPU:           {gpu_name}')
print(f'  VRAM:          {vram_gb:.1f} GB')
print(f'  CUDA Version:  {torch.version.cuda}')
print(f'  PyTorch:       {torch.__version__}')
print(f'  BF16 Support:  {torch.cuda.is_bf16_supported()}')

# Enable TF32 for H100 tensor core acceleration
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
print('  TF32:          Enabled')

# Memory size alone does not identify the card: other GPUs also ship with
# 80 GB. Only report "H100 detected" when the device name says so.
if vram_gb < 70:
    print(f'  ⚠ WARNING: This notebook is optimized for H100 (80GB). Current GPU has {vram_gb:.0f} GB.')
elif 'h100' not in gpu_name.lower():
    print(f'  ⚠ WARNING: {gpu_name} has {vram_gb:.0f} GB but is not an H100; throughput targets may not be met.')
else:
    print('  ✓ H100 detected. Full speed mode.')
print('=' * 60)

## 1. Load Model — 4-bit QLoRA + Flash Attention 2

- **No gradient checkpointing** (H100 has headroom)
- **Flash Attention 2** for O(N) memory attention
- **BF16 compute dtype** for H100 tensor cores

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from tqdm import tqdm

MODEL_NAME = 'NousResearch/Llama-2-13b-hf'
MAX_LENGTH = 2048

print(f'Loading {MODEL_NAME} in 4-bit NF4...')

# Tokenizer: the EOS token doubles as the padding token, and the maximum
# sequence length is capped at MAX_LENGTH.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = MAX_LENGTH

# 4-bit NF4 quantization with double quantization; matmuls compute in BF16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Loader options gathered in one place: quantized weights, BF16 dtype,
# automatic device placement and the Flash Attention 2 kernels.
load_kwargs = {
    'quantization_config': bnb_config,
    'torch_dtype': torch.bfloat16,
    'device_map': 'auto',
    'attn_implementation': 'flash_attention_2',
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_kwargs)

# Prepare for QLoRA — gradient checkpointing is explicitly left off.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)

# Freeze every base-model parameter; the adapters added in the next cell
# are the only weights meant to train.
for weight in model.parameters():
    weight.requires_grad = False

print('Base model loaded. Applying LoRA...')

In [None]:
# LoRA hyperparameters are named once and reused by both the config and the
# printed summary, so the summary cannot drift from what is actually applied.
lora_rank = 32
lora_alpha = 64
lora_targets = ['q_proj', 'k_proj', 'v_proj', 'o_proj']

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=lora_rank,
    lora_alpha=lora_alpha,
    lora_dropout=0.05,
    bias='none',
    target_modules=lora_targets,
)

model = get_peft_model(model, peft_config)

# Verify only LoRA params are trainable.
# PEFT's own counter is used rather than summing p.numel(): bitsandbytes
# stores 4-bit weights packed, so numel() under-reports the quantized layers
# and would inflate the trainable percentage. The PEFT helper corrects for it.
trainable, total = model.get_nb_trainable_parameters()

print('=' * 60)
print('LORA CONFIGURATION')
print('=' * 60)
print(f'  Total params:     {total:,}')
print(f'  Trainable params: {trainable:,}')
print(f'  Trainable %:      {100 * trainable / total:.2f}%')
print(f'  LoRA rank:        {lora_rank}')
print(f'  LoRA alpha:       {lora_alpha}')
print(f'  Targets:          {", ".join(lora_targets)}')
print(f'  Sequence length:  {MAX_LENGTH}')
print('=' * 60)

## 2. Prepare Streaming Dataset

In [None]:
from datasets import load_dataset

NUM_SAMPLES = 1_000_000

print(f'Configuring stream for {NUM_SAMPLES:,} samples...')

# Stream the training split instead of downloading it up front.
raw_dataset = load_dataset('HuggingFaceFW/fineweb-edu', split='train', streaming=True)


def tokenize_stream(examples):
    """Tokenize a batch of rows for causal-LM training.

    Reads the ``'text'`` field of ``examples``, truncates and pads every entry
    to ``MAX_LENGTH`` tokens, and returns only ``input_ids`` and
    ``attention_mask``.
    """
    encoded = tokenizer(
        examples['text'],
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length',
    )
    return {field: encoded[field] for field in ('input_ids', 'attention_mask')}


# Pull one record from the stream to discover the column names, so every
# original column can be dropped after tokenization.
sample = next(iter(raw_dataset))
all_columns = list(sample)

tokenized_dataset = raw_dataset.map(
    tokenize_stream,
    batched=True,
    batch_size=1000,
    remove_columns=all_columns,
)

# Buffered shuffle with a fixed seed, then cap the stream at NUM_SAMPLES rows.
shuffled_dataset = tokenized_dataset.shuffle(seed=42, buffer_size=10_000)
shuffled_dataset = shuffled_dataset.take(NUM_SAMPLES)
print('Dataset stream ready.')

## 3. Training Configuration — H100 Production

| Parameter | Value |
| --- | --- |
| Batch Size | 16 |
| Grad Accumulation | 1 (none) |
| Grad Checkpointing | OFF |
| Precision | BF16 |
| Optimizer | paged_adamw_32bit |
| LR | 1e-4 (cosine, 3% warmup) |
| Workers | 8 + pin_memory + persistent |

In [None]:
output_dir = '/content/drive/MyDrive/fineweb_edu_llama2_13b/checkpoints'
os.makedirs(output_dir, exist_ok=True)

# Causal-LM collation (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir=output_dir,
    # Batch shape and schedule length
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    max_steps=5000,
    # Optimizer and learning-rate schedule
    optim='paged_adamw_32bit',
    learning_rate=1e-4,
    lr_scheduler_type='cosine',
    warmup_ratio=0.03,
    # Numeric precision
    bf16=True,
    fp16=False,
    # Logging and checkpointing
    logging_steps=20,
    save_steps=500,
    save_total_limit=3,
    report_to='none',
    # Data loading; the dataset's columns are passed through to the collator untouched
    remove_unused_columns=False,
    dataloader_num_workers=8,
    dataloader_pin_memory=True,
    dataloader_drop_last=True,
    dataloader_persistent_workers=True,
)

# Human-readable recap of the settings above, emitted as one banner.
banner = '=' * 60
summary_lines = [
    banner,
    'TRAINING ARGUMENTS',
    banner,
    '  Batch Size:          16',
    '  Grad Accumulation:   1',
    '  Max Steps:           5,000',
    '  Precision:           BF16',
    '  Grad Checkpointing:  OFF',
    '  Optimizer:           paged_adamw_32bit',
    '  LR:                  1e-4 (cosine, 3% warmup)',
    '  Dataloader Workers:  8 (persistent, pinned)',
    '  TF32:                Enabled',
    f'  Checkpoints:         {output_dir}',
    banner,
]
print('\n'.join(summary_lines))

In [None]:
import logging
from transformers import TrainerCallback

class ThroughputCallback(TrainerCallback):
    """Print running throughput and an ETA every time the Trainer logs.

    Throughput is averaged from the first training step this callback sees,
    so on a resumed run only the steps executed in this session are counted.

    Args:
        max_length: Sequence length used to convert steps into tokens.
        batch_size: Number of sequences consumed per training step. Tokens per
            second is computed as ``it/s * batch_size * max_length``, so
            padded positions are included in the figure.
        log_interval: Stored on the instance but not consulted; reports are
            printed on every ``on_log`` event. Kept so existing callers that
            pass it keep working.
    """

    def __init__(self, max_length, batch_size, log_interval=100):
        self.max_length = max_length
        self.batch_size = batch_size
        self.log_interval = log_interval
        # Wall-clock time and global step of the first step observed.
        self.start_time = None
        self.start_step = 0

    def on_step_begin(self, args, state, control, **kwargs):
        """Record the time and step of the first step seen by this callback."""
        if self.start_time is None:
            self.start_time = time.time()
            self.start_step = state.global_step

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print it/s, tok/s and ETA; warn when throughput is low."""
        if self.start_time is None:
            return

        steps_done = state.global_step - self.start_step
        elapsed = time.time() - self.start_time
        # Nothing measurable yet. The elapsed guard also prevents a
        # ZeroDivisionError if the clock has not advanced since the first step.
        if steps_done <= 0 or elapsed <= 0:
            return

        steps_per_sec = steps_done / elapsed
        tokens_per_sec = steps_per_sec * self.batch_size * self.max_length
        remaining_steps = args.max_steps - state.global_step
        eta_min = remaining_steps / steps_per_sec / 60

        print(f'  [PERF] Step {state.global_step}/{args.max_steps} | '
              f'{steps_per_sec:.2f} it/s | '
              f'{tokens_per_sec:,.0f} tok/s | '
              f'ETA: {eta_min:.0f} min')

        # Skip the first 50 steps before judging speed. The trigger is
        # 1.0 it/s, although the message cites the 1.2 it/s target.
        if steps_per_sec < 1.0 and steps_done > 50:
            print(f'  ⚠ WARNING: Throughput {steps_per_sec:.2f} it/s is below target (1.2 it/s). Check config.')

# The throughput callback reads the batch size from training_args instead of
# repeating the literal, so its tok/s figure stays correct when
# per_device_train_batch_size is changed (e.g. lowered after an OOM).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=shuffled_dataset,
    data_collator=data_collator,
    callbacks=[
        ThroughputCallback(
            MAX_LENGTH,
            batch_size=training_args.per_device_train_batch_size,
        )
    ],
)

print('Trainer ready.')

## 4. Train (Auto-Resume + OOM Safety)

In [None]:
from transformers.trainer_utils import get_last_checkpoint

# Resume from the newest checkpoint in output_dir when one exists.
last_checkpoint = get_last_checkpoint(output_dir)

train_start = time.time()

try:
    if last_checkpoint is not None:
        print(f'Resuming from checkpoint: {last_checkpoint}')
        trainer.train(resume_from_checkpoint=last_checkpoint)
    else:
        print('Starting fresh Llama-2-13B QLoRA training...')
        trainer.train()

except torch.cuda.OutOfMemoryError:
    print('\n' + '=' * 60)
    print('FATAL: CUDA Out of Memory')
    print('=' * 60)
    print(f'  Allocated: {torch.cuda.memory_allocated() / 1024**3:.1f} GB')
    print(f'  Reserved:  {torch.cuda.memory_reserved() / 1024**3:.1f} GB')
    print(f'  Total:     {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB')
    print('  Action:    Reduce per_device_train_batch_size to 8 and retry.')
    print('=' * 60)
    torch.cuda.empty_cache()
    # Re-raise after reporting. Swallowing the error would let the following
    # cell save an incompletely trained adapter and report success.
    raise

else:
    # Reached only when training finished without raising.
    train_elapsed = time.time() - train_start
    print('=' * 60)
    print(f'TRAINING COMPLETE — {train_elapsed / 60:.1f} minutes')
    print('=' * 60)

In [None]:
# Write the trained model and the tokenizer side by side in one directory.
final_model_dir = '/content/drive/MyDrive/fineweb_edu_llama2_13b/final_model'
print(f'Saving LoRA adapters to: {final_model_dir}')
for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact(final_model_dir)
print('Model saved successfully.')

## 5. Build RAG Index

In [None]:
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np

RAG_SAMPLES = 100_000
RAG_DIR = '/content/drive/MyDrive/fineweb_edu_llama2_13b/rag_index'
os.makedirs(RAG_DIR, exist_ok=True)

# Take the first RAG_SAMPLES documents of the stream as retrieval source.
rag_stream = raw_dataset.take(RAG_SAMPLES)
passages = []

print('Extracting passages...')
for row in tqdm(rag_stream, total=RAG_SAMPLES):
    document = row['text'].strip()
    # Cut each document into consecutive 500-character windows and keep only
    # the windows that are longer than 50 characters after stripping.
    windows = (
        document[start:start + 500].strip()
        for start in range(0, len(document), 500)
    )
    passages.extend(piece for piece in windows if len(piece) > 50)

print(f'Encoding {len(passages):,} passages...')
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = embedder.encode(
    passages,
    batch_size=256,
    show_progress_bar=True,
    convert_to_numpy=True,
)

# Inner product over L2-normalized vectors is cosine similarity.
faiss.normalize_L2(embeddings)
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(embeddings)

# Persist the index together with the passage texts it was built from.
# The passages are stored as an object array.
index_path = os.path.join(RAG_DIR, 'faiss_index.bin')
passages_path = os.path.join(RAG_DIR, 'passages.npy')
faiss.write_index(index, index_path)
np.save(passages_path, np.array(passages, dtype=object))
print(f'RAG index saved: {len(passages):,} passages, dim={embeddings.shape[1]}')