# 🌙 Luna-LM Large — H100 Training

**Model:** Large (~350M params) — emb_dim=1024, n_heads=16, n_layers=24  
**GPU:** H100 (80GB VRAM)  
**Pipeline:** Pretrain → SFT

## Adımlar
1. GPU doğrula
2. Google Drive bağla
3. Repo kur
4. Bağımlılıkları yükle
5. Corpus yükle
6. Pretrain
7. SFT
8. Modeli indir

In [None]:
# 1. Verify the GPU
import torch

# Check availability first: the device queries below raise their own CUDA
# error on a CPU-only runtime, which would hide the actionable hint.
if not torch.cuda.is_available():
    raise RuntimeError('‚ùå GPU bulunamadƒ±! Runtime > Change runtime type > H100')

print(f'GPU: {torch.cuda.get_device_name(0)}')
print(f'VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB')
print(f'BF16 destekli: {torch.cuda.is_bf16_supported()}')
print('‚úÖ GPU hazƒ±r!')

In [None]:
# 2. Mount Google Drive
import os

from google.colab import drive

# Checkpoints and the corpus are stored under this Drive folder.
DRIVE_DIR = '/content/drive/MyDrive/LunaLM'

drive.mount('/content/drive')
os.makedirs(DRIVE_DIR, exist_ok=True)
print(f'‚úÖ Drive baƒülandƒ±: {DRIVE_DIR}')

In [None]:
# 3. Repo Kur
# Se√ßenek A: GitHub'dan clone et (repo public'se)
# !git clone https://github.com/iatagun/Luna-LM.git /content/Luna-LM

# Se√ßenek B: Drive'daki zip'i kullan
# !cp '/content/drive/MyDrive/LunaLM/Luna-LM.zip' /content/
# !unzip -q /content/Luna-LM.zip -d /content/

# Se√ßenek C: GitHub CLI ile clone (√∂nerilen)
!git clone https://github.com/iatagun/Luna-LM.git /content/Luna-LM

import sys
sys.path.insert(0, '/content/Luna-LM')
os.chdir('/content/Luna-LM')
print('‚úÖ Repo hazƒ±r!')

In [None]:
# 4. Install dependencies
!pip install -q transformers datasets

# Flash Attention (large speed-up on H100)
# NOTE(review): nothing in this notebook imports flash_attn directly; presumably the luna package uses it — confirm.
!pip install -q flash-attn --no-build-isolation

print('‚úÖ Paketler y√ºklendi!')

In [None]:
# 5. Load the corpus
# A copy already inside the repo checkout (e.g. uploaded with cell 5b) is used
# as-is; otherwise the file is copied over from Drive.
import shutil

CORPUS_DRIVE = '/content/drive/MyDrive/LunaLM/foundation_corpus_clean.txt'
CORPUS_LOCAL = '/content/Luna-LM/foundation_corpus_clean.txt'

if not os.path.exists(CORPUS_LOCAL) and os.path.exists(CORPUS_DRIVE):
    print('Corpus Drive\'dan kopyalanƒ±yor...')
    # Copy to a temporary name and rename afterwards, so an interrupted copy
    # cannot leave a truncated corpus that the next run accepts as complete.
    # shutil also raises on failure, unlike `!cp`, whose exit status is ignored.
    partial_copy = CORPUS_LOCAL + '.part'
    shutil.copyfile(CORPUS_DRIVE, partial_copy)
    os.replace(partial_copy, CORPUS_LOCAL)

if os.path.exists(CORPUS_LOCAL):
    size_gb = os.path.getsize(CORPUS_LOCAL) / 1024**3
    print(f'‚úÖ Corpus hazƒ±r: {size_gb:.2f} GB')
else:
    print('‚ö†Ô∏è  Corpus Drive\'da bulunamadƒ±!')
    print('L√ºtfen foundation_corpus_clean.txt dosyasƒ±nƒ± ≈üuraya y√ºkleyin:')
    print(f'  {CORPUS_DRIVE}')
    print('Veya a≈üaƒüƒ±daki h√ºcreyi √ßalƒ±≈ütƒ±rarak doƒürudan y√ºkleyin:')

In [None]:
# 5b. (Alternative) Upload the corpus directly
# Run this cell only when the corpus is not on Drive.

# from google.colab import files
# uploaded = files.upload()  # pick foundation_corpus_clean.txt
# !mv foundation_corpus_clean.txt /content/Luna-LM/

In [None]:
# 6. PRETRAIN ‚Äî Large Model (H100 Optimize)

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import json, random, time, os
from datetime import datetime

from luna.tokenizer import PretrainedTurkishTokenizer
from luna.data import create_dataloader_pretrained
from luna.model import GPTModel, MODEL_CONFIGS
from luna.generate import generate_text

# ==========================================
# SETTINGS — tuned for an H100 (80GB)
# ==========================================

# Model / data shape
MODEL_SIZE = 'large'      # per the original note: emb_dim=1024, n_heads=16, n_layers=24
CONTEXT_LEN = 1024        # longer context for the large model
BATCH_SIZE = 32           # large batch, relying on the 80GB of VRAM
MAX_LINES = None          # None = read the whole corpus (3.6GB per the original note)

# Optimisation
NUM_EPOCHS = 3
LEARNING_RATE = 6e-4      # original note: "slightly higher for the large model"
WEIGHT_DECAY = 0.1

# Evaluation cadence
EVAL_FREQ = 500           # evaluate every EVAL_FREQ optimizer steps
EVAL_ITER = 20            # batches sampled per evaluation

# Precision / device
USE_BF16 = True           # H100 supports BF16 natively
device = torch.device('cuda')
dtype = torch.bfloat16 if USE_BF16 else torch.float32

# Echo the effective settings (labels are padded so the values line up).
for label, value in (
    ('Device: ', device),
    ('DType:  ', dtype),
    ('Model:  ', MODEL_SIZE),
    ('Batch:  ', BATCH_SIZE),
    ('Ctx:    ', CONTEXT_LEN),
):
    print(f'{label}{value}')

In [None]:
# Tokenizer — built from the 'dbmdz/bert-base-turkish-cased' identifier
# (presumably a Hugging Face model name; confirm in luna.tokenizer).
tokenizer = PretrainedTurkishTokenizer('dbmdz/bert-base-turkish-cased')
vocab_size = tokenizer.vocab_size  # reused as the model's 'vocab_size' when the config is built
print(f'Vocab: {vocab_size:,}')

In [None]:
# Read the corpus, shuffle it reproducibly and split it into train/val text
corpus_path = '/content/Luna-LM/foundation_corpus_clean.txt'

print('Corpus y√ºkleniyor...')
lines = []
with open(corpus_path, 'r', encoding='utf-8') as corpus_file:
    for line_no, raw_line in enumerate(corpus_file, start=1):
        # MAX_LINES caps the number of raw lines read; a falsy value means no cap.
        if MAX_LINES and line_no > MAX_LINES:
            break
        text = raw_line.strip()
        if text:  # blank lines are dropped
            lines.append(text)
        # Progress counts raw lines, including the blank ones that were skipped.
        if line_no % 500000 == 0:
            print(f'  {line_no:,} satƒ±r okundu...')

print(f'‚úÖ {len(lines):,} satƒ±r y√ºklendi')

# Fixed seed so the shuffle, and therefore the split, is reproducible.
random.seed(42)
random.shuffle(lines)

# 95/5 train/validation split (the original note justifies this by the corpus size).
split = int(0.95 * len(lines))
train_text, val_text = '\n'.join(lines[:split]), '\n'.join(lines[split:])
del lines  # only the joined strings are needed from here on

print(f'Train: {len(train_text)/1e9:.2f}B karakter')
print(f'Val:   {len(val_text)/1e6:.1f}M karakter')

In [None]:
# DataLoaders
# Both splits share the batch size and window settings; only shuffling differs.
# stride == max_length — presumably non-overlapping windows; confirm in luna.data.
loader_kwargs = dict(batch_size=BATCH_SIZE, max_length=CONTEXT_LEN, stride=CONTEXT_LEN)

train_loader = create_dataloader_pretrained(train_text, tokenizer, shuffle=True, **loader_kwargs)
val_loader = create_dataloader_pretrained(val_text, tokenizer, shuffle=False, **loader_kwargs)

print(f'Train batches: {len(train_loader):,}')
print(f'Val batches:   {len(val_loader):,}')

In [None]:
# Build the model
cfg = MODEL_CONFIGS[MODEL_SIZE]
model_config = {
    'vocab_size':      vocab_size,
    'context_length':  CONTEXT_LEN,
    'emb_dim':         cfg['emb_dim'],
    'n_heads':         cfg['n_heads'],
    'n_layers':        cfg['n_layers'],
    'drop_rate':       0.1,
    'qkv_bias':        False,
}

# Parameters deliberately stay in FP32. The training loop already runs the
# forward pass under BF16 autocast, and PyTorch's AMP guidance is not to call
# .bfloat16()/.half() on the model as well: with BF16 weights the optimizer
# state is BF16 too, and small AdamW updates get rounded away.
model = GPTModel(model_config).to(device)

# torch.compile — the original note claims roughly a 30% speed-up on H100 (not measured here).
model = torch.compile(model)

total_params = sum(p.numel() for p in model.parameters())
print(f'‚úÖ Model hazƒ±r: {total_params/1e6:.1f}M parametre')
print(f'   emb_dim={cfg["emb_dim"]}, n_heads={cfg["n_heads"]}, n_layers={cfg["n_layers"]}')

In [None]:
# Optimizer & schedule length

# The scheduler is stepped once per training batch, so the schedule spans
# every batch of every epoch.
total_steps = len(train_loader) * NUM_EPOCHS
warmup_steps = int(total_steps * 0.02)   # 2% warmup

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    betas=(0.9, 0.95),   # GPT-3 style
    eps=1e-8,
    weight_decay=WEIGHT_DECAY,
    fused=True,          # fused optimizer kernel
)

import math


def lr_lambda(step, warmup=None, total=None):
    """Return the learning-rate multiplier for ``step``.

    Linear warmup from 0 to 1 over the warmup steps, then cosine decay from
    1 down to a floor of 0.1 (10% of the peak learning rate).

    Args:
        step: Scheduler step index (0-based).
        warmup: Number of warmup steps; defaults to the module-level
            ``warmup_steps``.
        total: Total number of scheduled steps; defaults to the module-level
            ``total_steps``.

    Returns:
        A float multiplier in [0, 1].
    """
    warmup = warmup_steps if warmup is None else warmup
    total = total_steps if total is None else total

    if step < warmup:
        return step / max(warmup, 1)

    decay_steps = max(total - warmup, 1)
    # Clamp at 1.0: without it the cosine turns upward again once step exceeds
    # total (e.g. when the training cell is re-run with the same scheduler).
    progress = min((step - warmup) / decay_steps, 1.0)
    return 0.1 + 0.9 * 0.5 * (1.0 + math.cos(math.pi * progress))

# LambdaLR scales the optimizer's base lr by lr_lambda(step) on every scheduler step.
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

print(f'Total steps:  {total_steps:,}')
print(f'Warmup steps: {warmup_steps:,}')

In [None]:
# Output directory (on Drive, so it is kept even after the session ends)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
save_dir = f'{DRIVE_DIR}/pretrain_large_{timestamp}'
os.makedirs(save_dir, exist_ok=True)

# Snapshot of the run settings, written next to the checkpoints.
run_settings = {
    'model_size': MODEL_SIZE,
    'batch_size': BATCH_SIZE,
    'num_epochs': NUM_EPOCHS,
    'learning_rate': LEARNING_RATE,
    'context_length': CONTEXT_LEN,
    'use_bf16': USE_BF16,
}
run_config = {
    'model_config': model_config,
    'training_config': run_settings,
    'tokenizer': 'dbmdz/bert-base-turkish-cased',
    'timestamp': timestamp,
}
with open(f'{save_dir}/config.json', 'w') as f:
    json.dump(run_config, f, indent=2)

print(f'‚úÖ Save dir: {save_dir}')

In [None]:
# PRETRAIN LOOP


def _autocast():
    """Return a fresh BF16 autocast context (disabled when USE_BF16 is False)."""
    # torch.autocast replaces the deprecated torch.cuda.amp.autocast.
    return torch.autocast(device_type='cuda', dtype=torch.bfloat16, enabled=USE_BF16)


def _batch_loss(inputs, targets):
    """Return the cross-entropy loss of ``model`` on one (inputs, targets) batch."""
    with _autocast():
        logits = model(inputs.to(device))
        return torch.nn.functional.cross_entropy(
            logits.flatten(0, 1), targets.to(device).flatten()
        )


def _estimate_loss(loader, max_batches):
    """Average the loss over at most ``max_batches`` batches from ``loader``.

    Divides by the number of batches actually evaluated, so a loader shorter
    than ``max_batches`` is not under-reported. Returns NaN for an empty
    loader, which can never compare as a new best loss.
    """
    total, seen = 0.0, 0
    with torch.no_grad():
        for inputs, targets in loader:
            if seen >= max_batches:
                break
            total += _batch_loss(inputs, targets).item()
            seen += 1
    return total / seen if seen else float('nan')


def _plain_state_dict(module):
    """Return the state dict of the underlying, uncompiled module.

    torch.compile wraps the model, and the wrapper's state-dict keys carry an
    '_orig_mod.' prefix that a plain model cannot load. Saving the wrapped
    module's own state dict keeps checkpoints loadable without torch.compile.
    """
    return getattr(module, '_orig_mod', module).state_dict()


train_losses, val_losses, tokens_seen_log = [], [], []
tokens_seen = 0
global_step = 0
best_val_loss = float('inf')

print('='*60)
print('PRETRAIN BA≈ûLIYOR')
print('='*60)

for epoch in range(NUM_EPOCHS):
    model.train()
    ep_start = time.time()

    for input_batch, target_batch in train_loader:
        optimizer.zero_grad()
        loss = _batch_loss(input_batch, target_batch)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        tokens_seen += input_batch.numel()
        global_step += 1

        if global_step % EVAL_FREQ != 0:
            continue

        # Periodic evaluation on a small sample of train and val batches.
        model.eval()
        tl = _estimate_loss(train_loader, EVAL_ITER)
        vl = _estimate_loss(val_loader, EVAL_ITER)
        model.train()

        train_losses.append(tl)
        val_losses.append(vl)
        tokens_seen_log.append(tokens_seen)

        lr = optimizer.param_groups[0]['lr']
        print(f'Ep {epoch+1} | Step {global_step:,} | '
              f'Train: {tl:.4f} | Val: {vl:.4f} | '
              f'LR: {lr:.2e} | Tokens: {tokens_seen/1e6:.1f}M')

        if vl < best_val_loss:
            best_val_loss = vl
            torch.save({
                'model_state_dict': _plain_state_dict(model),
                'epoch': epoch, 'step': global_step,
                'val_loss': vl, 'tokens_seen': tokens_seen,
            }, f'{save_dir}/best_model.pt')
            print(f'  ‚úÖ Best model! Val: {vl:.4f}')

    ep_time = (time.time() - ep_start) / 60
    print(f'\nEpoch {epoch+1} tamamlandƒ± ({ep_time:.1f} dk)\n')

    # Per-epoch checkpoint
    torch.save({'model_state_dict': _plain_state_dict(model), 'epoch': epoch},
               f'{save_dir}/epoch_{epoch+1}.pt')

print('‚úÖ PRETRAIN TAMAMLANDI!')
print(f'Best val loss: {best_val_loss:.4f}')

In [None]:
# Loss Grafiƒüi
import matplotlib.pyplot as plt

# Plot the sampled train/val losses against tokens seen and save the figure in save_dir.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(tokens_seen_log, train_losses, label='Train')
ax.plot(tokens_seen_log, val_losses, label='Val', linestyle='--')
ax.set(xlabel='Tokens Seen', ylabel='Loss', title='Luna-LM Large ‚Äî Pretrain')
ax.legend()
ax.grid(alpha=0.3)
fig.tight_layout()
fig.savefig(f'{save_dir}/pretrain_loss.png', dpi=200)
plt.show()

---
## SFT — Alpaca Turkish (82K)

In [None]:
# SFT Dataseti ƒ∞ndir
from datasets import load_dataset
import json

SFT_JSONL = '/content/Luna-LM/sft/sft_dataset.jsonl'


def _to_sft_record(row):
    """Map one dataset row to a ``{'user', 'assistant'}`` record, or None to skip it.

    The optional 'input' field is appended to the instruction on a new line.
    Rows whose prompt or output is empty are skipped. A missing or None
    'instruction' is treated as empty instead of raising TypeError on ``+=``.
    """
    user = row.get('instruction') or ''
    extra = row.get('input')
    if extra:
        user += f"\n{extra}"
    answer = row.get('output')
    if not (user and answer):
        return None
    return {'user': user, 'assistant': answer}


if not os.path.exists(SFT_JSONL):
    print('SFT dataset indiriliyor...')
    ds = load_dataset('cenfis/alpaca-turkish-combined')
    # Write to a temporary file and rename at the end, so an interrupted run
    # cannot leave a truncated file that later runs would accept as complete.
    tmp_path = SFT_JSONL + '.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        for row in ds['train']:
            record = _to_sft_record(row)
            if record is not None:
                f.write(json.dumps(record, ensure_ascii=False) + '\n')
    os.replace(tmp_path, SFT_JSONL)
    print(f'‚úÖ SFT dataset hazƒ±r: {SFT_JSONL}')
else:
    print(f'‚úÖ SFT dataset mevcut: {SFT_JSONL}')

In [None]:
# SFT training
# NOTE: before running, set BATCH_SIZE to 32 in sft/train_sft.py for the H100
# and point it at the latest pretrain checkpoint.
import subprocess
import sys

result = subprocess.run(
    [sys.executable, 'sft/train_sft.py'],  # same interpreter/environment as this notebook
    cwd='/content/Luna-LM',
    check=True,  # surface a failed training run here rather than in a later cell
)

In [None]:
# Copy the SFT checkpoints to Drive (so they are kept after the session ends)
import glob
import shutil

sft_checkpoints = sorted(glob.glob('/content/Luna-LM/checkpoints/sft_*'))
if not sft_checkpoints:
    # With `!cp`, a failed copy only printed an error and the success message
    # below still appeared.
    raise FileNotFoundError('No SFT checkpoints found: /content/Luna-LM/checkpoints/sft_*')

for src in sft_checkpoints:
    dst = os.path.join(DRIVE_DIR, os.path.basename(src))
    if os.path.isdir(src):
        # dirs_exist_ok merges into an existing folder, as `cp -r` does.
        shutil.copytree(src, dst, dirs_exist_ok=True)
    else:
        shutil.copy(src, dst)
print('‚úÖ SFT model Drive\'a kopyalandƒ±!')

# Optional: zip and download
# NOTE(review): this archives the pretrain best_model.pt from save_dir although
# the zip is named "sft" — confirm which checkpoint is intended.
# !zip -r /content/luna_large_sft.zip '{save_dir}/best_model.pt'
# from google.colab import files
# files.download('/content/luna_large_sft.zip')

In [None]:
# Hƒ±zlƒ± Test
from luna.utils import load_model
from luna.generate import generate_text

# System prompt placed at the start of every chat prompt.
SYSTEM_PROMPT = ('Senin adƒ±n Luna. Amacƒ±n insanlara yardƒ±mcƒ± olmak ve sorulara a√ßƒ±k, '
                 'anla≈üƒ±lƒ±r cevaplar vermektir. Emin olmadƒ±ƒüƒ±n konularda bunu belirtir, '
                 'uydurma bilgi eklemezsin.')


def chat(model, tokenizer, device, question, **kwargs):
    """Generate an answer to ``question`` and return only the assistant's text.

    The prompt wraps SYSTEM_PROMPT and the question in <system>/<user> tags
    and ends with an open <assistant> tag.

    Args:
        model, tokenizer, device: Passed straight through to ``generate_text``.
        question: The user's question.
        **kwargs: Extra or overriding keyword arguments for ``generate_text``.
            Defaults: max_new_tokens=200, temperature=0.7, top_k=50,
            repetition_penalty=1.2.

    Returns:
        The text after the last '<assistant>' tag, cut at the first closing or
        next-turn marker and stripped; the raw output if the tag is absent.
    """
    prompt = f'<system>{SYSTEM_PROMPT}</system>\n<user>{question}</user>\n<assistant>'

    # Defaults first, then caller overrides. Passing e.g. temperature=0.3
    # previously raised "got multiple values for keyword argument".
    gen_kwargs = {
        'max_new_tokens': 200,
        'temperature': 0.7,
        'top_k': 50,
        'repetition_penalty': 1.2,
    }
    gen_kwargs.update(kwargs)
    out = generate_text(model, tokenizer, device, prompt, **gen_kwargs)

    if '<assistant>' not in out:
        return out

    ans = out.split('<assistant>')[-1]
    # Cut at whichever end/next-turn marker appears first.
    for s in ['</assistant>', '<user>', '<system>', '[SEP]']:
        ans = ans.split(s)[0]
    return ans.strip()

# Load the last SFT checkpoint in name order
# (the newest one if the directory names carry a sortable timestamp — confirm).
import glob
sft_dirs = sorted(glob.glob('/content/Luna-LM/checkpoints/sft_*'))
if not sft_dirs:
    # Fail with a clear message instead of an IndexError on sft_dirs[-1].
    raise FileNotFoundError('No SFT checkpoints found: /content/Luna-LM/checkpoints/sft_*')
model_t, tokenizer_t, _ = load_model(sft_dirs[-1], device)

test_questions = [
    'G√ºne≈ü hangi y√∂nden doƒüar?',
    'Yapay zeka nedir?',
    'T√ºrkiye\'nin ba≈ükenti neresidir?',
]

for q in test_questions:
    print(f'‚ùì {q}')
    print(f'ü§ñ {chat(model_t, tokenizer_t, device, q)}\n')