# 🌙 Luna-LM Large — H100 Training (v2)

**Model:** Large (~350M params) — emb_dim=1024, n_heads=16, n_layers=24  
**GPU:** H100 (80GB VRAM)  
**Pipeline:** Pretrain → SFT

### Düzeltmeler (v2)
- `BATCH_SIZE`: 32 → 8 (OOM düzeltmesi)
- `CONTEXT_LEN`: 1024 → 512 (bellek tasarrufu)
- `torch.compile` kaldırıldı (OOM kaynağı)
- `autocast` deprecated uyarısı düzeltildi
- `gradient_accumulation_steps=4` eklendi (etkin batch=32)

In [None]:
# 1. Verify GPU
import torch

# Check availability FIRST: the device queries below raise an opaque CUDA
# error on a CPU-only runtime, which would hide this actionable message.
assert torch.cuda.is_available(), '‚ùå GPU yok! Runtime > Change runtime type > H100'

gpu_props = torch.cuda.get_device_properties(0)
print(f'GPU: {torch.cuda.get_device_name(0)}')
print(f'VRAM: {gpu_props.total_memory / 1024**3:.1f} GB')
print(f'BF16 destekli: {torch.cuda.is_bf16_supported()}')
print('‚úÖ GPU hazƒ±r!')

In [None]:
# 2. Mount Google Drive
import os
from google.colab import drive

drive.mount('/content/drive')

# Drive folder that later cells write checkpoints and run artifacts into.
DRIVE_DIR = '/content/drive/MyDrive/LunaLM'
os.makedirs(DRIVE_DIR, exist_ok=True)
print(f'‚úÖ Drive: {DRIVE_DIR}')

In [None]:
# 3. Repo Kur
import sys, os
if not os.path.exists('/content/Luna-LM'):
    !git clone https://github.com/iatagun/Luna-LM.git /content/Luna-LM
else:
    print('Repo zaten mevcut, g√ºncelleniyor...')
    !cd /content/Luna-LM && git pull

sys.path.insert(0, '/content/Luna-LM')
os.chdir('/content/Luna-LM')
print('‚úÖ Repo hazƒ±r!')

In [None]:
# 4. Baƒüƒ±mlƒ±lƒ±klarƒ± Y√ºkle
!pip install transformers datasets

# Kurulumu doƒürula
import transformers, datasets as ds_lib
print(f'‚úÖ transformers {transformers.__version__}')
print(f'‚úÖ datasets     {ds_lib.__version__}')

# Uyarƒ±larƒ± bastƒ±r
import warnings
warnings.filterwarnings('ignore', message='Token indices sequence length')
transformers.logging.set_verbosity_error()

In [None]:
# 5. Fetch the corpus
import os
import shutil

CORPUS_DRIVE = '/content/drive/MyDrive/LunaLM/foundation_corpus_clean.txt'
CORPUS_LOCAL = '/content/Luna-LM/foundation_corpus_clean.txt'

# Copy from Drive to local disk only when no local copy exists yet.
if not os.path.exists(CORPUS_LOCAL):
    if not os.path.exists(CORPUS_DRIVE):
        print('‚ùå Corpus bulunamadƒ±! Drive\'a y√ºkleyin:')
        print(f'   {CORPUS_DRIVE}')
        # Fail here with the path the user must provide, instead of falling
        # through to os.path.getsize() on a local file that does not exist.
        raise FileNotFoundError(CORPUS_DRIVE)
    print('Corpus kopyalanƒ±yor...')
    shutil.copy(CORPUS_DRIVE, CORPUS_LOCAL)

size_gb = os.path.getsize(CORPUS_LOCAL) / 1024**3
print(f'‚úÖ Corpus: {size_gb:.2f} GB')

In [None]:
# 6. SETTINGS (OOM fixes applied)
import json
import math
import os
import random
import time
from datetime import datetime

import torch

from luna.tokenizer import PretrainedTurkishTokenizer
from luna.data import create_dataloader_pretrained
from luna.model import GPTModel, MODEL_CONFIGS
from luna.generate import generate_text

# ------------------------------------------
# Hyper-parameters
# ------------------------------------------
MODEL_SIZE  = 'large'   # key into MODEL_CONFIGS (emb_dim=1024, n_heads=16, n_layers=24)
BATCH_SIZE  = 8         # micro-batch size; lowered from 32 to avoid OOM
GRAD_ACCUM  = 4         # micro-batches per optimizer step: effective batch = 8*4 = 32
CONTEXT_LEN = 512       # sequence length; lowered from 1024 to save memory
NUM_EPOCHS  = 3
LR          = 6e-4      # peak learning rate
WD          = 0.1       # AdamW weight decay
EVAL_FREQ   = 500       # evaluate every this many optimizer steps
EVAL_ITER   = 10        # batches per evaluation pass
MAX_LINES   = None      # None = read the whole corpus
USE_BF16    = True      # enable bfloat16 autocast

device = torch.device('cuda')
print(
    f'Device: {device}',
    f'Model:  {MODEL_SIZE}',
    f'Batch:  {BATCH_SIZE} x {GRAD_ACCUM} accum = {BATCH_SIZE*GRAD_ACCUM} etkin',
    f'Ctx:    {CONTEXT_LEN}',
    sep='\n',
)

In [None]:
# Tokenizer: built from the 'dbmdz/bert-base-turkish-cased' identifier
# (presumably a Hugging Face model id; confirm in luna.tokenizer).
tokenizer = PretrainedTurkishTokenizer('dbmdz/bert-base-turkish-cased')
# The vocabulary size is reused as the model's 'vocab_size' when it is built.
vocab_size = tokenizer.vocab_size
print(f'‚úÖ Vocab: {vocab_size:,}')

In [None]:
# Load the corpus and split it into train / validation text
corpus_path = CORPUS_LOCAL
print('Corpus y√ºkleniyor...')

lines = []
with open(corpus_path, 'r', encoding='utf-8') as f:
    for line_no, raw_line in enumerate(f, start=1):
        # A falsy MAX_LINES (None or 0) means "no limit".
        if MAX_LINES and line_no > MAX_LINES:
            break
        text = raw_line.strip()
        if text:  # drop blank lines
            lines.append(text)
        if line_no % 500_000 == 0:
            print(f'  {line_no:,} satƒ±r...')

print(f'‚úÖ {len(lines):,} satƒ±r')

# Fixed seed so the shuffle, and therefore the split, is repeatable.
random.seed(42)
random.shuffle(lines)

# First 95% of the shuffled lines go to training, the rest to validation.
split = int(0.95 * len(lines))
train_text = '\n'.join(lines[:split])
val_text = '\n'.join(lines[split:])
del lines  # the joined strings are all that is needed from here on

print(f'Train: {len(train_text)/1e9:.2f}B karakter')
print(f'Val:   {len(val_text)/1e6:.1f}M karakter')

In [None]:
# DataLoaders
print('Tokenize ediliyor (corpus b√ºy√ºk, ~20-30 dk)...')

# Settings shared by both loaders. stride == max_length presumably yields
# non-overlapping windows; confirm in luna.data.
loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    max_length=CONTEXT_LEN,
    stride=CONTEXT_LEN,
)

# Only the training loader is shuffled.
train_loader = create_dataloader_pretrained(
    train_text, tokenizer, shuffle=True, **loader_kwargs
)
val_loader = create_dataloader_pretrained(
    val_text, tokenizer, shuffle=False, **loader_kwargs
)

print(f'‚úÖ Train: {len(train_loader):,} batch')
print(f'‚úÖ Val:   {len(val_loader):,} batch')

In [None]:
# Build the model
cfg = MODEL_CONFIGS[MODEL_SIZE]
model_config = {
    'vocab_size':     vocab_size,
    'context_length': CONTEXT_LEN,
    'emb_dim':        cfg['emb_dim'],
    'n_heads':        cfg['n_heads'],
    'n_layers':       cfg['n_layers'],
    'drop_rate':      0.1,
    'qkv_bias':       False,
}

# Keep the parameters in FP32 even when USE_BF16 is set. The training loop
# already runs the forward pass under torch.amp.autocast(dtype=bfloat16),
# which is the mechanism that provides BF16 compute. Casting the weights
# themselves to bfloat16 as well makes AdamW apply its updates to bf16
# parameters, where small updates are rounded away; the PyTorch AMP docs
# advise against calling .half()/.bfloat16() on a model used with autocast.
model = GPTModel(model_config).to(device)

# torch.compile is intentionally not used: it was the source of the OOM.

total = sum(p.numel() for p in model.parameters())
print(f'‚úÖ Model: {total/1e6:.1f}M parametre')
print(f'   emb={cfg["emb_dim"]}, heads={cfg["n_heads"]}, layers={cfg["n_layers"]}')

# Report current GPU memory usage
allocated = torch.cuda.memory_allocated() / 1024**3
reserved  = torch.cuda.memory_reserved()  / 1024**3
print(f'   VRAM: {allocated:.1f}GB allocated, {reserved:.1f}GB reserved')

In [None]:
# Optimizer and schedule length
adamw_kwargs = dict(
    lr=LR,
    weight_decay=WD,
    betas=(0.9, 0.95),
    eps=1e-8,
    fused=True,
)
optimizer = torch.optim.AdamW(model.parameters(), **adamw_kwargs)

# One optimizer step is taken per GRAD_ACCUM micro-batches, so the schedule
# is measured in optimizer steps rather than in batches.
optimizer_steps_per_epoch = len(train_loader) // GRAD_ACCUM
total_steps  = optimizer_steps_per_epoch * NUM_EPOCHS
warmup_steps = int(total_steps * 0.02)  # 2% of the run is linear warmup

def lr_lambda(step):
    """Return the learning-rate multiplier for optimizer step ``step``.

    The multiplier rises linearly from 0 to 1 over ``warmup_steps`` steps,
    then follows a cosine decay from 1.0 down to a floor of 0.1 at
    ``total_steps``. Both limits are read from module-level globals.

    Steps beyond ``total_steps`` stay at the 0.1 floor. Without the clamp the
    cosine would start rising again, e.g. when the training cell is re-run
    without re-creating the scheduler.
    """
    if step < warmup_steps:
        return step / max(warmup_steps, 1)
    decay_steps = max(total_steps - warmup_steps, 1)
    # Clamp so the schedule never runs past the end of the cosine half-period.
    progress = min((step - warmup_steps) / decay_steps, 1.0)
    return 0.1 + 0.9 * 0.5 * (1.0 + math.cos(math.pi * progress))

# LambdaLR multiplies the optimizer's base LR by lr_lambda(step).
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)

print('‚úÖ Optimizer hazƒ±r')
print(f'   Total steps: {total_steps:,} ({warmup_steps} warmup)')

In [None]:
# Output directory for this run (timestamped, on Drive)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
save_dir  = f'{DRIVE_DIR}/pretrain_large_{timestamp}'
os.makedirs(save_dir, exist_ok=True)

# Snapshot the hyper-parameters next to the checkpoints.
training_config = {
    'model_size': MODEL_SIZE,
    'batch_size': BATCH_SIZE,
    'grad_accum': GRAD_ACCUM,
    'effective_batch': BATCH_SIZE * GRAD_ACCUM,
    'num_epochs': NUM_EPOCHS,
    'learning_rate': LR,
    'context_length': CONTEXT_LEN,
    'use_bf16': USE_BF16,
}
run_config = {
    'model_config': model_config,
    'training_config': training_config,
    'tokenizer': 'dbmdz/bert-base-turkish-cased',
    'timestamp': timestamp,
}
with open(f'{save_dir}/config.json', 'w') as f:
    json.dump(run_config, f, indent=2)

print(f'‚úÖ Save dir: {save_dir}')

In [None]:
# PRETRAIN LOOP: gradient accumulation + BF16 autocast
# Uses torch.amp.autocast('cuda', ...) rather than the deprecated
# torch.cuda.amp.autocast.

@torch.no_grad()
def _estimate_loss(loader, max_batches):
    """Return the mean cross-entropy over at most ``max_batches`` of ``loader``.

    The mean is taken over the batches actually evaluated, so a loader with
    fewer than ``max_batches`` batches is not under-reported. Returns NaN
    when the loader yields nothing; NaN never compares as a new best loss.
    The caller is responsible for switching the model to eval mode and back.
    """
    loss_sum, n_batches = 0.0, 0
    for inputs, targets in loader:
        if n_batches >= max_batches:
            break
        with torch.amp.autocast('cuda', dtype=torch.bfloat16, enabled=USE_BF16):
            logits = model(inputs.to(device))
            loss_sum += torch.nn.functional.cross_entropy(
                logits.flatten(0, 1), targets.to(device).flatten()
            ).item()
        n_batches += 1
    return loss_sum / n_batches if n_batches else float('nan')


train_losses, val_losses, tokens_log = [], [], []
tokens_seen   = 0
global_step   = 0
best_val_loss = float('inf')

print('='*60)
print('PRETRAIN BA≈ûLIYOR')
print(f'Grad Accum: {GRAD_ACCUM} ‚Üí Etkin batch: {BATCH_SIZE*GRAD_ACCUM}')
print('='*60)

for epoch in range(NUM_EPOCHS):
    model.train()
    ep_start = time.time()
    # Also discards gradients of any incomplete accumulation group left over
    # from the previous epoch.
    optimizer.zero_grad()
    # Tokens of micro-batches whose gradients are accumulated but not yet
    # applied by an optimizer step.
    pending_tokens = 0

    for batch_idx, (input_batch, target_batch) in enumerate(train_loader):
        input_batch  = input_batch.to(device)
        target_batch = target_batch.to(device)

        with torch.amp.autocast('cuda', dtype=torch.bfloat16, enabled=USE_BF16):
            logits = model(input_batch)
            loss = torch.nn.functional.cross_entropy(
                logits.flatten(0, 1), target_batch.flatten()
            )
            # Scale so the accumulated gradient is the mean over the group.
            loss = loss / GRAD_ACCUM

        loss.backward()
        pending_tokens += input_batch.numel()

        # Keep accumulating until a full group of GRAD_ACCUM micro-batches.
        if (batch_idx + 1) % GRAD_ACCUM != 0:
            continue

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        global_step += 1
        # Count the real micro-batch sizes rather than assuming every
        # micro-batch is as large as the last one in the group.
        tokens_seen += pending_tokens
        pending_tokens = 0

        if global_step % EVAL_FREQ != 0:
            continue

        # Periodic evaluation on a few train and validation batches
        model.eval()
        tl = _estimate_loss(train_loader, EVAL_ITER)
        vl = _estimate_loss(val_loader, EVAL_ITER)
        model.train()

        train_losses.append(tl)
        val_losses.append(vl)
        tokens_log.append(tokens_seen)

        lr = optimizer.param_groups[0]['lr']
        mem = torch.cuda.memory_allocated() / 1024**3
        print(f'Ep {epoch+1} | Step {global_step:,} | '
              f'Train: {tl:.4f} | Val: {vl:.4f} | '
              f'LR: {lr:.2e} | Tokens: {tokens_seen/1e6:.0f}M | VRAM: {mem:.1f}GB')

        # Keep only the checkpoint with the lowest validation loss so far.
        if vl < best_val_loss:
            best_val_loss = vl
            torch.save({
                'model_state_dict': model.state_dict(),
                'epoch': epoch, 'step': global_step,
                'val_loss': vl, 'tokens_seen': tokens_seen,
            }, f'{save_dir}/best_model.pt')
            print(f'  ‚úÖ Best! Val: {vl:.4f}')

    ep_time = (time.time() - ep_start) / 60
    print(f'\nEpoch {epoch+1} tamamlandƒ± ({ep_time:.0f} dk)\n')
    # End-of-epoch snapshot, saved regardless of validation loss.
    torch.save({'model_state_dict': model.state_dict(), 'epoch': epoch},
               f'{save_dir}/epoch_{epoch+1}.pt')

print(f'‚úÖ PRETRAIN TAMAMLANDI! Best val: {best_val_loss:.4f}')

In [None]:
# Loss curve: train and validation loss against tokens seen
import matplotlib.pyplot as plt

plot_path = f'{save_dir}/pretrain_loss.png'

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(tokens_log, train_losses, label='Train')
ax.plot(tokens_log, val_losses, label='Val', linestyle='--')
ax.set(
    xlabel='Tokens Seen',
    ylabel='Loss',
    title='Luna-LM Large ‚Äî Pretrain Loss',
)
ax.legend()
ax.grid(alpha=0.3)
fig.tight_layout()
# Save before show() so the figure is written to the run directory first.
fig.savefig(plot_path, dpi=150)
plt.show()
print(f'‚úÖ Grafik kaydedildi: {plot_path}')

---
## SFT — Alpaca Turkish (82K)

In [None]:
# Download the SFT dataset and convert it to JSONL
from datasets import load_dataset
import json
import os

SFT_PATH = '/content/Luna-LM/sft/sft_dataset.jsonl'

if os.path.exists(SFT_PATH):
    print(f'‚úÖ SFT dataset mevcut: {SFT_PATH}')
else:
    print('SFT dataset indiriliyor...')
    ds = load_dataset('cenfis/alpaca-turkish-combined')
    os.makedirs(os.path.dirname(SFT_PATH), exist_ok=True)

    # Write to a temporary file and rename it at the end. An interrupted run
    # then never leaves a truncated SFT_PATH behind, which the exists() check
    # above would otherwise accept as complete on the next run.
    tmp_path = SFT_PATH + '.tmp'
    valid = 0
    with open(tmp_path, 'w', encoding='utf-8') as f:
        for row in ds['train']:
            # Treat a missing (None) instruction as empty instead of crashing
            # on the concatenation below.
            user = row['instruction'] or ''
            if row.get('input'):
                user += f"\n{row['input']}"
            # Keep only rows that have both a prompt and an answer.
            if user and row['output']:
                f.write(json.dumps(
                    {'user': user, 'assistant': row['output']},
                    ensure_ascii=False
                ) + '\n')
                valid += 1
    os.replace(tmp_path, SFT_PATH)
    print(f'‚úÖ SFT: {valid:,} √∂rnek ‚Üí {SFT_PATH}')

In [None]:
# Patch sft/train_sft.py in place for the H100 run by overriding the
# constants hard-coded in the script.
import re

sft_script = '/content/Luna-LM/sft/train_sft.py'
with open(sft_script, 'r', encoding='utf-8') as f:
    content = f.read()

# Constant name -> value to force in the script
overrides = {'BATCH_SIZE': 8, 'EVAL_FREQ': 200}

missing = []
for const_name, const_value in overrides.items():
    # \b stops the pattern from also rewriting longer names that merely end
    # in the same suffix (e.g. EVAL_BATCH_SIZE).
    content, n_subs = re.subn(
        rf'\b{const_name}\s*=\s*\d+', f'{const_name} = {const_value}', content
    )
    if n_subs == 0:
        missing.append(const_name)

with open(sft_script, 'w', encoding='utf-8') as f:
    f.write(content)

if missing:
    # Do not claim success when an override did not match anything.
    print(f'WARNING: no assignment found for {", ".join(missing)} in '
          f'{sft_script}; left unchanged')
else:
    print('‚úÖ train_sft.py g√ºncellendi (BATCH=8, EVAL_FREQ=200)')

In [None]:
# Run SFT training. The script path is relative, so this relies on the
# working directory being the repo root.
!python sft/train_sft.py

In [None]:
# Copy the results to Drive
import glob
import shutil

# SFT checkpoint directories; sorted by name so the last entry is the one
# that sorts highest.
sft_dirs = sorted(glob.glob('/content/Luna-LM/checkpoints/sft_*'))
if sft_dirs:
    latest_run = sft_dirs[-1]
    dest = os.path.join(DRIVE_DIR, os.path.basename(latest_run))
    # dirs_exist_ok lets a re-run copy over an earlier copy of the same run.
    shutil.copytree(latest_run, dest, dirs_exist_ok=True)
    print(f'‚úÖ SFT Drive\'a kopyalandƒ±: {dest}')

print('‚úÖ Tamamlandƒ±!')

In [None]:
# Quick smoke test of the fine-tuned model
from luna.utils import load_model
from luna.generate import generate_text
import glob

# System prompt (Turkish) that chat() places inside the <system> tag of every prompt.
SYSTEM = ('Senin adƒ±n Luna. Amacƒ±n insanlara yardƒ±mcƒ± olmak ve sorulara a√ßƒ±k, '
          'anla≈üƒ±lƒ±r cevaplar vermektir.')

def chat(model, tok, dev, q):
    """Ask the model question ``q`` and return only its answer text.

    Builds a ``<system>/<user>/<assistant>`` prompt around ``q``, generates a
    continuation with ``generate_text``, and returns what follows the last
    ``<assistant>`` tag, cut at the first stop marker and stripped. If the
    output contains no ``<assistant>`` tag it is returned unchanged.
    """
    prompt = f'<system>{SYSTEM}</system>\n<user>{q}</user>\n<assistant>'
    out = generate_text(model, tok, dev, prompt,
                        max_new_tokens=150, temperature=0.7,
                        top_k=50, repetition_penalty=1.2)

    if '<assistant>' not in out:
        return out

    # Text after the last <assistant> tag
    reply = out.rpartition('<assistant>')[2]
    # Cut at each marker in turn; a marker that is absent leaves the text as is.
    for stop in ('</assistant>', '<user>', '<system>', '[SEP]'):
        reply = reply.partition(stop)[0]
    return reply.strip()

# Load the SFT checkpoint directory that sorts last by name
sft_dirs = sorted(glob.glob('/content/Luna-LM/checkpoints/sft_*'))
if not sft_dirs:
    # Explain what is missing instead of a bare IndexError on sft_dirs[-1].
    raise FileNotFoundError(
        'No SFT checkpoint found under /content/Luna-LM/checkpoints/sft_*; '
        'run the SFT training cell first.'
    )
model_t, tok_t, _ = load_model(sft_dirs[-1], device)

# Ask a few fixed questions and print the answers
for q in ['G√ºne≈ü hangi y√∂nden doƒüar?', 'Yapay zeka nedir?', 'T√ºrkiye\'nin ba≈ükenti?']:
    print(f'‚ùì {q}')
    print(f'ü§ñ {chat(model_t, tok_t, device, q)}\n')