# ============================================================
# 1) ENCODER-ONLY KULLANIMI
# ============================================================

In [None]:
import hyso 

# Toy corpus the tokenizer is fitted on.
train_texts = [
    "Merhaba dünya!",
    "Encoder only testi.",
]

# Build a tokenizer from a config dict and fit it on the toy corpus.
tok = hyso.build_tokenizer({"type": "bpe", "cache_size": 10000})
tok.fit(train_texts)

# The vocabulary size is read after fitting and handed to the encoder model.
vocab_size = tok.vocab_size
enc = hyso.HysoEncoder(
    vocab_size=vocab_size,
    d_model=512,
    n_heads=8,
)

# Texts to encode; they differ from the texts the tokenizer was fitted on.
enc_texts = [
    "Bugün hava çok güzel.",
    "Hyso encoder test.",
]

# The call returns a pair, unpacked here as token ids and a mask.
# NOTE(review): presumably pad_to_multiple_of rounds the padded length up to
# a multiple of 8 - confirm against the tokenizer implementation.
enc_ids, enc_mask = tok.encode_for_encoder(enc_texts, max_len=128, pad_to_multiple_of=8)

# ============================================================
# 2) DECODER-ONLY (CAUSAL LM) KULLANIMI
# ============================================================

In [None]:
# A fresh tokenizer is built and fitted here. `hyso` and `train_texts` are not
# defined in this cell, so an earlier cell must have run first.
tok = hyso.build_tokenizer({"type": "bpe", "cache_size": 10000})
tok.fit(train_texts)

# Size the decoder-only model from the fitted tokenizer's vocabulary.
vocab_size = tok.vocab_size
dec = hyso.HysoDecoder(
    vocab_size=vocab_size,
    d_model=768,
    n_heads=12,
)

# Texts to encode for the language-model demo.
lm_texts = [
    "Merhaba dünya!",
    "Hyso decoder only testi.",
]

# The call returns a pair, unpacked here as token ids and a mask.
lm_ids, lm_mask = tok.encode_for_decoder_lm(lm_texts, max_len=128, pad_to_multiple_of=8)

# ============================================================
# 3) ENCODER–DECODER (SEQ2SEQ) KULLANIMI
# ============================================================

In [3]:
# This cell uses the "simple" tokenizer type. `hyso` and `train_texts` are not
# defined in this cell, so an earlier cell must have run first.
tok = hyso.build_tokenizer({"type": "simple", "cache_size": 10000})
tok.fit(train_texts)

# One tokenizer serves both sides, so source and target vocabularies share
# the same size.
vocab_size = tok.vocab_size
ed = hyso.HysoLLM(
    vocab_src=vocab_size,
    vocab_tgt=vocab_size,
    d_model=512,
    num_layers_dec=8,
    num_layers_enc=6,
)

# A single source/target example pair.
src_texts = ["Merhaba dünya!"]
tgt_texts = ["Hello world!"]

# Source and target go through separate encode calls; each returns a pair,
# unpacked here as token ids and a mask.
src_ids, src_mask = tok.encode_src_seq2seq(src_texts, max_len=128, pad_to_multiple_of=8)
tgt_ids, tgt_mask = tok.encode_tgt_seq2seq(tgt_texts, max_len=128, pad_to_multiple_of=8)

----

In [1]:
import sys
sys.path.append(r"C:\Users\hdgn5\OneDrive\Masaüstü\hysollm")

import torch
import hyso
from hyso.core.train import HysoTrainer, HysoCallbacks, LLMetrics


In [2]:
import sys
sys.path.append(r"C:\Users\hdgn5\OneDrive\Masaüstü\hysollm")

import torch
from torch.utils.data import Dataset, DataLoader

import hyso
from hyso.core.train.trainer import HysoTrainer
from hyso.core.train.metrics import LLMetrics
from hyso.core.train.callbacks import HysoCallbacks
from hyso.core.tokenizer.bpe_tokenizer import HysoBPETokenizer

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Four (source, target) sentence pairs repeated 64 times: 256 examples in all.
pairs = 64 * [
    ("merhaba dünya", "hello world"),
    ("nasılsın", "how are you"),
    ("bugün hava güzel", "the weather is nice today"),
    ("kütüphane dolu", "the library is full"),
]

tok = HysoBPETokenizer(lowercase=False, normalize="NFKC", cache_size=10000)

# Flatten the pairs into one list of texts: source, then target, per pair.
corpus = [text for pair in pairs for text in pair]

# NOTE(review): the recorded run of this notebook prints src_ids with a
# sequence length of 1. Presumably fitting on 64 copies of the same eight
# sentences merges each sentence into a single token - confirm this is the
# intended tokenization for the demo.
tok.fit(corpus, target_vocab_size=4096, min_pair_freq=2)

print("vocab_size:", tok.vocab_size)

class DummySeq2SeqDataset(Dataset):
    """Map-style dataset over an in-memory sequence of (source, target) pairs."""

    def __init__(self, pairs):
        # The sequence is stored by reference; no copy is made.
        self.pairs = pairs

    def __len__(self):
        """Return how many pairs the dataset holds."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as a dict with ``"src"`` and ``"tgt"`` keys."""
        source_text, target_text = self.pairs[idx]
        return dict(src=source_text, tgt=target_text)

def make_seq2seq_collate(tokenizer, max_src_len=64, max_tgt_len=64,
                         pad_to_multiple_of=None, ignore_index=-100):
    """Build a collate function that turns text-pair items into a batch dict.

    Args:
        tokenizer: Object exposing ``pad_id``, ``encode_src_seq2seq`` and
            ``encode_tgt_seq2seq``. Each encode call must return a pair,
            used here as ``(ids, mask)``.
        max_src_len: Forwarded as ``max_len`` to the source encode call.
        max_tgt_len: Forwarded as ``max_len`` to the target encode call.
        pad_to_multiple_of: Forwarded unchanged to both encode calls.
        ignore_index: Value written into label positions that are masked out.

    Returns:
        A function that takes a list of ``{"src": ..., "tgt": ...}`` items and
        returns a dict with the keys ``src_ids``, ``src_mask``, ``tgt_ids``,
        ``tgt_mask`` and ``labels``.
    """
    # Read once when the factory runs; later changes to tokenizer.pad_id are
    # not picked up by the returned function.
    pad_id = tokenizer.pad_id

    def _labels_from(target_ids):
        # Work on a copy so the tgt_ids returned in the batch stay untouched.
        masked = target_ids.clone()
        # Every position whose id equals pad_id is masked out.
        masked[masked == pad_id] = ignore_index
        # The first column of every row is masked out as well.
        # NOTE(review): presumably this is the start-of-sequence token -
        # confirm against encode_tgt_seq2seq.
        masked[:, 0] = ignore_index
        return masked

    def collate(batch):
        sources = [example["src"] for example in batch]
        targets = [example["tgt"] for example in batch]

        src_ids, src_mask = tokenizer.encode_src_seq2seq(
            sources, max_len=max_src_len, pad_to_multiple_of=pad_to_multiple_of
        )
        tgt_ids, tgt_mask = tokenizer.encode_tgt_seq2seq(
            targets, max_len=max_tgt_len, pad_to_multiple_of=pad_to_multiple_of
        )

        return dict(
            src_ids=src_ids,
            src_mask=src_mask,
            tgt_ids=tgt_ids,
            tgt_mask=tgt_mask,
            labels=_labels_from(tgt_ids),
        )

    return collate

# One value each for settings that several components below must agree on.
IGNORE_INDEX = -100
SAVE_DIR = "checkpoints_llm"

# First 200 pairs are used for training, the next 40 for validation.
train_ds = DummySeq2SeqDataset(pairs[:200])
val_ds = DummySeq2SeqDataset(pairs[200:240])

collate_fn = make_seq2seq_collate(
    tokenizer=tok,
    max_src_len=64,
    max_tgt_len=64,
    pad_to_multiple_of=None,
    ignore_index=IGNORE_INDEX,
)

# Only the training loader shuffles.
train_loader = DataLoader(train_ds, batch_size=8, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=8, shuffle=False, collate_fn=collate_fn)

# Pull a single batch and print the shape of every entry as a sanity check.
batch = next(iter(train_loader))
print({name: tensor.shape for name, tensor in batch.items()})

# Source and target share one tokenizer, so both vocabularies have its size.
model = hyso.HysoLLM(
    vocab_src=tok.vocab_size,
    vocab_tgt=tok.vocab_size,
    d_model=256,
    n_heads=8,
)
model = model.to(device)

epochs = 10

# total_steps is the number of batches per epoch times the number of epochs.
callbacks = HysoCallbacks.default(
    total_steps=len(train_loader) * epochs,
    save_dir=SAVE_DIR,
)

metrics = LLMetrics(
    bleu=False,
    perplexity=True,
    token_accuracy=True,
    token_ignore_index=IGNORE_INDEX,
)

# The trainer receives the same ignore index and save directory that were
# given to the collate function, the metrics and the callbacks above.
trainer = HysoTrainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    mode="seq2seq",
    tokenizer=tok,
    epochs=epochs,
    lr=3e-4,
    weight_decay=0.0,
    optimizer="adamw",
    scheduler="none",
    warmup_ratio=0.0,
    use_amp=False,
    grad_accum_steps=1,
    callbacks=callbacks,
    metrics=metrics,
    ignore_index=IGNORE_INDEX,
    save_dir=SAVE_DIR,
    log_interval=10,
)

# fit() returns a mapping; the two entries read here are printed as-is.
result = trainer.fit()
print("train_loss:", result["train_loss"])
print("val_metrics:", result["val_metrics"])


vocab_size: 355
{'src_ids': torch.Size([8, 1]), 'src_mask': torch.Size([8, 1]), 'tgt_ids': torch.Size([8, 3]), 'tgt_mask': torch.Size([8, 3]), 'labels': torch.Size([8, 3])}
Epoch 1/10 [██████████████████████████████] 100.0% loss=6.0200 epoch_eta=0.0m total_elapsed=0.1m
Epoch 1/10 finished in 0.14 minutes
Epoch 2/10 [██████████████████████████████] 100.0% loss=6.1507 epoch_eta=0.0m total_elapsed=0.3m
Epoch 2/10 finished in 0.14 minutes
Epoch 3/10 [██████████████████████████████] 100.0% loss=6.0746 epoch_eta=0.0m total_elapsed=0.4m
Epoch 3/10 finished in 0.14 minutes
Early stopping triggered.
Training finished in 0.42 minutes, total_steps=75
train_loss: 6.207448501586914
val_metrics: {'val_loss': 6.380901336669922, 'val_perplexity': 590.45967083693, 'val_token_accuracy': 0.0}
