# Deep Past - Two-Stage ByT5 Training

**Stage 1:** Train on ALL Akkadian data (general knowledge)  
**Stage 2:** Finetune on Old Assyrian only (specialization)

In [None]:
!pip install -q sacrebleu

import warnings
warnings.simplefilter("ignore")

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, T5ForConditionalGeneration
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import re
import math
from pathlib import Path
from sklearn.model_selection import train_test_split
import sacrebleu

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {DEVICE}")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/100.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDevice: cuda


In [None]:
# ===========================================
# SETTINGS
# ===========================================

# Model
MODEL_NAME = "google/byt5-small"
STAGE1_SAVE = "./byt5_akkadian_small"
STAGE2_SAVE = "./byt5_akkadian_small_final"

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Data paths (Colab/Drive)
DATA_DIR = "/content/drive/MyDrive/kaggle/deep-past-data"
KAGGLE_PATH = f"{DATA_DIR}/train.csv"
EXTRACTED_PATH = f"{DATA_DIR}/akk_pairs_final.csv"
AKKADEMIA_PATH = f"{DATA_DIR}/akkademia"
ORACC_PATH = f"{DATA_DIR}/oracc_akkadian_english.csv"
LEXICON_PATH = f"{DATA_DIR}/OA_Lexicon_eBL.csv"

# Data switches
USE_KAGGLE = True
USE_EXTRACTED = False
USE_AKKADEMIA = True
USE_ORACC = False
USE_LEXICON = True

# ORACC period filter for Stage 2
ORACC_OLD_ASSYRIAN_ONLY = ['Old Assyrian']

# Stage 1: General Akkadian
STAGE1_EPOCHS = 10
STAGE1_LR = 2e-4

# Stage 2: Old Assyrian specialization
STAGE2_EPOCHS = 5
STAGE2_LR = 5e-5

# Common settings
MAX_LEN = 1024
MAX_SOURCE_LEN = 1024  # Akkadian input (longer)
MAX_TARGET_LEN = 1024   # English output (usually shorter)
BATCH_SIZE = 20
GRAD_ACC = 2
PATIENCE = 3
BEAM_WIDTH = 4
REP_PENALTY = 1.2


SEED = 42
TEST_SIZE = 0.1
PREFIX = "translate Akkadian to English: "
LOG_EVERY = 50
EVAL_EVERY = 200


EVAL_BEAM_WIDTH = 2      # Greedy for speed
EVAL_MAX_LEN = 1024
EVAL_BATCH_SIZE = 8
MAX_EVAL_SAMPLES = 300   # Don't eval on full val set

torch.manual_seed(SEED)
np.random.seed(SEED)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ===========================================
# LEXICON
# ===========================================

PN_NAMES = set()
GN_NAMES = set()

if USE_LEXICON:
    lex = pd.read_csv(LEXICON_PATH)
    PN_NAMES = set(lex[lex['type'] == 'PN']['form'].str.lower().dropna())
    GN_NAMES = set(lex[lex['type'] == 'GN']['form'].str.lower().dropna())
    print(f"Lexicon loaded: {len(PN_NAMES)} person names, {len(GN_NAMES)} place names")

Lexicon loaded: 13046 person names, 328 place names


In [None]:
# ===========================================
# PREPROCESSING
# ===========================================

ASCII_TO_DIACRITIC = {
    "sz": "š", "SZ": "Š", "Sz": "Š", "sh": "š", "SH": "Š", "Sh": "Š",
    "s,": "ṣ", "S,": "Ṣ", "t,": "ṭ", "T,": "Ṭ", "z,": "ẓ", "Z,": "Ẓ",
    ".s": "ṣ", ".S": "Ṣ", ".t": "ṭ", ".T": "Ṭ", ".z": "ẓ", ".Z": "Ẓ",
    "h,": "ḫ", "H,": "Ḫ", ".h": "ḫ", ".H": "Ḫ", "hh": "ḫ", "HH": "Ḫ",
    "s2": "š", "S2": "Š", "s3": "ś", "S3": "Ś",
    "a2": "á", "a3": "à", "e2": "é", "e3": "è",
    "i2": "í", "i3": "ì", "u2": "ú", "u3": "ù",
}

SUBSCRIPTS = {'₀':'0', '₁':'1', '₂':'2', '₃':'3', '₄':'4',
              '₅':'5', '₆':'6', '₇':'7', '₈':'8', '₉':'9', 'ₓ':'x'}

def normalize_ascii(text):
    if not text:
        return text
    for k, v in sorted(ASCII_TO_DIACRITIC.items(), key=lambda x: -len(x[0])):
        text = text.replace(k, v)
    for k, v in SUBSCRIPTS.items():
        text = text.replace(k, v)
    return text

def normalize_gaps(text):
    if not text:
        return text
    tokens = text.split()
    result = []
    i = 0
    while i < len(tokens):
        if tokens[i].lower() == "x":
            count = 1
            while i + count < len(tokens) and tokens[i + count].lower() == "x":
                count += 1
            result.append("<gap>" if count == 1 else "<big_gap>")
            i += count
        else:
            t = tokens[i]
            if t.lower().startswith("x-"):
                t = "<gap>" + t[1:]
            elif t.lower().endswith("-x"):
                t = t[:-1] + "-<gap>"
            result.append(t)
            i += 1
    text = " ".join(result)
    text = re.sub(r"(<gap>\s*){2,}", "<big_gap> ", text)
    text = re.sub(r"\.\.\.+", " <big_gap> ", text)
    return text.strip()

def tag_names(text):
    if not USE_LEXICON or not text:
        return text
    words = text.split()
    result = []
    for w in words:
        key = w.replace("-", "").lower()
        if key in PN_NAMES:
            result.append(f"[PN]{w}[/PN]")
        elif key in GN_NAMES:
            result.append(f"[GN]{w}[/GN]")
        else:
            result.append(w)
    return " ".join(result)

def clean_akkadian(text):
    if pd.isna(text) or not str(text).strip():
        return ""
    text = str(text)
    text = text.replace("!", "").replace("?", "")
    text = re.sub(r"[˹˺]", "", text)
    text = re.sub(r"\[([^\]]+)\]", r"\1", text)
    text = normalize_ascii(text)
    text = normalize_gaps(text)
    text = tag_names(text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def clean_english(text):
    if pd.isna(text) or not str(text).strip():
        return ""
    text = str(text)
    text = normalize_gaps(text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Test
print("Test: 'a-na A-szur' ->", clean_akkadian("a-na A-szur"))

Test: 'a-na A-szur' -> a-na A-šur


In [None]:
# ===========================================
# LOAD DATA
# ===========================================

# We'll load into two lists:
# - old_assyrian_data: For Stage 2 (and included in Stage 1)
# - other_akkadian_data: For Stage 1 only

old_assyrian_data = []
other_akkadian_data = []

# Kaggle data (Old Assyrian)
if USE_KAGGLE:
    df = pd.read_csv(KAGGLE_PATH)
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Kaggle"):
        src = clean_akkadian(row['transliteration'])
        tgt = clean_english(row['translation'])
        if src and tgt and len(src) > 10 and len(tgt) > 10:
            old_assyrian_data.append({'source': src, 'target': tgt})
    print(f"Kaggle (Old Assyrian): {len(old_assyrian_data)} pairs")

# Extracted data (Old Assyrian)
if USE_EXTRACTED:
    df = pd.read_csv(EXTRACTED_PATH)
    count_before = len(old_assyrian_data)
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Extracted"):
        src = clean_akkadian(row['translit_clean'])
        tgt = clean_english(row['translation_en'])
        if src and tgt and len(src) > 10 and len(tgt) > 10:
            old_assyrian_data.append({'source': src, 'target': tgt})
    print(f"Extracted (Old Assyrian): {len(old_assyrian_data) - count_before} pairs")

# Akkademia data (mixed periods - goes to other)
if USE_AKKADEMIA:
    for split in ['train', 'valid', 'test']:
        translits = open(f"{AKKADEMIA_PATH}/{split}.tr").read().splitlines()
        transls = open(f"{AKKADEMIA_PATH}/{split}.en").read().splitlines()
        for tr, en in zip(translits, transls):
            src = clean_akkadian(tr)
            tgt = clean_english(en)
            if src and tgt and len(src) > 10 and len(tgt) > 10:
                other_akkadian_data.append({'source': src, 'target': tgt})
    print(f"Akkademia (other periods): {len(other_akkadian_data)} pairs")

# ORACC data (has period tags!)
if USE_ORACC:
    df = pd.read_csv(ORACC_PATH)
    print(f"ORACC periods available: {df['period'].unique().tolist()}")

    oracc_oa_count = 0
    oracc_other_count = 0

    for _, row in tqdm(df.iterrows(), total=len(df), desc="ORACC"):
        src = clean_akkadian(row['transliteration'])
        tgt = clean_english(row['translation'])
        period = row.get('period', '')

        if src and tgt and len(src) > 10 and len(tgt) > 10:
            # Check if Old Assyrian
            if ORACC_OLD_ASSYRIAN_ONLY and period in ORACC_OLD_ASSYRIAN_ONLY:
                old_assyrian_data.append({'source': src, 'target': tgt})
                oracc_oa_count += 1
            else:
                other_akkadian_data.append({'source': src, 'target': tgt})
                oracc_other_count += 1

    print(f"ORACC Old Assyrian: {oracc_oa_count} pairs")
    print(f"ORACC other periods: {oracc_other_count} pairs")

print(f"\n" + "="*50)
print(f"Old Assyrian total: {len(old_assyrian_data)} pairs")
print(f"Other Akkadian total: {len(other_akkadian_data)} pairs")
print(f"="*50)

Kaggle:   0%|          | 0/1561 [00:00<?, ?it/s]

Kaggle (Old Assyrian): 1560 pairs
Akkademia (other periods): 46785 pairs

Old Assyrian total: 1560 pairs
Other Akkadian total: 46785 pairs


In [None]:

# ===========================================
# PREPARE DATASETS
# ===========================================

# Stage 1: All data combined
stage1_data = old_assyrian_data + other_akkadian_data
stage1_df = pd.DataFrame(stage1_data).drop_duplicates(subset=['source', 'target']).reset_index(drop=True)

# Stage 2: Old Assyrian only
stage2_df = pd.DataFrame(old_assyrian_data).drop_duplicates(subset=['source', 'target']).reset_index(drop=True)

# Splits
stage1_train, stage1_val = train_test_split(stage1_df, test_size=TEST_SIZE, random_state=SEED)
stage2_train, stage2_val = train_test_split(stage2_df, test_size=TEST_SIZE, random_state=SEED)

stage1_train = stage1_train.reset_index(drop=True)
stage1_val = stage1_val.reset_index(drop=True)
stage2_train = stage2_train.reset_index(drop=True)
stage2_val = stage2_val.reset_index(drop=True)

print(f"Stage 1 (All Akkadian): {len(stage1_train)} train, {len(stage1_val)} val")
print(f"Stage 2 (Old Assyrian): {len(stage2_train)} train, {len(stage2_val)} val")

Stage 1 (All Akkadian): 41200 train, 4578 val
Stage 2 (Old Assyrian): 1404 train, 156 val


In [None]:
# See how long your data actually is
stage1_df['src_bytes'] = stage1_df['source'].apply(lambda x: len(x.encode('utf-8')))
stage1_df['tgt_bytes'] = stage1_df['target'].apply(lambda x: len(x.encode('utf-8')))

print("Source bytes:")
print(stage1_df['src_bytes'].describe())
print(f"\n% over 512: {(stage1_df['src_bytes'] > 512).mean()*100:.1f}%")
print(f"% over 1024: {(stage1_df['src_bytes'] > 1024).mean()*100:.1f}%")

print("\nTarget bytes:")
print(stage1_df['tgt_bytes'].describe())
print(f"\n% over 512: {(stage1_df['tgt_bytes'] > 512).mean()*100:.1f}%")

Source bytes:
count    45778.000000
mean       117.557145
std        159.301287
min         11.000000
25%         29.000000
50%         58.000000
75%        137.000000
max       3139.000000
Name: src_bytes, dtype: float64

% over 512: 3.4%
% over 1024: 0.3%

Target bytes:
count    45778.000000
mean       121.912163
std        183.157269
min         11.000000
25%         28.000000
50%         58.000000
75%        140.000000
max       4257.000000
Name: tgt_bytes, dtype: float64

% over 512: 3.8%


In [None]:
# ===========================================
# MODEL & DATASET CLASS
# ===========================================

print(f"Loading {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(DEVICE)
print(f"Parameters: {model.num_parameters()/1e6:.1f}M")

class AkkDataset(Dataset):
    def __init__(self, df, tokenizer, max_source_len, max_target_len, prefix):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_source_len = max_source_len
        self.max_target_len = max_target_len
        self.prefix = prefix

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        src = self.tokenizer(
            self.prefix + row['source'],
            max_length=self.max_source_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        tgt = self.tokenizer(
            row['target'],
            max_length=self.max_target_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        labels = tgt['input_ids'].squeeze()
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': src['input_ids'].squeeze(),
            'attention_mask': src['attention_mask'].squeeze(),
            'labels': labels
        }

Loading google/byt5-small...


tokenizer_config.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Parameters: 299.6M


In [None]:
# ===========================================
# TRAINING FUNCTIONS
# ===========================================

@torch.no_grad()
def compute_metrics(model, loader, tokenizer, max_samples=300):
    """Fast evaluation."""
    model.eval()
    preds, refs = [], []
    samples_seen = 0

    for batch in tqdm(loader, desc="Eval"):
        if samples_seen >= max_samples:
            break

        inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != 'labels'}

        outputs = model.generate(
            **inputs,
            max_length=EVAL_MAX_LEN,
            num_beams=EVAL_BEAM_WIDTH,
            repetition_penalty=REP_PENALTY,
        )

        preds.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

        labels = batch['labels'].clone()
        labels[labels == -100] = tokenizer.pad_token_id
        refs.extend(tokenizer.batch_decode(labels, skip_special_tokens=True))

        samples_seen += len(batch['labels'])

    bleu = sacrebleu.corpus_bleu(preds, [refs]).score
    chrf = sacrebleu.corpus_chrf(preds, [refs], word_order=2).score
    geo_mean = (bleu * chrf) ** 0.5

    return bleu, chrf, geo_mean


def train_stage(model, tokenizer, train_df, val_df, epochs, lr, save_path, stage_name):
    """Train one stage."""

    # Create dataloaders
    train_ds =AkkDataset(train_df, tokenizer, MAX_SOURCE_LEN, MAX_TARGET_LEN, PREFIX)
    val_ds = AkkDataset(val_df, tokenizer, MAX_SOURCE_LEN, MAX_TARGET_LEN, PREFIX)
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

    # Setup
    loss_fn = nn.CrossEntropyLoss(ignore_index=-100, label_smoothing=0.01)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    num_steps = math.ceil(len(train_loader) / GRAD_ACC) * epochs
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_steps, eta_min=0)

    best_geo_mean = 0
    patience_counter = 0
    global_step = 0

    print("\n" + "="*60)
    print(f"{stage_name}")
    print(f"Train: {len(train_df)}, Val: {len(val_df)}, LR: {lr}, Epochs: {epochs}")
    print("="*60)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        optimizer.zero_grad()

        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
        for step, batch in enumerate(pbar):
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            out = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'],
                        labels=batch['labels'], use_cache=False)
            loss = loss_fn(out.logits.view(-1, out.logits.size(-1)), batch['labels'].view(-1))
            loss = loss / GRAD_ACC
            loss.backward()
            total_loss += loss.item()

            if (step + 1) % GRAD_ACC == 0 or (step + 1) == len(train_loader):
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1

                pbar.set_postfix({'loss': f'{loss.item()*GRAD_ACC:.4f}', 'lr': f'{scheduler.get_last_lr()[0]:.2e}'})

        # End of epoch - eval only here
        train_loss = total_loss / len(train_loader)
        bleu, chrf, geo_mean = compute_metrics(model, val_loader, tokenizer)

        print(f"\nEpoch {epoch+1}: loss={train_loss:.4f} | BLEU={bleu:.2f} | chrF++={chrf:.2f} | GeoMean={geo_mean:.2f}")

        if geo_mean > best_geo_mean:
            best_geo_mean = geo_mean
            patience_counter = 0
            model.save_pretrained(save_path)
            tokenizer.save_pretrained(save_path)
            print(f"  ✓ Saved!")
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                print("  Early stopping!")
                break

    print(f"\n{stage_name} complete! Best GeoMean: {best_geo_mean:.2f}")
    return best_geo_mean

In [None]:

# STAGE 1: TRAIN ON ALL AKKADIAN


stage1_best = train_stage(
    model, tokenizer,
    stage1_train, stage1_val,
    epochs=STAGE1_EPOCHS,
    lr=STAGE1_LR,
    save_path=STAGE1_SAVE,
    stage_name="STAGE 1: General Akkadian"
)


STAGE 1: General Akkadian
Train: 44107, Val: 4901, LR: 0.0002, Epochs: 10


Epoch 1/10:   0%|          | 0/2206 [00:00<?, ?it/s]

Eval:   0%|          | 0/246 [00:00<?, ?it/s]


Epoch 1: loss=0.6061 | BLEU=11.21 | chrF++=26.68 | GeoMean=17.29
  ✓ Saved!


Epoch 2/10:   0%|          | 0/2206 [00:00<?, ?it/s]

Eval:   0%|          | 0/246 [00:00<?, ?it/s]


Epoch 2: loss=0.4544 | BLEU=19.50 | chrF++=35.03 | GeoMean=26.13
  ✓ Saved!


Epoch 3/10:   0%|          | 0/2206 [00:00<?, ?it/s]

Eval:   0%|          | 0/246 [00:00<?, ?it/s]


Epoch 3: loss=0.4026 | BLEU=20.68 | chrF++=37.74 | GeoMean=27.94
  ✓ Saved!


Epoch 4/10:   0%|          | 0/2206 [00:00<?, ?it/s]

Eval:   0%|          | 0/246 [00:00<?, ?it/s]


Epoch 4: loss=0.3717 | BLEU=22.90 | chrF++=40.92 | GeoMean=30.61
  ✓ Saved!


Epoch 5/10:   0%|          | 0/2206 [00:00<?, ?it/s]

Eval:   0%|          | 0/246 [00:00<?, ?it/s]


Epoch 5: loss=0.3496 | BLEU=26.82 | chrF++=44.47 | GeoMean=34.54
  ✓ Saved!


Epoch 6/10:   0%|          | 0/2206 [00:00<?, ?it/s]

Eval:   0%|          | 0/246 [00:00<?, ?it/s]


Epoch 6: loss=0.3337 | BLEU=27.58 | chrF++=44.72 | GeoMean=35.12
  ✓ Saved!


Epoch 7/10:   0%|          | 0/2206 [00:00<?, ?it/s]

Eval:   0%|          | 0/246 [00:00<?, ?it/s]


Epoch 7: loss=0.3225 | BLEU=30.99 | chrF++=46.84 | GeoMean=38.10
  ✓ Saved!


Epoch 8/10:   0%|          | 0/2206 [00:00<?, ?it/s]

Eval:   0%|          | 0/246 [00:00<?, ?it/s]


Epoch 8: loss=0.3157 | BLEU=30.71 | chrF++=47.40 | GeoMean=38.16
  ✓ Saved!


Epoch 9/10:   0%|          | 0/2206 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:

# STAGE 2: FINETUNE ON OLD ASSYRIAN


# Load best model from Stage 1
STAGE1_SAVE = "/content/byt5_akkadian_small_final/"
print(f"Loading Stage 1 model from {STAGE1_SAVE}...")
model = T5ForConditionalGeneration.from_pretrained(STAGE1_SAVE).to(DEVICE)



The module name  (originally ) is not a valid Python identifier. Please rename the original module to avoid import issues.


Loading Stage 1 model from /content/byt5_akkadian_small_final/...


In [None]:
stage2_best = train_stage(
    model, tokenizer,
    stage2_train, stage2_val,
    epochs=10,
    lr=STAGE2_LR,
    save_path=STAGE2_SAVE,
    stage_name="STAGE 2: Old Assyrian Specialization"
)


STAGE 2: Old Assyrian Specialization
Train: 1404, Val: 156, LR: 5e-05, Epochs: 10


Epoch 1/10:   0%|          | 0/71 [00:00<?, ?it/s]

Eval:   0%|          | 0/8 [00:00<?, ?it/s]


Epoch 1: loss=0.2804 | BLEU=28.72 | chrF++=48.45 | GeoMean=37.30
  ✓ Saved!


Epoch 2/10:   0%|          | 0/71 [00:00<?, ?it/s]

Eval:   0%|          | 0/8 [00:00<?, ?it/s]


Epoch 2: loss=0.2767 | BLEU=29.34 | chrF++=48.74 | GeoMean=37.82
  ✓ Saved!


Epoch 3/10:   0%|          | 0/71 [00:00<?, ?it/s]

Eval:   0%|          | 0/8 [00:00<?, ?it/s]


Epoch 3: loss=0.2730 | BLEU=29.22 | chrF++=49.19 | GeoMean=37.91
  ✓ Saved!


Epoch 4/10:   0%|          | 0/71 [00:00<?, ?it/s]

Eval:   0%|          | 0/8 [00:00<?, ?it/s]


Epoch 4: loss=0.2702 | BLEU=29.18 | chrF++=49.23 | GeoMean=37.90


Epoch 5/10:   0%|          | 0/71 [00:00<?, ?it/s]

Eval:   0%|          | 0/8 [00:00<?, ?it/s]


Epoch 5: loss=0.2684 | BLEU=28.94 | chrF++=49.58 | GeoMean=37.88


Epoch 6/10:   0%|          | 0/71 [00:00<?, ?it/s]

Eval:   0%|          | 0/8 [00:00<?, ?it/s]


Epoch 6: loss=0.2646 | BLEU=28.50 | chrF++=48.59 | GeoMean=37.21
  Early stopping!

STAGE 2: Old Assyrian Specialization complete! Best GeoMean: 37.91
