# Deep Past - Multi-Architecture 9-Model Ensemble

**Models**: ByT5-small (4) + mT5-small (2) + T5-small (1) + NLLB-200 (2)

Set `MODEL_GROUP` to 1, 2, or 3 to choose which group of three models a run trains (~3h per group); run the notebook once per group to train all nine.

In [None]:
# ========== Configuration ==========
# Selects which group of three model configurations this run trains.
# The value is used as a key into GROUP_MAP, so it must be 1, 2, or 3.
MODEL_GROUP = 1  # 1: M1-M3 (ByT5), 2: M4-M6 (ByT5+mT5), 3: M7-M9 (T5+NLLB)

import os, gc, re, unicodedata
import numpy as np
import pandas as pd
import torch
from pathlib import Path
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
)
import evaluate

In [None]:
# Multi-architecture model configurations.
# Keys are the model ids referenced by GROUP_MAP. Each entry holds:
#   name    - output sub-directory under ./models/
#   model   - Hugging Face Hub checkpoint id to fine-tune
#   epochs  - number of training epochs
#   lr      - learning rate
#   bs / ga - per-device batch size and gradient-accumulation steps
#             (bs * ga == 8 for every entry, i.e. the same effective batch size)
#   bidir   - whether to add the reverse (English -> Akkadian) direction
#   max_len - truncation length in tokens for inputs and targets
CONFIGS = {
    # ByT5-small variants (4 models, ~4h total)
    1: {'name': 'M1_byt5_baseline', 'model': 'google/byt5-small', 'epochs': 20, 'lr': 1e-4, 'bs': 1, 'ga': 8, 'bidir': False, 'max_len': 512},
    2: {'name': 'M2_byt5_bidir', 'model': 'google/byt5-small', 'epochs': 20, 'lr': 1e-4, 'bs': 1, 'ga': 8, 'bidir': True, 'max_len': 512},
    # "long" = more epochs (30 instead of 20); max_len is the same as the other ByT5 entries.
    3: {'name': 'M3_byt5_long', 'model': 'google/byt5-small', 'epochs': 30, 'lr': 1e-4, 'bs': 1, 'ga': 8, 'bidir': False, 'max_len': 512},
    # NOTE(review): apart from its name, M4 has exactly the same settings as M2,
    # so it presumably trains a duplicate ensemble member - confirm what "full" was meant to change.
    4: {'name': 'M4_byt5_full', 'model': 'google/byt5-small', 'epochs': 20, 'lr': 1e-4, 'bs': 1, 'ga': 8, 'bidir': True, 'max_len': 512},

    # mT5-small variants (2 models, ~1.5h total)
    5: {'name': 'M5_mt5_baseline', 'model': 'google/mt5-small', 'epochs': 20, 'lr': 1e-4, 'bs': 2, 'ga': 4, 'bidir': False, 'max_len': 512},
    6: {'name': 'M6_mt5_bidir', 'model': 'google/mt5-small', 'epochs': 20, 'lr': 1e-4, 'bs': 2, 'ga': 4, 'bidir': True, 'max_len': 512},

    # T5-small (1 model, ~0.5h)
    # The original T5 checkpoints are published under the "google-t5" organisation;
    # "google/t5-small" is not a Hub repository id.
    7: {'name': 'M7_t5_small', 'model': 'google-t5/t5-small', 'epochs': 25, 'lr': 1e-4, 'bs': 4, 'ga': 2, 'bidir': False, 'max_len': 512},

    # NLLB-200 (2 models, ~3h total)
    8: {'name': 'M8_nllb', 'model': 'facebook/nllb-200-distilled-600M', 'epochs': 15, 'lr': 5e-5, 'bs': 1, 'ga': 8, 'bidir': False, 'max_len': 256},
    9: {'name': 'M9_nllb_bidir', 'model': 'facebook/nllb-200-distilled-600M', 'epochs': 15, 'lr': 5e-5, 'bs': 1, 'ga': 8, 'bidir': True, 'max_len': 256},
}

# Resolve the model ids belonging to the selected group and report their names.
GROUP_MAP = {1: [1, 2, 3], 2: [4, 5, 6], 3: [7, 8, 9]}
MODEL_IDS = GROUP_MAP[MODEL_GROUP]
_selected_names = [CONFIGS[model_id]['name'] for model_id in MODEL_IDS]
print(f"Training: {_selected_names}")

In [None]:
# ========== Data Preprocessing ==========
# Subscript digits (sign indices such as KU₃) mapped to plain ASCII digits.
_SUBSCRIPT_DIGITS = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")

# Editorial brackets to delete. '<' and '>' are deleted only when they are NOT
# the delimiters of a <gap> / <big_gap> marker, so the markers survive.
_EDITORIAL_BRACKETS = re.compile(
    r'<(?!(?:big_)?gap>)|(?<!<gap)(?<!<big_gap)>|[\[\]⌈⌉⌊⌋]'
)


def normalize_text(text):
    """Normalize one transliteration string.

    Steps, in order:
      1. Missing values (``None`` / NaN) become the empty string.
      2. Subscript digits are replaced by ASCII digits and the text is
         NFKD-normalized.
      3. Runs of three or more dots (or an ellipsis character) become
         `` <big_gap> ``.
      4. Runs of two or more ``x`` and every stand-alone ``x`` token
         (case-insensitive) become `` <gap> ``.
      5. Editorial brackets ``[ ] < > ⌈ ⌉ ⌊ ⌋`` are removed, except the angle
         brackets that belong to a ``<gap>`` / ``<big_gap>`` marker.
      6. Whitespace is collapsed to single spaces and trimmed.

    Args:
        text: Raw cell value; anything that is not missing is passed through ``str``.

    Returns:
        The normalized string (``""`` for missing input).
    """
    if pd.isna(text):
        return ""
    text = str(text).strip().translate(_SUBSCRIPT_DIGITS)
    text = unicodedata.normalize('NFKD', text)
    text = re.sub(r'\.{3,}|…+', ' <big_gap> ', text)
    # Look-arounds instead of \s+x\s+ so that adjacent tokens ("x x") and an
    # "x" at the very start or end of the text are each matched.
    text = re.sub(r'xx+|(?<!\S)x(?!\S)', ' <gap> ', text, flags=re.I)
    # Must run after marker insertion but must not strip the markers' own <>.
    text = _EDITORIAL_BRACKETS.sub('', text)
    return re.sub(r'\s+', ' ', text).strip()

def sentence_align(df):
    """Split document-level pairs into sentence-level pairs where possible.

    For every row, the translation is split into sentences (after ``.``, ``!``
    or ``?`` followed by whitespace) and the transliteration into non-empty
    lines. When there is more than one sentence and the two counts match,
    lines and sentences are paired one-to-one, keeping only pairs where both
    sides are longer than 3 characters. Otherwise the whole row is kept as a
    single pair. The transliteration side is passed through ``normalize_text``.

    Rows whose transliteration or translation is missing or blank are skipped,
    so a missing cell never turns into the literal text "nan".

    Args:
        df: DataFrame with ``transliteration`` and ``translation`` columns.

    Returns:
        DataFrame with columns ``transliteration`` and ``translation``
        (both columns are present even when no pair was produced).
    """
    def _cell_text(row, column):
        # str(NaN) would be the string "nan"; map missing cells to "" instead.
        value = row.get(column, '')
        return '' if pd.isna(value) else str(value)

    aligned = []
    for _, row in df.iterrows():
        src = _cell_text(row, 'transliteration')
        tgt = _cell_text(row, 'translation')
        if not src.strip() or not tgt.strip():
            continue  # one side missing: not a usable training pair

        tgt_sents = [t.strip() for t in re.split(r'(?<=[.!?])\s+', tgt) if t.strip()]
        src_lines = [s.strip() for s in src.split('\n') if s.strip()]
        if len(tgt_sents) > 1 and len(tgt_sents) == len(src_lines):
            aligned.extend(
                {'transliteration': normalize_text(s), 'translation': t}
                for s, t in zip(src_lines, tgt_sents)
                if len(s) > 3 and len(t) > 3
            )
        else:
            aligned.append({'transliteration': normalize_text(src), 'translation': tgt.strip()})
    # Explicit columns keep the schema stable when `aligned` is empty.
    return pd.DataFrame(aligned, columns=['transliteration', 'translation'])

def create_bidirectional(df):
    """Build a shuffled training frame containing both translation directions.

    Every input row yields two output rows: Akkadian -> English and
    English -> Akkadian, each with a task prefix in ``input_text`` and the
    other language in ``target_text``. The combined rows are shuffled with a
    fixed seed (42) and re-indexed from 0. The input frame is not modified.
    """
    akkadian = df['transliteration'].astype(str)
    english = df['translation'].astype(str)

    forward = df.assign(
        input_text="translate Akkadian to English: " + akkadian,
        target_text=english,
    )
    backward = df.assign(
        input_text="translate English to Akkadian: " + english,
        target_text=akkadian,
    )

    combined = pd.concat([forward, backward], ignore_index=True)
    shuffled = combined.sample(frac=1, random_state=42)
    return shuffled.reset_index(drop=True)

def load_data(use_bidir=False, data_dir='/kaggle/input/deep-past-initiative-machine-translation'):
    """Load ``train.csv`` and turn it into model-ready text pairs.

    The CSV is sentence-aligned with ``sentence_align``; then either both
    translation directions are generated (``create_bidirectional``) or only the
    Akkadian -> English direction is built.

    Args:
        use_bidir: When true, include the reverse (English -> Akkadian) examples.
        data_dir: Directory containing ``train.csv``. Defaults to the Kaggle
            input directory, so existing calls behave as before.

    Returns:
        DataFrame with exactly two columns: ``input_text`` and ``target_text``.
    """
    df = sentence_align(pd.read_csv(Path(data_dir) / 'train.csv'))
    print(f"After alignment: {len(df)}")
    if use_bidir:
        df = create_bidirectional(df)
        print(f"After bidirectional: {len(df)}")
    else:
        df['input_text'] = "translate Akkadian to English: " + df['transliteration'].astype(str)
        df['target_text'] = df['translation'].astype(str)
    return df[['input_text', 'target_text']]

In [None]:
# ========== Training Function ==========
def train_model(cfg):
    """Fine-tune one seq2seq model described by ``cfg`` and save it.

    Loads the training pairs, holds out 10% for validation (seed 42), tokenizes
    both sides with truncation at ``cfg['max_len']``, trains with
    ``Seq2SeqTrainer`` and writes the best model (by ``geo_mean``) plus the
    tokenizer to ``./models/<cfg['name']>``.

    Args:
        cfg: One entry of ``CONFIGS``; the keys read here are ``name``,
            ``model``, ``epochs``, ``lr``, ``bs``, ``ga``, ``bidir`` and
            ``max_len``.

    Returns:
        None. The results are the files written to the output directory.
    """
    print(f"\n{'='*60}\nTraining: {cfg['name']} ({cfg['model']})\n{'='*60}")
    # Free whatever a previous model left behind before loading the next one.
    gc.collect(); torch.cuda.empty_cache()

    df = load_data(use_bidir=cfg['bidir'])
    dataset = Dataset.from_pandas(df)
    # NOTE(review): when cfg['bidir'] is true the frame already holds both
    # directions before this split, so the held-out set also contains
    # English -> Akkadian examples and a pair's reverse can land in train.
    split = dataset.train_test_split(test_size=0.1, seed=42)

    tokenizer = AutoTokenizer.from_pretrained(cfg['model'])
    model = AutoModelForSeq2SeqLM.from_pretrained(cfg['model'])

    def preprocess(examples):
        """Tokenize a batch of input/target strings into model features."""
        inputs = [str(x) for x in examples['input_text']]
        targets = [str(x) for x in examples['target_text']]
        model_inputs = tokenizer(inputs, max_length=cfg['max_len'], truncation=True)
        labels = tokenizer(targets, max_length=cfg['max_len'], truncation=True)
        model_inputs['labels'] = labels['input_ids']
        return model_inputs

    train_tok = split['train'].map(preprocess, batched=True, remove_columns=split['train'].column_names)
    val_tok = split['test'].map(preprocess, batched=True, remove_columns=split['test'].column_names)

    metric_chrf = evaluate.load('chrf')
    metric_bleu = evaluate.load('sacrebleu')

    pad_id = tokenizer.pad_token_id
    # len(tokenizer) also counts added tokens, which vocab_size may leave out;
    # using it as the upper bound avoids rewriting valid ids to another token.
    max_token_id = len(tokenizer) - 1

    def compute_metrics(eval_preds):
        """Decode predictions/labels and return chrF, BLEU and their geometric mean."""
        preds, labels = eval_preds
        if isinstance(preds, tuple): preds = preds[0]
        if hasattr(preds, 'ndim') and preds.ndim == 3: preds = np.argmax(preds, axis=-1)
        preds = np.asarray(preds)
        # Negative ids are padding (labels use -100, handled the same way below);
        # map them to the pad token so they are skipped when decoding.
        preds = np.where(preds < 0, pad_id, preds)
        preds = np.clip(preds.astype(np.int64), 0, max_token_id)
        dec_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        labels = np.where(labels != -100, labels, pad_id)
        dec_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        chrf = metric_chrf.compute(predictions=dec_preds, references=dec_labels)['score']
        bleu = metric_bleu.compute(predictions=dec_preds, references=[[x] for x in dec_labels])['score']
        return {'chrf': chrf, 'bleu': bleu, 'geo_mean': (chrf*bleu)**0.5 if chrf>0 and bleu>0 else 0}

    output_dir = f"./models/{cfg['name']}"
    args = Seq2SeqTrainingArguments(
        output_dir=output_dir, eval_strategy='epoch', save_strategy='epoch',
        learning_rate=cfg['lr'], optim='adafactor', label_smoothing_factor=0.2,
        fp16=False, per_device_train_batch_size=cfg['bs'], per_device_eval_batch_size=cfg['bs'],
        gradient_accumulation_steps=cfg['ga'], weight_decay=0.01, save_total_limit=1,
        num_train_epochs=cfg['epochs'], predict_with_generate=True, logging_steps=50,
        # Without an explicit limit, evaluation generation falls back to the
        # model's default max length, which truncates translations and skews
        # the metric used to pick the best checkpoint. Generation still stops
        # at EOS, but evaluation may take longer than before.
        generation_max_length=cfg['max_len'],
        report_to='none', load_best_model_at_end=True, metric_for_best_model='geo_mean', greater_is_better=True,
    )

    trainer = Seq2SeqTrainer(
        model=model, args=args, train_dataset=train_tok, eval_dataset=val_tok,
        data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
        tokenizer=tokenizer, compute_metrics=compute_metrics,
    )
    trainer.train()
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Saved: {output_dir}")
    # Drop references so the memory can be reclaimed before the next model.
    del model, trainer; gc.collect(); torch.cuda.empty_cache()

In [None]:
# ========== Train Models ==========
# Train each model of the selected group in turn, then print a completion banner.
for model_id in MODEL_IDS:
    train_model(CONFIGS[model_id])

_banner = "="*60
print("\n" + _banner + "\n✅ Training Complete!\n" + _banner)

In [None]:
!ls -la ./models/