# Deep Past Challenge - ByT5 9-Model Ensemble Training

Each run of this notebook trains 3 of the 9 ByT5-small models for the Akkadian-to-English translation task.
Run it 3 times, setting `MODEL_GROUP` to 1, 2, and 3 in turn, to train all 9 models.

In [None]:
# Configuration - Change this for each run.
# MODEL_GROUP picks which three of the nine model configurations this run trains.
MODEL_GROUP = 1  # 1: M1-M3, 2: M4-M6, 3: M7-M9

import os
import gc
import re
import unicodedata
import numpy as np
import pandas as pd
import torch
from pathlib import Path
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import evaluate

In [None]:
# Model configurations for all 9 models, keyed by model id.
# Every model starts from the M1 baseline hyperparameters; each entry spells
# out only the settings it changes relative to that baseline.
_BASELINE = {
    'epochs': 20,
    'lr': 1e-4,
    'bidir': False,
    'extra': False,
    'label_smooth': 0.2,
    'max_len': 512,
}

CONFIGS = {
    1: {'name': 'M1_baseline', **_BASELINE},
    2: {'name': 'M2_bidirectional', **_BASELINE, 'bidir': True},
    3: {'name': 'M3_long_train', **_BASELINE, 'epochs': 30},
    4: {'name': 'M4_low_lr', **_BASELINE, 'lr': 5e-5},
    5: {'name': 'M5_high_lr', **_BASELINE, 'epochs': 15, 'lr': 2e-4},
    6: {'name': 'M6_extra_data', **_BASELINE, 'extra': True},
    7: {'name': 'M7_no_smoothing', **_BASELINE, 'label_smooth': 0.0},
    8: {'name': 'M8_short_len', **_BASELINE, 'max_len': 256},
    9: {'name': 'M9_full_augment', **_BASELINE, 'bidir': True, 'extra': True},
}

# Select models for this run.
# Each group maps to three model ids. An unknown MODEL_GROUP is rejected
# explicitly so that a typo cannot silently retrain group 3.
_MODEL_GROUPS = {
    1: [1, 2, 3],
    2: [4, 5, 6],
    3: [7, 8, 9],
}
if MODEL_GROUP not in _MODEL_GROUPS:
    raise ValueError(
        f"MODEL_GROUP must be one of {sorted(_MODEL_GROUPS)}, got {MODEL_GROUP!r}"
    )
MODEL_IDS = _MODEL_GROUPS[MODEL_GROUP]

print(f"Training models: {[CONFIGS[i]['name'] for i in MODEL_IDS]}")

In [None]:
# ========== Data Preprocessing ==========
# Translation table folding Unicode subscript digits to ASCII digits.
# Built once at module level instead of on every call.
_SUBSCRIPT_MAP = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")

def normalize_text(text):
    """Normalize one transliteration string for model input.

    Steps, in order: strip, fold subscript digits to ASCII, apply NFKD
    normalization, replace runs of three or more dots (or ellipsis
    characters) with a big-gap marker, replace ``xx...`` runs and standalone
    ``x`` tokens (case-insensitive) with a gap marker, remove square, angle
    and half brackets, and collapse whitespace.

    Args:
        text: The raw text. Any non-string value (None, NaN, numbers,
            containers) is treated as missing.

    Returns:
        The normalized string, or "" when ``text`` is not a string.

    NOTE(review): the markers are inserted as ``<big_gap>`` / ``<gap>`` but
    the bracket-removal step that follows also strips ``<`` and ``>``, so they
    end up in the output as the bare words ``big_gap`` / ``gap``. That output
    format is kept unchanged here; confirm it matches what inference-time
    preprocessing produces before altering it.
    """
    # Type check only: a str is never NaN, and calling pd.isna() on a
    # list-like value would return an array whose truth value is ambiguous.
    if not isinstance(text, str):
        return ""
    text = text.strip().translate(_SUBSCRIPT_MAP)
    text = unicodedata.normalize('NFKD', text)
    text = re.sub(r'\.{3,}|…+', ' <big_gap> ', text)
    # Lookarounds leave the surrounding whitespace unconsumed, so consecutive
    # standalone markers ("x x x") are all replaced rather than every other one.
    text = re.sub(r'xx+|(?<=\s)x(?=\s)', ' <gap> ', text, flags=re.IGNORECASE)
    # Strip editorial brackets, including both ceiling and both floor half-brackets.
    text = re.sub(r'[\[\]<>⌈⌉⌊⌋]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

def sentence_align(df):
    """Build training pairs, splitting records into sentences where possible.

    For each row, the translation is split after ``.``, ``!`` or ``?`` followed
    by whitespace, and the transliteration is split on newlines. When there is
    more than one sentence and both sides have the same number of pieces, the
    pieces are paired one-to-one (pairs where either side has 3 characters or
    fewer are skipped). Otherwise the whole row is kept as a single pair.

    Args:
        df: DataFrame whose rows provide 'transliteration' and 'translation'.

    Returns:
        DataFrame with columns 'transliteration' (normalized with
        ``normalize_text``) and 'translation'. The columns are present even
        when no pairs were produced. Rows whose source or target is missing
        or empty are dropped.
    """
    def _cell_text(row, column):
        # Missing cells arrive as NaN/None; str() would turn them into the
        # literal text "nan"/"None" and feed that to the model as real data.
        value = row.get(column, '')
        if isinstance(value, str):
            return value
        return '' if pd.isna(value) else str(value)

    aligned = []
    for _, row in df.iterrows():
        src = _cell_text(row, 'transliteration')
        tgt = _cell_text(row, 'translation')
        tgt_sents = [t.strip() for t in re.split(r'(?<=[.!?])\s+', tgt) if t.strip()]
        src_lines = [s.strip() for s in src.split('\n') if s.strip()]
        if len(tgt_sents) > 1 and len(tgt_sents) == len(src_lines):
            for s, t in zip(src_lines, tgt_sents):
                if len(s) > 3 and len(t) > 3:
                    aligned.append({'transliteration': normalize_text(s), 'translation': t})
        else:
            source = normalize_text(src)
            target = tgt.strip()
            # A pair with an empty side carries no training signal.
            if source and target:
                aligned.append({'transliteration': source, 'translation': target})
    # Pin the columns so an empty result still has the expected schema.
    return pd.DataFrame(aligned, columns=['transliteration', 'translation'])

def create_bidirectional(df):
    """Return a shuffled frame containing every pair in both directions.

    Each input row yields two output rows: Akkadian-to-English (input is the
    prefixed transliteration, target is the translation) and
    English-to-Akkadian (input is the prefixed translation, target is the
    transliteration). The combined rows are shuffled with a fixed seed and
    re-indexed from 0. The input frame is not modified.
    """
    akkadian = df['transliteration'].astype(str)
    english = df['translation'].astype(str)

    akk_to_eng = df.assign(
        input_text="translate Akkadian to English: " + akkadian,
        target_text=english,
    )
    eng_to_akk = df.assign(
        input_text="translate English to Akkadian: " + english,
        target_text=akkadian,
    )

    combined = pd.concat([akk_to_eng, eng_to_akk], ignore_index=True)
    shuffled = combined.sample(frac=1, random_state=42)
    return shuffled.reset_index(drop=True)

def load_data(use_bidir=False, use_extra=False, data_path=None):
    """Load train.csv and build the (input_text, target_text) training frame.

    Args:
        use_bidir: When True, include both Akkadian-to-English and
            English-to-Akkadian pairs (via ``create_bidirectional``).
            Otherwise only Akkadian-to-English pairs are produced.
        use_extra: Requests additional training data. No extra data source is
            implemented, so this currently has no effect on the returned
            frame; a warning is emitted so the no-op is not silent.
        data_path: Directory containing ``train.csv``. Defaults to the Kaggle
            competition input directory.

    Returns:
        DataFrame with exactly the columns 'input_text' and 'target_text'.
    """
    if use_extra:
        import warnings
        warnings.warn(
            "use_extra=True was requested, but no extra data source is "
            "implemented; training on train.csv only.",
            stacklevel=2,
        )

    if data_path is None:
        data_path = Path('/kaggle/input/deep-past-initiative-machine-translation')
    else:
        data_path = Path(data_path)

    train_df = pd.read_csv(data_path / 'train.csv')
    train_df = sentence_align(train_df)
    print(f"After alignment: {len(train_df)}")

    if use_bidir:
        train_df = create_bidirectional(train_df)
        print(f"After bidirectional: {len(train_df)}")
    else:
        train_df['input_text'] = "translate Akkadian to English: " + train_df['transliteration'].astype(str)
        train_df['target_text'] = train_df['translation'].astype(str)

    return train_df[['input_text', 'target_text']]

In [None]:
# ========== Training Function ==========
def train_model(config):
    """Fine-tune one google/byt5-small model as described by ``config`` and save it.

    The training frame from ``load_data`` is split 90/10 (seed 42) into train
    and validation sets. The model is evaluated and checkpointed every epoch,
    and the checkpoint with the best validation ``geo_mean`` (geometric mean
    of chrF and BLEU) is reloaded at the end and saved.

    Args:
        config: One entry of ``CONFIGS``. The keys read here are 'name',
            'epochs', 'lr', 'max_len', 'bidir', 'extra' and 'label_smooth'.

    Returns:
        None. The final model and tokenizer are written to
        ``./models/<config['name']>``.
    """
    print(f"\n{'='*60}")
    print(f"Training: {config['name']}")
    print(f"Epochs: {config['epochs']}, LR: {config['lr']}, MaxLen: {config['max_len']}")
    print(f"{'='*60}\n")

    # Release memory left over from a previously trained model.
    gc.collect()
    torch.cuda.empty_cache()

    # Load data
    train_df = load_data(use_bidir=config['bidir'], use_extra=config['extra'])
    dataset = Dataset.from_pandas(train_df)
    split = dataset.train_test_split(test_size=0.1, seed=42)

    # Load model
    tokenizer = AutoTokenizer.from_pretrained('google/byt5-small')
    model = AutoModelForSeq2SeqLM.from_pretrained('google/byt5-small')

    # Tokenize
    def preprocess(examples):
        """Tokenize a batch; inputs and targets are both truncated to max_len."""
        inputs = [str(x) for x in examples['input_text']]
        targets = [str(x) for x in examples['target_text']]
        model_inputs = tokenizer(inputs, max_length=config['max_len'], truncation=True)
        labels = tokenizer(targets, max_length=config['max_len'], truncation=True)
        model_inputs['labels'] = labels['input_ids']
        return model_inputs

    train_tok = split['train'].map(preprocess, batched=True, remove_columns=split['train'].column_names)
    val_tok = split['test'].map(preprocess, batched=True, remove_columns=split['test'].column_names)

    # Metrics
    metric_chrf = evaluate.load('chrf')
    metric_bleu = evaluate.load('sacrebleu')

    def compute_metrics(eval_preds):
        """Decode predictions and labels, then score them with chrF and BLEU."""
        preds, labels = eval_preds
        if isinstance(preds, tuple): preds = preds[0]
        # A 3-D array is treated as per-token scores and reduced to token ids.
        if hasattr(preds, 'ndim') and preds.ndim == 3:
            preds = np.argmax(preds, axis=-1)
        # Keep ids inside the tokenizer's range so decoding cannot fail
        # (presumably guards against negative padding values such as -100).
        preds = np.clip(preds.astype(np.int64), 0, tokenizer.vocab_size - 1)
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        # -100 marks ignored label positions; swap in the pad id before decoding.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        chrf = metric_chrf.compute(predictions=decoded_preds, references=decoded_labels)['score']
        bleu = metric_bleu.compute(predictions=decoded_preds, references=[[x] for x in decoded_labels])['score']
        return {'chrf': chrf, 'bleu': bleu, 'geo_mean': (chrf * bleu) ** 0.5 if chrf > 0 and bleu > 0 else 0.0}

    # Training
    output_dir = f"./models/{config['name']}"
    args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        eval_strategy='epoch',
        save_strategy='epoch',
        learning_rate=config['lr'],
        optim='adafactor',
        label_smoothing_factor=config['label_smooth'],
        fp16=False,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=8,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=config['epochs'],
        predict_with_generate=True,
        # Without an explicit limit, evaluation-time generation falls back to
        # the model's default generation length (20 tokens in transformers).
        # For a byte-level model that is about 20 bytes, which truncates every
        # prediction and distorts chrF/BLEU and best-checkpoint selection.
        # Labels are truncated to max_len, so generate up to the same length.
        generation_max_length=config['max_len'],
        logging_steps=50,
        report_to='none',
        load_best_model_at_end=True,
        metric_for_best_model='geo_mean',
        greater_is_better=True,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model saved to {output_dir}")

    # Drop references before the next model is loaded to keep peak memory down.
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# ========== Train Selected Models ==========
# Train each configuration selected for this run, one after another.
for model_id in MODEL_IDS:
    train_model(CONFIGS[model_id])

# Completion banner: blank line, rule, message, rule.
print("\n" + "=" * 60, "Training complete!", "=" * 60, sep="\n")

In [None]:
# Save models as Kaggle dataset: list ./models/ to confirm what was written.
# NOTE(review): this cell only lists the directory; publishing it as a Kaggle
# dataset is presumably done from the notebook output afterwards - confirm.
!ls -la ./models/