# Deep Past Challenge - Submission Notebook (ByT5)

This notebook loads the trained ByT5 model and generates predictions.

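**Important:** Inference must apply exactly the same preprocessing as training: editorial-mark removal, ASCII → diacritics conversion, gap normalisation, and (when the lexicon is enabled) name tagging.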

In [1]:
# ===========================================
# SETUP
# ===========================================

import warnings
warnings.simplefilter("ignore")

import torch
import pandas as pd
import re
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, T5ForConditionalGeneration

# Run on the GPU when CUDA is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Device: {DEVICE}")

2026-01-14 11:28:53.908446: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768390134.302985      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768390134.399468      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768390135.257141      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768390135.257189      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768390135.257192      55 computation_placer.cc:177] computation placer alr

Device: cuda


In [2]:
# ===========================================
# SETTINGS
# ===========================================

# Checkpoint directory handed to from_pretrained() for both the tokenizer and the model.
MODEL_PATH = "/kaggle/input/keshbyt5/pytorch/small_oldasyrian_finetune/1/byt5_akkadian_small_final"
# CSV read by the LEXICON cell; its 'type' and 'form' columns populate the PN/GN name sets.
LEXICON_PATH = "/kaggle/input/deep-past-initiative-machine-translation/OA_Lexicon_eBL.csv"

# Must match training!
# When True, tag_names() wraps lexicon hits in [PN]...[/PN] / [GN]...[/GN]; when False it is a no-op.
USE_LEXICON = True

# Passed to the tokenizer as max_length (with truncation=True) for the source text.
MAX_SOURCE_LEN = 1024
# Target-side length limit; presumably meant as the generation cap - verify against the generate() call.
MAX_TARGET_LEN = 1024
# Passed to generate() as num_beams.
BEAM_WIDTH = 4
# Passed to generate() as repetition_penalty.
REP_PENALTY = 1.5
# Passed to generate() as no_repeat_ngram_size.
NO_REPEAT_NGRAM = 0
# Rows per generate() call in the inference loop.
# NOTE: the parameter-search cell further down rebinds BATCH_SIZE to 24.
BATCH_SIZE = 4
# Task prefix prepended to every cleaned transliteration before tokenization.
PREFIX = "translate Akkadian to English: "

In [3]:
# ===========================================
# LEXICON
# ===========================================

# Name sets used by tag_names(); they stay empty when the lexicon is disabled.
PN_NAMES = set()
GN_NAMES = set()

if not USE_LEXICON:
    print("Lexicon disabled")
else:
    lex = pd.read_csv(LEXICON_PATH)
    # Lower-cased, non-null 'form' values for each lexicon entry type.
    names_by_type = {
        kind: set(lex.loc[lex['type'] == kind, 'form'].str.lower().dropna())
        for kind in ('PN', 'GN')
    }
    PN_NAMES = names_by_type['PN']
    GN_NAMES = names_by_type['GN']
    print(f"Lexicon: {len(PN_NAMES)} PNs, {len(GN_NAMES)} GNs")

Lexicon: 13046 PNs, 328 GNs


In [4]:
# ===========================================
# PREPROCESSING (must match training!)
# ===========================================

# ASCII transliteration digraphs and their diacritic replacements.
# Entry order is significant: replacements are applied in table order
# (all keys currently have the same length, and the sort below is stable).
ASCII_TO_DIACRITIC = {
    # "sz" / "sh" digraphs
    "sz": "š", "SZ": "Š", "Sz": "Š",
    "sh": "š", "SH": "Š", "Sh": "Š",
    # trailing-comma notation
    "s,": "ṣ", "S,": "Ṣ",
    "t,": "ṭ", "T,": "Ṭ",
    "z,": "ẓ", "Z,": "Ẓ",
    # leading-dot notation
    ".s": "ṣ", ".S": "Ṣ",
    ".t": "ṭ", ".T": "Ṭ",
    ".z": "ẓ", ".Z": "Ẓ",
    # h variants
    "h,": "ḫ", "H,": "Ḫ",
    ".h": "ḫ", ".H": "Ḫ",
    "hh": "ḫ", "HH": "Ḫ",
    # digit-suffixed sibilants
    "s2": "š", "S2": "Š",
    "s3": "ś", "S3": "Ś",
    # digit-suffixed vowels
    "a2": "á", "a3": "à",
    "e2": "é", "e3": "è",
    "i2": "í", "i3": "ì",
    "u2": "ú", "u3": "ù",
}

# Unicode subscript characters and the plain characters they are flattened to.
SUBSCRIPTS = dict(zip("₀₁₂₃₄₅₆₇₈₉ₓ", "0123456789x"))


def normalize_ascii(text):
    """Replace ASCII digraphs with diacritics, then flatten subscript characters.

    Falsy input (``None``, ``""``) is returned unchanged.

    The digraph pass runs first and the subscript pass second, so a subscript
    digit is only turned into a plain digit (``a₂`` -> ``a2``) and is not
    converted further by the digraph table.
    """
    if not text:
        return text

    # Longest keys first; ties keep the table's insertion order.
    ordered_keys = sorted(ASCII_TO_DIACRITIC, key=len, reverse=True)
    for ascii_seq in ordered_keys:
        text = text.replace(ascii_seq, ASCII_TO_DIACRITIC[ascii_seq])

    # Every SUBSCRIPTS key is a single character, so one translate() call is
    # equivalent to replacing them one after another.
    return text.translate(str.maketrans(SUBSCRIPTS))

def normalize_gaps(text):
    """Rewrite damage markers as ``<gap>`` / ``<big_gap>`` tokens.

    * A stand-alone ``x`` token (any case) becomes ``<gap>``; a run of two or
      more consecutive stand-alone ``x`` tokens becomes a single ``<big_gap>``.
    * A token starting with ``x-`` has that leading ``x`` replaced by ``<gap>``;
      otherwise a token ending in ``-x`` has the final ``x`` replaced by
      ``-<gap>``.
    * Two or more adjacent ``<gap>`` markers collapse to ``<big_gap>``, and any
      run of three or more dots becomes ``<big_gap>``.

    Falsy input (``None``, ``""``) is returned unchanged.
    """
    if not text:
        return text

    def _run_marker(run_length):
        # A lone "x" is a small gap; a longer run is a big gap.
        return "<gap>" if run_length == 1 else "<big_gap>"

    pieces = []
    pending_x = 0  # length of the current run of stand-alone "x" tokens
    for token in text.split():
        lowered = token.lower()
        if lowered == "x":
            pending_x += 1
            continue
        if pending_x:
            pieces.append(_run_marker(pending_x))
            pending_x = 0
        if lowered.startswith("x-"):
            token = "<gap>" + token[1:]
        elif lowered.endswith("-x"):
            # Only the final "x" is removed, so the existing hyphen stays and a
            # second one is added ("a-x" -> "a--<gap>"). Kept as-is because the
            # preprocessing has to match what the model was trained on.
            token = token[:-1] + "-<gap>"
        pieces.append(token)
    if pending_x:
        pieces.append(_run_marker(pending_x))

    joined = " ".join(pieces)
    joined = re.sub(r"(<gap>\s*){2,}", "<big_gap> ", joined)
    joined = re.sub(r"\.\.\.+", " <big_gap> ", joined)
    return joined.strip()

def tag_names(text):
    """Wrap lexicon names in ``[PN]...[/PN]`` or ``[GN]...[/GN]`` tags.

    Each whitespace-separated word is looked up with its hyphens removed and
    lower-cased. PN_NAMES is checked before GN_NAMES, so a form present in both
    sets is tagged as PN. The text is returned unchanged when USE_LEXICON is
    off or the text is falsy; otherwise words are re-joined with single spaces.
    """
    if not USE_LEXICON or not text:
        return text

    def _wrap(word):
        lookup = word.replace("-", "").lower()
        if lookup in PN_NAMES:
            return f"[PN]{word}[/PN]"
        if lookup in GN_NAMES:
            return f"[GN]{word}[/GN]"
        return word

    return " ".join(_wrap(word) for word in text.split())

def clean_akkadian(text):
    """Run the full transliteration clean-up pipeline on one value.

    Steps, in order: drop ``!`` and ``?``, drop half-bracket marks, unwrap
    square-bracketed spans (keeping their content), normalize_ascii,
    normalize_gaps, tag_names, then collapse whitespace.

    Returns ``""`` for missing (NaN/None) or blank input.
    """
    if pd.isna(text):
        return ""
    raw = str(text)
    if not raw.strip():
        return ""

    without_marks = raw.replace("!", "").replace("?", "")
    without_marks = re.sub(r"[˹˺]", "", without_marks)
    cleaned = re.sub(r"\[([^\]]+)\]", r"\1", without_marks)

    # The order of these stages is part of the training-time contract.
    for stage in (normalize_ascii, normalize_gaps, tag_names):
        cleaned = stage(cleaned)

    return re.sub(r"\s+", " ", cleaned).strip()

# Use this in submission
preprocess = clean_akkadian

In [5]:
# ===========================================
# LOAD MODEL
# ===========================================

# Tokenizer and weights are both read from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = T5ForConditionalGeneration.from_pretrained(MODEL_PATH)
model = model.to(DEVICE)
# Evaluation mode: the notebook only generates, it never trains.
model.eval()
print(f"Model loaded: {model.num_parameters()/1e6:.1f}M params")

Model loaded: 299.6M params


In [6]:
# ===========================================
# LOAD TEST DATA
# ===========================================

test_df = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/test.csv")
print(f"Test samples: {len(test_df)}")

# Preprocess with the same pipeline that was used for training.
test_df['clean'] = test_df['transliteration'].apply(preprocess)

# Show samples. head(3) yields at most three rows, so a test set with fewer
# than three rows no longer raises IndexError.
print("\nPreprocessed samples:")
for cleaned in test_df['clean'].head(3):
    print(f"  {cleaned[:60]}...")

Test samples: 4

Preprocessed samples:
  um-ma kà-ru-um kà-ni-ia-ma a-na aa-qí-il… da-tim aí-ip-ri-ni...
  i-na mup-pì-im aa a-lim(ki) ia-tù u„-mì-im a-nim ma-ma-an KÙ...
  ki-ma mup-pì-ni ta-áa-me-a-ni a-ma-kam lu a-na aí-mì-im a-na...


In [7]:
predictions = []

for start in tqdm(range(0, len(test_df), BATCH_SIZE)):
    # Slice the batch once and reuse it for the prompts and the length heuristic.
    batch_clean = test_df['clean'].iloc[start:start + BATCH_SIZE].tolist()
    batch_texts = [PREFIX + t for t in batch_clean]
    batch_lengths = [len(t.encode('utf-8')) for t in batch_clean]

    inputs = tokenizer(
        batch_texts,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=MAX_SOURCE_LEN
    ).to(DEVICE)

    # Min length = 50% of the shortest input in the batch (UTF-8 bytes, prefix
    # excluded). The byte length is measured before truncation, so it is clamped
    # to stay below the generation cap; otherwise a very long source would
    # request more forced tokens than generate() is allowed to produce.
    min_len = min(int(min(batch_lengths) * 0.5), max(MAX_TARGET_LEN - 1, 0))

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_TARGET_LEN,   # was a hard-coded 1024; same value, now driven by SETTINGS
            min_new_tokens=min_len,          # Force minimum output
            num_beams=BEAM_WIDTH,
            repetition_penalty=REP_PENALTY,
            no_repeat_ngram_size=NO_REPEAT_NGRAM,
            length_penalty=1.0,              # library default; raise above 1.0 to favour longer beams
        )

    preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    predictions.extend(preds)

# One prediction per test row, in row order.
test_df['translation'] = predictions

  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# ===========================================
# SHOW SAMPLES
# ===========================================

print("\nSample predictions:")
# Show at most four rows; capping at len(test_df) avoids an IndexError when the
# test set is shorter than that.
for i in range(min(4, len(test_df))):
    row = test_df.iloc[i]
    print(f"\n{i+1}. Input: {row['clean']}...")
    print(f"   Output: {row['translation']}...")


Sample predictions:

1. Input: um-ma kà-ru-um kà-ni-ia-ma a-na aa-qí-il… da-tim aí-ip-ri-ni kà-ar kà-ar-ma ú wa-bar-ra-tim qí-bi„-ma mup-pu-um aa a-lim(ki) i-li-kam...
   Output: From the Kanesh colony to Aqil-dātum, our messenger, and say: The letter came to the City."...

2. Input: i-na mup-pì-im aa a-lim(ki) ia-tù u„-mì-im a-nim ma-ma-an KÙ.AN i-aa-ú-mu-ni i-na né-mì-lim da-aùr ú-lá e-WA ia-ra-tí-au kà-ru-um kà-ni-ia i-lá-qé...
   Output: In the letter of the City I myself will not delay the gold in the City. This very day none of the silver is not cleared. The Kanesh colony will take the Kanesh colony....

3. Input: ki-ma mup-pì-ni ta-áa-me-a-ni a-ma-kam lu a-na aí-mì-im a-na É.GAL-lim i-dí-in lu té-ra-at É.GAL-lim ú-kà-lim lu na-aí-ma a-dí-ni lá i-dí-in ma-lá KÙ.AN na-áa-ú ni-bi„-it a-aí-im au-um-au ú au-mì a-bi„-au i-na mup-pì-im lu-up-ta-nim-ma ia-tí aí-ip-ri-ni aé-bi„-lá-nim...
   Output: As soon as you heard our letter, he gave them to the palace, be it for the other palace, 

In [9]:
# ===========================================
# SUBMISSION
# ===========================================

# Keep only the two columns that go into the submission file.
submission = test_df.loc[:, ['id', 'translation']]
submission.to_csv('submission.csv', index=False)
print(f"Saved {len(submission)} predictions")
submission.head()

Saved 4 predictions


Unnamed: 0,id,translation
0,0,"From the Kanesh colony to Aqil-dātum, our mess..."
1,1,In the letter of the City I myself will not de...
2,2,"As soon as you heard our letter, he gave them ..."
3,3,I have sent our letter to the kar and wabarrāt...


In [10]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.6.0-py3-none-any.whl.metadata (39 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading sacrebleu-2.6.0-py3-none-any.whl (100 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.2.0 sacrebleu-2.6.0


In [None]:
# ===========================================
# PARAMETER SEARCH (train subset proxy)
# now includes no_repeat_ngram_size + encoder_no_repeat_ngram_size
# ===========================================

import itertools
import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm
import sacrebleu

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# -------------------------
# Load pretrained checkpoint
# -------------------------
# Set MODEL_DIR to your saved checkpoint folder
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

MODEL_DIR = MODEL_PATH
# NOTE: this rebinds the notebook-wide `tokenizer` and `model` names.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR).to(DEVICE)
model.eval()

# -------------------------
# Data sample (train.csv proxy)
# -------------------------
df = pd.read_csv("/kaggle/input/deep-past-initiative-machine-translation/train.csv")

# preprocess() should exist in your notebook; otherwise define identity:
# def preprocess(x): return x

N_SAMPLES = 20
SEED = 42
# NOTE: this rebinds the notebook-wide BATCH_SIZE (4 in the SETTINGS cell), so
# re-running the inference cell after this one will use 24.
BATCH_SIZE = 24

# A row without a source or a reference cannot be scored (the metric code calls
# .split() on every reference), so such rows are dropped before sampling. The
# sample size is capped at the number of usable rows so a small training file
# does not make DataFrame.sample raise.
scorable = df.dropna(subset=["transliteration", "translation"])
sample = (
    scorable
    .sample(min(N_SAMPLES, len(scorable)), random_state=SEED)
    .reset_index(drop=True)
)
sample["clean"] = sample["transliteration"].apply(preprocess)

# -------------------------
# Parameter grid
# -------------------------
# Candidate values for each generate() keyword argument; every combination of
# these values is evaluated by the search loop.
param_grid = dict(
    num_beams=[2, 4, 6],
    length_penalty=[0.8, 0.9, 1.0, 1.1, 1.2],
    repetition_penalty=[1.0, 1.05, 1.10],
    no_repeat_ngram_size=[0, 3, 4],
    encoder_no_repeat_ngram_size=[0, 3],
    max_new_tokens=[128, 256, 384],
)

# Key order matches the order of values inside each combination tuple.
keys = [name for name in param_grid]
combinations = [combo for combo in itertools.product(*param_grid.values())]
print(f"Testing {len(combinations)} combinations on {len(sample)} samples...")

# One dict per evaluated configuration is appended here by the search loop.
results = []

@torch.no_grad()
def decode_batch(texts, gen_kwargs):
    """Translate a batch of cleaned transliterations with the given settings.

    Args:
        texts: list of preprocessed source strings; PREFIX is prepended here.
        gen_kwargs: keyword arguments forwarded to ``model.generate``. Must
            contain ``"num_beams"``; must not contain ``do_sample`` or
            ``early_stopping``, which are set by this function.

    Returns:
        List of decoded strings (special tokens removed), one per input text.
    """
    prompts = [PREFIX + source for source in texts]
    encoded = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_SOURCE_LEN,
    ).to(DEVICE)

    # early_stopping is switched on only when more than one beam is used.
    multi_beam = bool(gen_kwargs["num_beams"] > 1)
    generated = model.generate(
        **encoded,
        do_sample=False,
        early_stopping=multi_beam,
        **gen_kwargs,
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

# Sources and references do not depend on the decoding parameters, so they are
# batched once here instead of being re-sliced for every combination.
source_batches = [
    sample["clean"].iloc[start:start + BATCH_SIZE].tolist()
    for start in range(0, len(sample), BATCH_SIZE)
]
refs = sample["translation"].tolist()
ref_len = np.mean([len(r.split()) for r in refs]) if refs else 0.0

# Effective configurations already evaluated; the single-beam guard below can
# map several grid points onto the same settings.
seen_configs = set()

for combo in tqdm(combinations):
    params = dict(zip(keys, combo))

    # Small guard: with a single beam the n-gram constraints are switched off.
    if params["num_beams"] == 1:
        params["no_repeat_ngram_size"] = 0
        params["encoder_no_repeat_ngram_size"] = 0

    # Skip settings that were already scored so no generation work is repeated.
    config_key = tuple(params[k] for k in keys)
    if config_key in seen_configs:
        continue
    seen_configs.add(config_key)

    preds = []
    for batch_sources in source_batches:
        preds.extend(decode_batch(batch_sources, params))

    bleu = sacrebleu.corpus_bleu(preds, [refs]).score
    chrf = sacrebleu.corpus_chrf(preds, [refs], word_order=2).score
    # Geometric mean of the two corpus-level scores; used as the ranking key.
    geo  = float((bleu * chrf) ** 0.5)

    # Word-count ratio of predictions to references (NaN when references are empty).
    pred_len = np.mean([len(p.split()) for p in preds]) if preds else 0.0
    len_ratio = (pred_len / ref_len) if ref_len > 0 else np.nan

    results.append({
        **params,
        "BLEU": bleu,
        "chrF++": chrf,
        "GeoMean": geo,
        "len_ratio": len_ratio,
    })

results_df = (
    pd.DataFrame(results)
      .sort_values(["GeoMean", "BLEU"], ascending=False)
      .reset_index(drop=True)
)

print("\nTop 10:")
display(results_df.head(10))

print("\nBest config:")
best = results_df.iloc[0].to_dict()
best


Testing 810 combinations on 20 samples...


  0%|          | 0/810 [00:00<?, ?it/s]

Testing 5 combinations on 10 samples...


  0%|          | 0/5 [00:00<?, ?it/s]


Top 5:
   beams  rep_pen  no_repeat  len_pen        geo
4      4      1.0          0      2.0  24.610324
3      4      1.0          0      1.8  24.602071
2      4      1.0          0      1.5  24.219573
1      4      1.0          0      1.0  24.110178
0      4      1.0          0      0.8  23.376068
