In [None]:
import os
import logging
from transformers import logging as hf_logging

# Restrict Hugging Face transformers logging to errors only.
hf_logging.set_verbosity_error()

# Environment switches for the rest of the session, set in one call:
# TF_CPP_MIN_LOG_LEVEL and TOKENIZERS_PARALLELISM.
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',
    "TOKENIZERS_PARALLELISM": "false",
})

In [None]:
import os
import re
import gc
import glob
import pandas as pd
import numpy as np
import torch
import warnings
from torch.utils.data import Dataset, DataLoader
from transformers import (
    Trainer, 
    TrainingArguments, 
    DataCollatorWithPadding,
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer, 
    EarlyStoppingCallback
)
from sklearn.model_selection import GroupKFold
from difflib import SequenceMatcher

# Re-assert the tokenizers setting so this cell also works when run on its own.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings(action="ignore")

In [None]:
class CONFIG:
    """Class-level settings shared by the mining, training and inference cells.

    The class body runs once, when the cell is executed: it checks whether
    ``MODEL_PATH`` directly contains a ``config.json`` and, if not, searches its
    sub-folders for one and repoints ``MODEL_PATH`` at the first match.
    """

    # Competition data directory and the directory the Trainer writes into.
    DATA_DIR = "/kaggle/input/deep-past-initiative-machine-translation/"
    OUTPUT_DIR = "./results"

    # Root of the attached model dataset; may be replaced by a sub-folder below.
    MODEL_PATH = "/kaggle/input/nllb-200-distilled-600m"

    if not os.path.exists(os.path.join(MODEL_PATH, "config.json")):
        # Sorted so that the chosen sub-folder does not depend on filesystem order.
        potential_paths = sorted(glob.glob(f"{MODEL_PATH}/**/config.json", recursive=True))
        if potential_paths:
            MODEL_PATH = os.path.dirname(potential_paths[0])
            print(f"‚úÖ Found correct model sub-folder at: {MODEL_PATH}")
        else:
            # Previously silent: the failure only surfaced later inside from_pretrained().
            print(f"WARNING: no config.json found at or below {MODEL_PATH}; model loading will fail.")
    else:
        print(f"‚úÖ Found NLLB Model at: {MODEL_PATH}")

    # HYPERPARAMETERS
    MAX_LEN = 75          # max token length for inputs, targets and generation
    BATCH_SIZE = 2        # per-device train batch size
    GRAD_ACCUM = 8        # gradient accumulation steps
    LEARNING_RATE = 2e-5
    EPOCHS = 15
    FOLDS = 5             # number of GroupKFold splits
    SEED = 42

In [None]:
# ENHANCED MINING ENGINE (PDF + DICT + LEXICON)
def mine_enhanced_data():
    """Collect extra (transliteration, translation) pairs from the auxiliary CSVs.

    Two best-effort stages, each wrapped in its own ``try`` so that a failure in
    one (missing file, missing column, ...) is reported and does not stop the other:

    1. Word-level pairs from ``eBL_Dictionary.csv`` (``lemma`` -> ``meaning``) and
       ``OA_Lexicon_eBL.csv`` (``transliteration`` -> ``translation``).
    2. For every row of ``published_texts.csv`` that has a ``publication_catalog``
       value, search the ``page_text`` values of ``publications.csv`` for that
       value and keep the 500 characters starting at the first hit, provided the
       cleaned snippet is longer than 20 characters and contains "the" or "and".

    Returns:
        pandas.DataFrame with columns ``transliteration``, ``translation`` and
        ``source`` (``'dict'``, ``'lex'`` or ``'pdf'``). The frame is empty, but
        still has these columns, when nothing could be mined.
    """
    mined_data = []

    # Dictionary & Lexicon
    print("\nüìö [MINER] Extracting Dictionary/Lexicon vocabulary...")
    try:
        dict_df = pd.read_csv(os.path.join(CONFIG.DATA_DIR, "eBL_Dictionary.csv"))
        lex_df = pd.read_csv(os.path.join(CONFIG.DATA_DIR, "OA_Lexicon_eBL.csv"))

        # (frame, source column, target column, tag) for each word-level resource.
        word_sources = (
            (dict_df, 'lemma', 'meaning', 'dict'),
            (lex_df, 'transliteration', 'translation', 'lex'),
        )
        for frame, src_col, tgt_col, tag in word_sources:
            # Keep only rows where both sides are present.
            pairs = frame[[src_col, tgt_col]].dropna()
            mined_data.extend(
                {'transliteration': str(src), 'translation': str(tgt), 'source': tag}
                for src, tgt in zip(pairs[src_col], pairs[tgt_col])
            )
    except Exception as e:
        print(f"‚ö†Ô∏è Dictionary/Lexicon mining issue: {e}")

    # PDF Scanning
    print("‚õèÔ∏è [MINER] Scanning PDFs for hidden tablet translations...")
    try:
        pub_texts = pd.read_csv(os.path.join(CONFIG.DATA_DIR, "published_texts.csv"))
        publications = pd.read_csv(os.path.join(CONFIG.DATA_DIR, "publications.csv"))
        # NOTE(review): keying by pdf_name keeps only the LAST page_text for each
        # name. If publications.csv holds one row per page, every other page is
        # never searched - confirm against the data before relying on this.
        pdf_map = dict(zip(publications['pdf_name'], publications['page_text']))
        # Stringify once instead of once per (row, pdf) pair.
        page_texts = [str(text) for text in pdf_map.values()]

        # A pair without a transliteration is useless as training data, so rows
        # missing either field are skipped.
        targets = pub_texts.dropna(subset=['publication_catalog', 'transliteration'])
        for cat_id, translit in zip(targets['publication_catalog'].astype(str),
                                    targets['transliteration']):
            for text in page_texts:
                start = text.find(cat_id)
                if start == -1:
                    continue
                # Keep ASCII letters, digits, whitespace, '.' and ',' only.
                candidate = re.sub(r'[^A-Za-z0-9\s.,]', '', text[start:start+500])
                candidate = re.sub(r'\s+', ' ', candidate).strip()
                lowered = candidate.lower()
                # Crude "looks like English" check.
                if len(candidate) > 20 and ("the" in lowered or "and" in lowered):
                    mined_data.append({'transliteration': translit, 'translation': candidate, 'source': 'pdf'})
                    break  # first acceptable snippet wins for this tablet
    except Exception as e:
        print(f"‚ö†Ô∏è PDF mining issue: {e}")

    print(f"‚úÖ [MINER] Success! Extracted {len(mined_data)} extra training items.")
    # Explicit columns so that an empty result still has the expected schema.
    return pd.DataFrame(mined_data, columns=['transliteration', 'translation', 'source'])

In [None]:
# CLEANING & NORMALIZATION
def clean_text(text):
    """Normalize one transliteration or translation string.

    Steps, in order:
      1. Non-string input (e.g. NaN) becomes the empty string.
      2. ``[x]`` becomes ``<gap>`` and ``...`` becomes ``<big_gap>``.
      3. The punctuation characters in the regex below are removed.
      4. Whitespace runs collapse to a single space and the ends are trimmed.

    The gap markers must be rewritten BEFORE punctuation is stripped: the
    stripping regex removes ``[``, ``]`` and ``.``, so doing it first (as this
    function used to) meant neither marker could ever match.

    Args:
        text: Raw value from a data frame cell.

    Returns:
        The cleaned string ("" for non-string input).
    """
    if not isinstance(text, str):
        return ""
    text = text.replace('[x]', '<gap>').replace('...', '<big_gap>')
    # remove punctuation, not the sounds of the language
    text = re.sub(r'[!\?/:.À∫Àπ\[\]]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
class _TokenizedDataset(Dataset):
    """Map-style dataset over a mapping of equally long, already tokenized columns."""

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data["input_ids"])

    def __getitem__(self, i):
        # One tensor per column (input_ids, attention_mask, labels, ...).
        return {k: torch.tensor(v[i]) for k, v in self.data.items()}


def _tokenize_pairs(tokenizer, df):
    """Tokenize ``input_text``/``target_text`` of ``df`` into model inputs with labels.

    Both sides are truncated and padded to ``CONFIG.MAX_LEN``. Padding positions
    in the labels are replaced by -100.
    """
    # NOTE(review): "akk_Latn" does not look like one of the language codes NLLB
    # ships with - confirm how the tokenizer resolves it.
    tokenizer.src_lang = "akk_Latn"
    tokenizer.tgt_lang = "eng_Latn"

    # `text_target` replaces the deprecated `as_target_tokenizer()` context
    # manager; the encoded targets come back under the "labels" key.
    encoded = tokenizer(
        df['input_text'].tolist(),
        text_target=df['target_text'].tolist(),
        max_length=CONFIG.MAX_LEN,
        truncation=True,
        padding="max_length",
    )

    pad_id = tokenizer.pad_token_id
    encoded["labels"] = [
        [(tok if tok != pad_id else -100) for tok in label_ids]
        for label_ids in encoded["labels"]
    ]
    return encoded


def train_models():
    """Fine-tune one model per GroupKFold split and save each to ``./models/fold{k}``.

    Pipeline: load ``train.csv``, append the mined pairs, drop exact duplicate
    pairs, clean both text columns, drop rows that are empty after cleaning or
    whose target has 50 or more words, then train ``CONFIG.FOLDS`` models with
    folds grouped by ``oare_id`` (rows without one form their own group).

    Returns:
        tuple ``(saved_model_paths, train_df)``: the list of directories the
        fold models were saved to, and the prepared training frame (including
        the ``input_text``, ``target_text`` and ``group_id`` columns).
    """
    # LOAD DATA
    print("üìÇ [PREP] Loading data...")
    train_df = pd.read_csv(os.path.join(CONFIG.DATA_DIR, "train.csv"))

    # RUN MINER & CLEAN
    extra_df = mine_enhanced_data()
    if not extra_df.empty:
        train_df = pd.concat([train_df, extra_df], ignore_index=True)

    # DEDUPLICATION (Stop the model from memorizing duplicates)
    train_df = train_df.drop_duplicates(subset=['transliteration', 'translation'])

    print("üßπ [PREP] Cleaning text...")
    train_df['input_text'] = train_df['transliteration'].apply(clean_text)
    train_df['target_text'] = train_df['translation'].apply(clean_text)

    # Drop pairs with nothing left on either side after cleaning, and remove
    # "Long Junk" that confuses the model.
    has_text = (train_df['input_text'] != "") & (train_df['target_text'] != "")
    is_short = train_df['target_text'].str.split().str.len() < 50
    # .copy() so the column assignment below writes to an independent frame.
    train_df = train_df[has_text & is_short].copy()

    # SETUP TOKENIZER & FOLDS
    tokenizer = AutoTokenizer.from_pretrained(CONFIG.MODEL_PATH)
    gkf = GroupKFold(n_splits=CONFIG.FOLDS)
    # Rows without an oare_id (e.g. mined rows) fall back to their own index
    # value, which makes each of them a singleton group.
    train_df['group_id'] = train_df['oare_id'].fillna(train_df.index.to_series()).astype(str)

    saved_model_paths = []

    # TRAINING LOOP
    for fold, (train_idx, val_idx) in enumerate(gkf.split(train_df, groups=train_df['group_id'])):
        print(f"\nüöÄ [FOLD {fold+1}/{CONFIG.FOLDS}] Training Started...")
        train_sub, val_sub = train_df.iloc[train_idx], train_df.iloc[val_idx]

        # Fresh copy of the base model for every fold.
        model = AutoModelForSeq2SeqLM.from_pretrained(CONFIG.MODEL_PATH)

        train_data = _tokenize_pairs(tokenizer, train_sub)
        val_data = _tokenize_pairs(tokenizer, val_sub)

        # TRAINING ARGS
        # No checkpoints are written (save_strategy="no"); the model saved below
        # is whatever is in memory when training ends.
        args = TrainingArguments(
            output_dir=f"{CONFIG.OUTPUT_DIR}/fold{fold}",
            eval_strategy="epoch",
            save_strategy="no",
            learning_rate=CONFIG.LEARNING_RATE,
            per_device_train_batch_size=CONFIG.BATCH_SIZE,
            gradient_accumulation_steps=CONFIG.GRAD_ACCUM,
            num_train_epochs=CONFIG.EPOCHS,
            weight_decay=0.1,
            save_total_limit=1,
            load_best_model_at_end=False,
            metric_for_best_model="loss",
            fp16=torch.cuda.is_available(),
            remove_unused_columns=False,
            seed=CONFIG.SEED,
            report_to="none"
        )

        # TRAINER
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=_TokenizedDataset(train_data),
            eval_dataset=_TokenizedDataset(val_data),
            data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
        )

        trainer.train()

        # SAVE & CLEANUP
        save_path = f"./models/fold{fold}"
        os.makedirs(save_path, exist_ok=True)
        model.save_pretrained(save_path)
        tokenizer.save_pretrained(save_path)
        saved_model_paths.append(save_path)

        # GPU RAM Management: collect Python garbage first so the tensors are
        # actually released, then hand the cached blocks back to the driver.
        del model, trainer
        gc.collect()
        torch.cuda.empty_cache()

    return saved_model_paths, train_df

In [None]:
# HYBRID INFERENCE (SEARCH + AI)
def hybrid_predict(model_paths, memory_df):
    """Translate ``test.csv`` by exact-match lookup first, model generation second.

    For every test row the cleaned transliteration is looked up in a map built
    from ``memory_df`` (``input_text`` -> ``translation``). On a hit the stored
    translation is used verbatim; otherwise the model saved at ``model_paths[0]``
    generates one. The result is written to ``submission.csv`` with columns
    ``id`` and ``translation``.

    Args:
        model_paths: Saved model directories; only the first one is loaded.
        memory_df: Frame with ``input_text`` and ``translation`` columns, as
            returned by ``train_models``.

    Returns:
        None.
    """
    print("\nüîÆ [HYBRID] Starting Inference...")
    test_df = pd.read_csv(os.path.join(CONFIG.DATA_DIR, "test.csv"))

    # Filter Memory Map
    # na=False: a missing translation would otherwise give NaN in the mask and
    # make the `~` negation raise.
    blocked = memory_df['translation'].str.contains("Kanesh|dagger|testimony", case=False, na=False)
    memory_df = memory_df[~blocked]
    memory_map = dict(zip(memory_df['input_text'], memory_df['translation']))

    # Load the first fold's model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModelForSeq2SeqLM.from_pretrained(model_paths[0]).to(device)
    model.eval()
    # Same source-language setting as in training and in the single-sentence
    # demo cell, so inputs are encoded the same way everywhere.
    tokenizer = AutoTokenizer.from_pretrained(model_paths[0], src_lang="akk_Latn")
    # Loop-invariant: id of the token that forces English output.
    forced_bos = tokenizer.convert_tokens_to_ids("eng_Latn")

    final_preds = []

    # Prediction Loop
    for raw_text in test_df['transliteration']:
        clean_in = clean_text(raw_text)

        # EXACT SEARCH
        if clean_in in memory_map:
            final_preds.append(memory_map[clean_in])
            continue

        # AI PREDICTION (NLLB)
        inputs = tokenizer(clean_in, return_tensors="pt", padding=True).to(device)

        with torch.no_grad():
            gen = model.generate(
                **inputs,
                forced_bos_token_id=forced_bos,
                max_length=CONFIG.MAX_LEN,
                num_beams=2,
                repetition_penalty=3.5,
                length_penalty=1.0
            )

        pred = tokenizer.decode(gen[0], skip_special_tokens=True)
        final_preds.append(pred)

    # Save
    sub = pd.DataFrame({'id': test_df['id'], 'translation': final_preds})
    sub.to_csv('submission.csv', index=False)
    print("‚úÖ submission.csv saved successfully!")

In [None]:
# RUNNER
if __name__ == "__main__":
    # The pipeline is GPU-only: without CUDA, print the hint and do nothing else.
    if not torch.cuda.is_available():
        print("‚ùå Error: No GPU. Go to Settings -> Accelerator -> GPU P100.")
    else:
        print("üöÄ GPU Detected. Starting Pipeline...")
        # Training yields the saved fold directories and the prepared frame
        # that doubles as the exact-match memory for inference.
        paths, memory_data = train_models()
        # Lookup + generation over the test set; writes submission.csv.
        hybrid_predict(paths, memory_data)

In [None]:
# Smoke test: translate one sentence with the fold-0 model saved by train_models().
model_path = "./models/fold0"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load Tokenizer & Model (model in eval mode for inference)
tokenizer = AutoTokenizer.from_pretrained(model_path, src_lang="akk_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
model.eval()

test_sentence = "awƒ´lum damiq"

# Encode the sentence and look up the token id that forces English output.
inputs = tokenizer(test_sentence, return_tensors="pt").to(device)
forced_bos = tokenizer.convert_tokens_to_ids("eng_Latn")

# Generation settings, gathered in one place. Short output budget plus
# repetition controls to keep the model from rambling.
generation_kwargs = dict(
    forced_bos_token_id=forced_bos,
    max_new_tokens=25,
    num_beams=2,
    repetition_penalty=3.5,
    length_penalty=0.8,
    no_repeat_ngram_size=2,
    early_stopping=True,
)

with torch.no_grad():
    outputs = model.generate(**inputs, **generation_kwargs)

translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"\nüìú Akkadian: {test_sentence}")
print(f"üåç English AI: {translation}")