In [None]:
! pip install evaluate sacrebleu

In [None]:
# === Imports & Global Config (Consolidated) ===
import os, json, re, unicodedata, math, copy, shutil
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from accelerate import Accelerator
from tqdm.auto import tqdm
import evaluate
import nltk
import sacrebleu  # optional direct usage if needed
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, get_scheduler

# Fetch the NLTK resources used later in the notebook. Each download is attempted
# independently so that one failure only produces a warning.
for pkg in ('punkt', 'wordnet', 'omw-1.4'):
    try:
        nltk.download(pkg, quiet=True)
    except Exception as e:
        print(f"Warning: failed to download {pkg}: {e}")

# Location of the parallel caption dataset (single source of truth for the path).
data_path = '../../data/paralel_cub_200_2011_captions_final-curated.json'
# Only report whether the file is present; nothing is raised here if it is missing.
print(f"File found at: {data_path}" if os.path.exists(data_path) else f"File not found at: {data_path}")

# All downstream cells rely on the imports & data_path defined here.

# Load data

In [None]:
# Text preprocessing utilities

def preprocess_text(text: str) -> str:
    """Normalize and lightly clean a sentence.

    Steps, in order:
    1. Unicode normalize (NFKC)
    2. Lowercase
    3. Remove URLs (anything starting with ``http`` or ``www`` up to the next whitespace)
    4. Remove unwanted characters (keep ASCII letters, digits, whitespace and ``. , ! ? ' -``)
    5. Replace the artifact ``1212`` with a space
    6. Collapse runs of whitespace into one space and strip both ends

    The artifact is replaced *before* whitespace is collapsed, so the space it
    leaves behind cannot produce double, leading or trailing spaces in the result.

    Args:
        text: Raw input sentence.

    Returns:
        The cleaned, lowercased sentence.
    """
    text = unicodedata.normalize("NFKC", text)
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s.,!?'-]", "", text)
    # Must run before the whitespace collapse below, otherwise the inserted
    # space survives into the output.
    text = text.replace("1212", " ")
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [None]:
# Load JSON file
# data_path is defined once in the imports/config cell at the top of the notebook;
# it is deliberately not redefined here so the two values cannot drift apart.
with open(data_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Flatten the pairs: every caption of every item becomes one
# (cleaned english, cleaned indonesian) tuple.
pairs = []
for item in data:
    for caption in item["captions"]:
        en = preprocess_text(caption["english"])
        idn = preprocess_text(caption["indo"])
        pairs.append((en, idn))

print(pairs[:3])

In [None]:
# Report how many (english, indonesian) pairs are currently held in `pairs`.
print(f"Total data: {len(pairs)}")

In [None]:
# Remove empty or very short sentences: keep a pair only when BOTH sides
# contain more than 5 whitespace-separated words.
pairs = [(en, idn) for en, idn in pairs if len(en.split()) > 5 and len(idn.split()) > 5]

# Deduplicate while preserving first-seen order.
# list(set(pairs)) would order the pairs by string hash, which is randomized per
# interpreter process. The seeded train/val/test split further down would then
# produce different splits after every kernel restart, so a run resumed from a
# checkpoint could be validated or tested on sentences it was trained on.
pairs = list(dict.fromkeys(pairs))

print(f"Total data setelah menghapus data duplikat: {len(pairs)}")

# Take only 0.1% of data for testing
# num_samples = int(len(pairs) * 0.001)  # 0.1% dari total data
# pairs = pairs[:num_samples]

# print(f"Total data setelah menghapus data duplikat: {len(pairs)}")
# print(f"Menggunakan {num_samples} sampel (0.1% dari total data)")

In [None]:
# Sanity check: count the pairs in which either side still contains the "1212" artifact.
error = "1212"
error_total = sum(1 for en, idn in pairs if error in en or error in idn)
print(f"data error total: {error_total}")

# Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split with a fixed seed:
#   1. hold out 10% of all pairs as the test set;
#   2. take 11.11% of the remainder as the validation set.
train_val_pairs, test_pairs = train_test_split(pairs, test_size=0.1, random_state=42)
train_pairs, val_pairs = train_test_split(train_val_pairs, test_size=0.1111, random_state=42)

print(
    f"Train size: {len(train_pairs)}",
    f"Val size: {len(val_pairs)}",
    f"Test size: {len(test_pairs)}",
    sep="\n",
)

# Load model and tokenizer

In [None]:
# Load tokenizer & base model (MarianMT - en->id)
# model_name is reused by the grid-search loop, which instantiates a fresh model
# from it for every hyperparameter combination.
model_name = "Helsinki-NLP/opus-mt-en-id"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, use_safetensors=True)

In [None]:
from torch.utils.data import Dataset

class TranslationDataset(Dataset):
    """Dataset that tokenizes one (source, target) sentence pair per item.

    Args:
        pairs: Indexable collection of ``(source_text, target_text)`` tuples.
        tokenizer: Callable tokenizer; it is invoked with the source text plus
            ``text_target``, ``max_length``, ``truncation`` and ``padding``.
        max_length: Length every sequence is truncated/padded to (default 384).
    """

    def __init__(self, pairs, tokenizer, max_length=384):
        self.pairs = pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Tokenize pair ``idx`` and return every tokenizer output field as a tensor."""
        source_text, target_text = self.pairs[idx]

        # Fixed-length padding gives every item the same shape.
        # NOTE(review): the target side is padded by the tokenizer as well; confirm
        # whether padded label positions should be masked (e.g. -100) before the loss.
        encoded = self.tokenizer(
            source_text,
            text_target=target_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )

        tensors = {}
        for field, values in encoded.items():
            tensors[field] = torch.tensor(values)
        return tensors

In [None]:
from torch.utils.data import DataLoader

# Wrap each split in a TranslationDataset (default max_length).
train_dataset = TranslationDataset(train_pairs, tokenizer)
val_dataset   = TranslationDataset(val_pairs, tokenizer)
test_dataset  = TranslationDataset(test_pairs, tokenizer)


# Initial loaders with batch_size=4; only the training loader is shuffled.
# train_loader and val_loader are re-created later with the grid-search batch size.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=4)
test_loader  = DataLoader(test_dataset, batch_size=4)

# Grid Search Configuration

In [None]:
# Hyperparameter grid (focused)
param_grid = {
    'learning_rate': [2e-5, 3e-5],
    'batch_size': [8],        # fixed for stability
    'num_epochs': [3, 5],
    'warmup_ratio': [0.1],
    'weight_decay': [0.01]
}

# Accumulators for the search; the best-so-far values start empty.
grid_search_results = []
best_bleu = 0.0
best_model = None
best_params = None

# Dataloaders with fixed batch size (will be re-created inside loop for each combo)
train_loader = DataLoader(train_dataset, batch_size=param_grid['batch_size'][0], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=param_grid['batch_size'][0])

print("Grid Search Configuration:")
print("-------------------------")
for param, values in param_grid.items():
    print(f"{param}: {values}")

# The grid-search loop iterates learning_rate x batch_size x num_epochs x weight_decay,
# so the reported total must multiply all four list lengths. warmup_ratio is not
# iterated (only its first value is used), so it does not contribute.
total_combinations = (
    len(param_grid['learning_rate'])
    * len(param_grid['batch_size'])
    * len(param_grid['num_epochs'])
    * len(param_grid['weight_decay'])
)
print(f"\nTotal kombinasi yang akan dicoba: {total_combinations}")
print("\nKonfigurasi tetap:")
print(f"- Batch size: {param_grid['batch_size'][0]}")
print(f"- Weight decay: {param_grid['weight_decay'][0]}")
print(f"- Warmup ratio: {param_grid['warmup_ratio'][0]}")

In [None]:
# TRAINING UTILITIES, LAYER FREEZING, AND GRID SEARCH LOOP

# Create the Accelerator with default settings and use the device it selects.
accelerator = Accelerator()
device = accelerator.device
print("Using device:", device)

# Load the evaluation metrics used to score translation quality.
bleu_metric = evaluate.load("sacrebleu")
meteor_metric = evaluate.load("meteor") 

# Set up the directories for per-combination checkpoints and for best-model snapshots.
root_dir = "./gridsearch_opus_manual_experiment_2"
checkpoint_root = os.path.join(root_dir, "checkpoints")
best_model_dir = os.path.join(root_dir, "best_model")
os.makedirs(checkpoint_root, exist_ok=True)
os.makedirs(best_model_dir, exist_ok=True)

# GLOBAL CHECKPOINT / RESUME CONFIGURATION
# - RESUME_ENABLED: resume from the latest checkpoint of the same hyperparameter combination
# - STRICT_EPOCH_MATCH: only resume when the configured number of epochs is identical
# - KEEP_LAST_N_CKPT: how many of the most recent checkpoints to keep (None = keep all)
#   NOTE(review): no pruning code that reads KEEP_LAST_N_CKPT is visible in this part of the notebook.
RESUME_ENABLED = True
STRICT_EPOCH_MATCH = True
# Disable automatic checkpoint deletion by default (keep all checkpoints)
KEEP_LAST_N_CKPT = None  # Set to None to disable retention pruning
# =======================================================================

# File that persists the best BLEU between runs (so the best score survives a kernel restart).
best_state_file = os.path.join(root_dir, 'best_global.json')

# Global holding the checkpoint directory of the previously finished combination;
# the grid-search loop reads and resets it at the start of each combination.
LAST_SUCCESSFUL_COMBO_DIR = None 


# === Definisi Fungsi Pembantu ===

def fmt_lr(lr: float) -> str:
    """Format a learning rate as a compact string for checkpoint folder names.

    - ``lr < 0.001``: scientific notation with the fewest mantissa digits that
      still parse back to exactly ``lr`` (``2e-05``, ``2.5e-05``).
    - otherwise: fixed-point with trailing zeros removed (``0.01``, ``1``); if six
      decimals cannot represent ``lr`` exactly, ``repr(lr)`` is used instead.

    Distinct learning rates therefore get distinct tags, so two grid points can
    never share checkpoint folders or resume from each other's checkpoints.
    Values that were already exact under the old one-digit format (e.g. ``2e-5``)
    keep the same tag, so existing folders still match.

    Args:
        lr: Learning rate.

    Returns:
        The tag string.
    """
    if lr < 1e-3:
        tag = f"{lr:.0e}"
        # A float round-trips with at most 17 significant digits (16 after the point).
        for digits in range(17):
            tag = f"{lr:.{digits}e}"
            if float(tag) == lr:
                break
        return tag

    fixed = f"{lr:.6f}"
    if float(fixed) != lr:
        # Six decimals would round the value; fall back to the exact representation.
        return repr(lr)
    return fixed.rstrip('0').rstrip('.')

def extract_epoch(name):
    """Return the epoch number encoded in a checkpoint folder name.

    The name is split on ``_`` and the first piece of the form ``ep<digits>``
    supplies the epoch. Returns -1 when no such piece exists or when the name
    cannot be processed, so the function is safe to use as a sort key.
    """
    try:
        return next(
            (
                int(piece[2:])
                for piece in name.split('_')
                if piece.startswith('ep') and piece[2:].isdigit()
            ),
            -1,
        )
    except Exception:
        return -1


def save_checkpoint(model, optimizer, epoch, lr, batch_size, weight_decay, num_epochs_cfg, train_loss, val_loss, bleu, meteor, history, patience_counter, ckpt_dir):
    """Write a training snapshot (model + optimizer state + metadata) to disk.

    The snapshot is stored as ``{ckpt_dir}/{folder}/checkpoint.pt``. The folder name
    encodes the hyperparameters (lr, batch size, weight decay, configured epochs),
    the epoch just finished, and the epoch's metrics.

    Args:
        model: Model to snapshot; it is unwrapped via the accelerator first.
        optimizer: Optimizer whose state dict is stored.
        epoch: Epoch number recorded in the checkpoint and in the folder name.
        lr, batch_size, weight_decay, num_epochs_cfg: Hyperparameters of the run.
        train_loss, val_loss, bleu, meteor: Metrics of this epoch.
        history: Per-epoch history object stored as-is.
        patience_counter: Early-stopping counter stored so a resume can continue it.
        ckpt_dir: Parent directory for this combination's checkpoints.
    """
    # Store the underlying model's weights, not those of any accelerator wrapper.
    base_model = accelerator.unwrap_model(model)
    lr_label = fmt_lr(lr)

    # Folder name = hyperparameter summary followed by metric summary.
    hyper_part = f"ckpt_lr{lr_label}_bs{batch_size}_wd{weight_decay}_maxep{num_epochs_cfg}_ep{epoch}"
    metric_part = f"_tl{train_loss:.4f}_vl{val_loss:.4f}_bleu{bleu:.2f}_meteor{meteor:.2f}"
    target_dir = os.path.join(ckpt_dir, hyper_part + metric_part)
    os.makedirs(target_dir, exist_ok=True)

    # Everything needed to resume training later.
    payload = {}
    payload['epoch'] = epoch
    payload['model_state_dict'] = base_model.state_dict()
    payload['optimizer_state_dict'] = optimizer.state_dict()
    payload['learning_rate'] = lr
    payload['batch_size'] = batch_size
    payload['weight_decay'] = weight_decay
    payload['configured_num_epochs'] = num_epochs_cfg
    payload['train_loss'] = train_loss
    payload['val_loss'] = val_loss
    payload['bleu'] = bleu
    payload['meteor'] = meteor
    payload['history'] = history  # losses and scores of the epochs so far
    payload['patience_counter'] = patience_counter  # lets early stopping continue after resume

    torch.save(payload, os.path.join(target_dir, 'checkpoint.pt'))
    print(f"Saved checkpoint: {target_dir}")

# Ganti fungsi load_latest_checkpoint di Cell 11 Anda dengan kode ini:

def load_latest_checkpoint(model, optimizer, ckpt_dir, lr, batch_size, weight_decay, num_epochs_cfg):
    """Find and load the latest compatible checkpoint for one hyperparameter combination.

    Logic:
    - If resume is disabled or ``ckpt_dir`` does not exist, return None.
    - Build the folder-name prefix from the hyperparameters (including the configured
      epoch count when STRICT_EPOCH_MATCH is set).
    - Collect matching folders and sort them by epoch, newest first.
    - Try to load them one by one until one succeeds.

    The weights are loaded into the *unwrapped* model, because ``save_checkpoint``
    stores the state dict of the unwrapped model; a wrapped model may use
    different parameter key names.

    If a load attempt fails after the model has been touched, the original weights
    are restored from a CPU snapshot. A failed resume therefore never leaves the
    model with partially loaded checkpoint weights while the caller starts over.

    Args:
        model: Model to load weights into (may be wrapped by the accelerator).
        optimizer: Optimizer to load state into.
        ckpt_dir: Directory holding this combination's checkpoint folders.
        lr, batch_size, weight_decay, num_epochs_cfg: Hyperparameters identifying the combination.

    Returns:
        The loaded checkpoint dict, or None if nothing could be loaded.
    """
    # Nothing to do when resume is disabled or the directory is missing.
    if not RESUME_ENABLED or not os.path.isdir(ckpt_dir):
        return None

    # Prefix of the checkpoint folder names to look for.
    lr_tag = fmt_lr(lr)
    if STRICT_EPOCH_MATCH:
        prefix = f"ckpt_lr{lr_tag}_bs{batch_size}_wd{weight_decay}_maxep{num_epochs_cfg}_ep"
    else:
        prefix = f"ckpt_lr{lr_tag}_bs{batch_size}_wd{weight_decay}_"

    candidates = [d for d in os.listdir(ckpt_dir) if d.startswith(prefix)]
    if not candidates:
        return None

    # Newest epoch first, so the most recent checkpoint is tried first.
    candidates.sort(key=extract_epoch, reverse=True)

    # Mirror save_checkpoint: operate on the unwrapped model.
    target_model = accelerator.unwrap_model(model)
    pristine_weights = None  # CPU snapshot, taken lazily right before the first load attempt

    for cand in candidates:
        path = os.path.join(ckpt_dir, cand, 'checkpoint.pt')
        try:
            # SECURITY NOTE: weights_only=False unpickles arbitrary objects. That is
            # acceptable only because these files are written by save_checkpoint in
            # this notebook; never point this at checkpoints from an untrusted source.
            ckpt = torch.load(path, map_location='cpu', weights_only=False)

            # With STRICT_EPOCH_MATCH, also verify the epoch count stored in the file.
            if STRICT_EPOCH_MATCH and ckpt.get('configured_num_epochs') != num_epochs_cfg:
                continue

            if pristine_weights is None:
                pristine_weights = {
                    key: value.detach().cpu().clone()
                    for key, value in target_model.state_dict().items()
                }

            target_model.load_state_dict(ckpt['model_state_dict'])
            optimizer.load_state_dict(ckpt['optimizer_state_dict'])
            print(f"Resumed from checkpoint: {cand}")
            return ckpt
        except Exception as e:
            # Report the failure, then fall through to the next candidate.
            if "Unsupported global" in str(e) or "WeightsUnpickler error" in str(e):
                print(f"Failed to load {cand} due to PyTorch safety change. Skipping resume for this checkpoint.")
            else:
                print(f"Failed to load {cand}: {e}")
            # Undo any partial weight load so the model is back in its original state.
            if pristine_weights is not None:
                target_model.load_state_dict(pristine_weights)
            continue

    # Every candidate failed: the caller starts from scratch.
    return None

def save_best_model(model, tokenizer, params, metrics, save_dir):
    """Store a best-model snapshot in ``save_pretrained`` format.

    A new sub-folder of ``save_dir`` is created whose name encodes the
    hyperparameters and metrics. It receives:
    - the model files written by ``save_pretrained``
    - the tokenizer files
    - ``best_params.json`` with the hyperparameters and metrics at save time

    Earlier best-model folders are not removed; every new best gets its own folder.

    Args:
        model: Model to save; it is unwrapped via the accelerator first.
        tokenizer: Tokenizer saved next to the model.
        params: Dict with 'learning_rate', 'batch_size', 'weight_decay', 'num_epochs', 'epoch'.
        metrics: Dict with 'train_loss', 'val_loss', 'bleu', 'meteor'.
        save_dir: Parent directory for best-model folders.
    """
    base_model = accelerator.unwrap_model(model)
    lr_label = fmt_lr(params['learning_rate'])

    # Folder name = hyperparameter summary + metric summary.
    name_parts = (
        f"best_lr{lr_label}_bs{params['batch_size']}_wd{params['weight_decay']}_maxep{params['num_epochs']}_ep{params['epoch']}",
        f"_tl{metrics['train_loss']:.4f}_vl{metrics['val_loss']:.4f}_bleu{metrics['bleu']:.2f}_meteor{metrics['meteor']:.2f}",
    )
    dest = os.path.join(save_dir, "".join(name_parts))
    os.makedirs(dest, exist_ok=True)

    # Model and tokenizer go into the same folder.
    base_model.save_pretrained(dest)
    tokenizer.save_pretrained(dest)

    # Keep the hyperparameters and metrics alongside the weights for reference.
    summary = {'hyperparameters': params, 'metrics': metrics}
    with open(os.path.join(dest, 'best_params.json'), 'w') as f:
        json.dump(summary, f, indent=4)
    print(f"New best model saved to: {dest}")
# =========================================================================

# === Early Stopping Config (VAL LOSS based) ===
# Early stopping guards against overfitting.
# - early_stopping_patience: epochs tolerated without improvement before stopping
# - min_val_loss_improvement: smallest val-loss decrease that counts as an improvement
early_stopping_patience = 2
min_val_loss_improvement = 0.0002
min_bleu_improvement = 0.2

# Containers for grid-search results.
bleu_scores = {}
training_results = []

# Try to restore the best BLEU persisted by a previous run.
if os.path.exists(best_state_file):
    try:
        with open(best_state_file, 'r') as f:
            saved_best = json.load(f)
        best_bleu = saved_best.get('best_bleu', 0.0)
        best_params = saved_best.get('best_params', {})
        print(f"Loaded persisted best_bleu={best_bleu:.2f}")
    except Exception as e:
        print(f"Failed to load persisted best: {e}")

# Whatever happened above (file missing, load failed, or load succeeded), make sure
# the best-so-far globals exist without overwriting values that are already set.
globals().setdefault('best_bleu', 0.0)
globals().setdefault('best_params', {})
globals().setdefault('best_model', None)

# === Layer Freezing & Gradient Accumulation Configuration ===
# Layer freezing: layers that need no training are frozen to save GPU memory and time.
# - NUM_ENCODER_LAYERS_TO_TRAIN: number of top encoder layers that stay trainable
# - NUM_DECODER_LAYERS_TO_TRAIN: number of top decoder layers that stay trainable
# Gradient accumulation: gradients are accumulated over several steps before each update,
# giving a larger effective batch size without needing more GPU memory.
NUM_ENCODER_LAYERS_TO_TRAIN = 2
NUM_DECODER_LAYERS_TO_TRAIN = 6
gradient_accumulation_steps = 2  # effective batch size = batch_size * 2

# ============================================================
# MULAI LOOP GRID SEARCH UNTUK MENCOBA BERBAGAI KOMBINASI HYPERPARAMETER
# ============================================================
for lr in param_grid['learning_rate']:
    for batch_size in param_grid['batch_size']:
        for num_epochs in param_grid['num_epochs']:
            for weight_decay in param_grid['weight_decay']:
                
                # Simpan referensi folder kombinasi sebelumnya (untuk cleanup opsional)
                folder_to_delete_on_success = LAST_SUCCESSFUL_COMBO_DIR
                
                # Reset pointer global sebelum memulai iterasi kombinasi baru
                LAST_SUCCESSFUL_COMBO_DIR = None
                
                # Format learning rate menjadi tag yang bisa digunakan di nama folder
                lr_tag = fmt_lr(lr)
                print(f"\n=== Training: lr={lr} (tag {lr_tag}), batch_size={batch_size}, epochs={num_epochs}, weight_decay={weight_decay} ===")
                print(f"   >>> Gradient Accumulation Steps: {gradient_accumulation_steps} (Effective BS: {batch_size * gradient_accumulation_steps})")
                
                # === INISIALISASI MODEL ===
                # Muat model dasar dari HuggingFace (model akan diinisiasi ulang per kombinasi hyperparameter)
                model = AutoModelForSeq2SeqLM.from_pretrained(model_name, use_safetensors=True)
                

                # === Layer Freezing Implementation (Selective) ===
                # Strategi: bekukan layer bawah (general feature), latih layer atas (task-specific)
                # OPUS-MT (MarianMT) memiliki struktur: model.model.encoder.layers dan model.model.decoder.layers
                
                # 1. Membekukan Shared Embedding Layer
                # Embedding layer memetakan token ID menjadi vector, kurang perlu disesuaikan
                if hasattr(model, 'model') and hasattr(model.model, 'shared') and model.model.shared is not None:
                    for param in model.model.shared.parameters():
                        param.requires_grad = False
                    print("Shared Embedding Layer frozen.")

                # 2. Membekukan Layer Encoder Bawah
                # Encoder bawah berisi feature general (syntax, morphology), kurang spesifik untuk task
                # OPUS-MT: akses layers melalui model.model.encoder.layers (bukan model.encoder.block)
                if hasattr(model, 'model') and hasattr(model.model, 'encoder') and hasattr(model.model.encoder, 'layers'):
                    encoder_layers = model.model.encoder.layers
                    num_encoder_layers = len(encoder_layers)
                    layers_to_freeze_enc = num_encoder_layers - NUM_ENCODER_LAYERS_TO_TRAIN
                    # Loop untuk membekukan layer-layer bawah
                    for i in range(layers_to_freeze_enc):
                        for param in encoder_layers[i].parameters():
                            param.requires_grad = False
                    print(f"Encoder: {layers_to_freeze_enc} bottom layers frozen, {NUM_ENCODER_LAYERS_TO_TRAIN} top layers trainable.")

                # 3. Membekukan Layer Decoder Bawah
                # Decoder bawah juga mengandung feature general, layer atas lebih specific
                # OPUS-MT: akses layers melalui model.model.decoder.layers (bukan model.decoder.block)
                if hasattr(model, 'model') and hasattr(model.model, 'decoder') and hasattr(model.model.decoder, 'layers'):
                    decoder_layers = model.model.decoder.layers
                    num_decoder_layers = len(decoder_layers)
                    layers_to_freeze_dec = num_decoder_layers - NUM_DECODER_LAYERS_TO_TRAIN
                    # Loop untuk membekukan layer-layer bawah
                    for i in range(layers_to_freeze_dec):
                        for param in decoder_layers[i].parameters():
                            param.requires_grad = False
                    print(f"Decoder: {layers_to_freeze_dec} bottom layers frozen, {NUM_DECODER_LAYERS_TO_TRAIN} top layers trainable.")
                # =======================================

                # === INISIALISASI OPTIMIZER DAN SCHEDULER ===
                # Optimizer sekarang hanya akan menginisiasi parameter yang requires_grad=True (yang tidak dibekukan)
                optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
                
                # Buat dataloader baru untuk kombinasi batch_size ini
                train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
                val_loader = DataLoader(val_dataset, batch_size=batch_size)
                
                # Hitung jumlah total training steps untuk scheduler
                num_training_steps = num_epochs * len(train_loader)
                num_warmup_steps = int(param_grid['warmup_ratio'][0] * num_training_steps)
                
                # Learning rate scheduler: linear dari LR awal, turun ke 0 setelah warmup
                scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
                
                # Prepare model, optimizer, dataloader dengan accelerator untuk distributed training
                model, optimizer, train_loader, val_loader, scheduler = accelerator.prepare(model, optimizer, train_loader, val_loader, scheduler)
                
                # === SETUP CHECKPOINT DIRECTORY ===
                # Buat folder checkpoint untuk kombinasi hyperparameter ini
                combo_ckpt_dir = os.path.join(checkpoint_root, f"lr{lr_tag}_bs{batch_size}_wd{weight_decay}_maxep{num_epochs}")
                os.makedirs(combo_ckpt_dir, exist_ok=True)
                print(f"Checkpoint directory kombinasi: {combo_ckpt_dir}")
                
                # === COBA RESUME DARI CHECKPOINT ===
                # Load checkpoint terakhir jika ada dan kompatibel
                loaded_ckpt = load_latest_checkpoint(model, optimizer, combo_ckpt_dir, lr, batch_size, weight_decay, num_epochs)
                
                # --- PENGATURAN STARTING EPOCH ---
                # Tentukan epoch berapa untuk mulai training (resume atau mulai dari 0)
                if loaded_ckpt:
                    starting_epoch = loaded_ckpt['epoch']
                else:
                    starting_epoch = 0
                
                # Jika semua epoch sudah selesai sebelumnya, skip kombinasi ini
                if starting_epoch >= num_epochs:
                    print(f"Semua epoch ({num_epochs}) telah selesai sebelumnya. Melewati kombinasi ini.")
                    # Pindahkan folder ini ke antrian global (untuk dihapus di iterasi berikutnya)
                    LAST_SUCCESSFUL_COMBO_DIR = combo_ckpt_dir
                    continue # Lanjut ke kombinasi berikutnya
                
                # === INISIALISASI HISTORY DAN METRICS ===
                # Initialize history dan best metrics untuk kombinasi ini
                # Jika ada checkpoint, restore history; jika tidak, mulai dari kosong
                history = loaded_ckpt.get('history', {
                    'learning_rate': lr, 'batch_size': batch_size, 'weight_decay': weight_decay, 'num_epochs': num_epochs,
                    'train_losses': [], 'val_losses': [], 'bleu_scores': [], 'meteor_scores': []
                }) if loaded_ckpt else {
                    'learning_rate': lr, 'batch_size': batch_size, 'weight_decay': weight_decay, 'num_epochs': num_epochs,
                    'train_losses': [], 'val_losses': [], 'bleu_scores': [], 'meteor_scores': []
                }
                
                # Restore patience counter untuk early stopping
                patience_counter = loaded_ckpt.get('patience_counter', 0) if loaded_ckpt else 0
                
                # Restore best BLEU untuk kombinasi ini dari history (jika ada)
                best_combo_bleu = max(history['bleu_scores']) if history['bleu_scores'] else loaded_ckpt.get('bleu', 0.0) if loaded_ckpt else 0.0
                
                # Restore best validation loss untuk early stopping
                best_val_loss = min(history['val_losses']) if history['val_losses'] else float('inf')

                best_state_dict = None
                
                # ============================================================
                # LOOP EPOCH UNTUK TRAINING KOMBINASI INI
                # ============================================================
                for epoch in range(starting_epoch, num_epochs):
                    # === TRAINING PHASE ===
                    model.train()  # Set model ke training mode (enable dropout, batchnorm)
                    total_loss = 0.0
                    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
                    
                    # Kosongkan cache GPU agar tersedia untuk batch berikutnya
                    torch.cuda.empty_cache() 
                    
                    for step, batch in enumerate(progress_bar):
                        
                        # Tentukan apakah step ini final step (untuk gradient accumulation)
                        # Jika step ke-(gradient_accumulation_steps) atau step terakhir => lakukan update
                        is_final_step = (step + 1) % gradient_accumulation_steps == 0 or step == len(train_loader) - 1
                        
                        # Forward pass: hitung loss
                        outputs = model(**batch)
                        loss = outputs.loss
                        
                        # === Gradient Accumulation Logic ===
                        # Bagi loss dengan gradient_accumulation_steps agar gradient tidak terlalu besar
                        loss_for_backward = loss / gradient_accumulation_steps
                        # Backward pass dengan accelerator (handle distributed training)
                        accelerator.backward(loss_for_backward)
                        
                        # Tambahkan loss batch ini ke total (catat loss full, bukan loss yang dibagi)
                        total_loss += loss.item() 
                        
                        # Jika sudah waktu, lakukan optimizer step dan reset gradient
                        if is_final_step:
                            optimizer.step()  # Update parameter berdasarkan accumulated gradient
                            scheduler.step()  # Update learning rate
                            optimizer.zero_grad()  # Reset gradient untuk batch berikutnya
                        
                        # Update progress bar dengan loss dan effective batch size
                        progress_bar.set_postfix(loss=f"{loss.item():.4f}", eff_bs=f"{batch_size * gradient_accumulation_steps}")

                    # Hitung rata-rata training loss untuk epoch ini
                    avg_train_loss = total_loss / max(1, len(train_loader))
                    
                    # === VALIDATION PHASE ===
                    model.eval()  # Set model ke evaluation mode (disable dropout, batchnorm)
                    val_loss = 0.0
                    all_preds, all_labels = [], []  # Kumpulkan semua prediksi dan label untuk metric
                    
                    with torch.no_grad():  # Jangan hitung gradient di validation
                        for batch in val_loader:
                            # Forward pass (no loss untuk eval, tapi kita hitung juga untuk monitoring)
                            outputs = model(**batch)
                            v_loss = outputs.loss
                            val_loss += v_loss.item()
                            
                            # Generate translation (greedy decode)
                            gen_tokens = model.generate(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], max_length=tokenizer.model_max_length)
                            
                            # Decode generated tokens dan label ke text
                            preds = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
                            labels = tokenizer.batch_decode(batch['labels'], skip_special_tokens=True)
                            
                            # Kumpulkan untuk metric computation
                            all_preds.extend(preds)
                            all_labels.extend([[l] for l in labels])  # Wrap dalam list untuk format metric
                    
                    # Hitung rata-rata validation loss
                    avg_val_loss = val_loss / max(1, len(val_loader))
                    
                    # Hitung BLEU dan METEOR score
                    val_bleu = bleu_metric.compute(predictions=all_preds, references=all_labels)['score']
                    val_meteor = meteor_metric.compute(predictions=all_preds, references=all_labels)['meteor']
                    
                    # Simpan metrics ke history
                    history['train_losses'].append(avg_train_loss)
                    history['val_losses'].append(avg_val_loss)
                    history['bleu_scores'].append(val_bleu)
                    history['meteor_scores'].append(val_meteor)
                    
                    # Print informasi epoch
                    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val BLEU: {val_bleu:.2f} | Val METEOR: {val_meteor:.2f}")

                    # === SIMPAN CHECKPOINT BERKALA ===
                    save_checkpoint(model, optimizer, epoch+1, lr, batch_size, weight_decay, num_epochs, avg_train_loss, avg_val_loss, val_bleu, val_meteor, history, patience_counter, combo_ckpt_dir)

                    # === UPDATE BEST MODEL (PER KOMBINASI DAN GLOBAL) ===
                    # Cek apakah BLEU epoch ini lebih baik dari best BLEU kombinasi ini
                    if val_bleu > best_combo_bleu + 1e-9:
                        # BLEU improvement untuk kombinasi ini
                        improvement_bleu = val_bleu - best_combo_bleu
                        best_combo_bleu = val_bleu
                        # Simpan state dict untuk potential recovery
                        best_state_dict = copy.deepcopy(accelerator.unwrap_model(model).state_dict())
                        
                        # Cek apakah BLEU kombinasi ini lebih baik dari best BLEU global (dari semua run)
                        # Only persist to the GLOBAL best_model_dir if this combo's best exceeds the persisted global best
                        if best_combo_bleu > best_bleu:
                            # Update global best dan simpan model + metadata
                            best_bleu = best_combo_bleu
                            best_model = copy.deepcopy(best_state_dict)
                            # Simpan hyperparameter dan metric untuk best model
                            best_params = {'learning_rate': lr,'batch_size': batch_size,'num_epochs': num_epochs,'weight_decay': weight_decay,'bleu_score': best_combo_bleu,'meteor_score': val_meteor}
                            try:
                                # Simpan model ke disk
                                save_best_model(
                                    model, tokenizer,
                                    params={'learning_rate': lr,'batch_size': batch_size,'num_epochs': num_epochs,'weight_decay': weight_decay,'epoch': epoch+1},
                                    metrics={'train_loss': avg_train_loss,'val_loss': avg_val_loss,'bleu': val_bleu,'meteor': val_meteor},
                                    save_dir=best_model_dir,
                                )
                                # Persist best BLEU ke file JSON
                                with open(best_state_file, 'w') as f:
                                    json.dump({'best_bleu': best_bleu, 'best_params': best_params}, f, indent=4)
                                print(f"Persisted new global best BLEU: {best_bleu:.2f}")
                            except Exception as e:
                                print(f"Failed to persist best BLEU or save model: {e}")

                    # === EARLY STOPPING BERBASIS VAL LOSS ===
                    # Mekanisme early stopping: jika validation loss tidak improve, stop training
                    if best_val_loss == float('inf'):
                        # Epoch pertama: set baseline
                        best_val_loss = avg_val_loss
                        patience_counter = 0
                        print(f"[ValLoss] Baseline set to {best_val_loss:.4f}")
                    else:
                        # Hitung selisih val loss antara best dan current
                        loss_diff = best_val_loss - avg_val_loss
                        
                        if loss_diff >= min_val_loss_improvement:
                            # Improvement signifikan: reset patience
                            best_val_loss = avg_val_loss
                            patience_counter = 0
                            print(f"[ValLoss] Improved by {loss_diff:.4f} (>= {min_val_loss_improvement}) -> reset patience. New best {best_val_loss:.4f}")
                        elif avg_val_loss < best_val_loss:
                            # Improvement minor (di bawah threshold): increment patience
                            best_val_loss = avg_val_loss
                            patience_counter += 1
                            print(f"[ValLoss] Minor improvement {loss_diff:.4f} (< {min_val_loss_improvement}). Patience {patience_counter}/{early_stopping_patience}")
                        else:
                            # Tidak ada improvement: increment patience
                            patience_counter += 1
                            print(f"[ValLoss] No improvement (delta {-loss_diff:.4f}). Patience {patience_counter}/{early_stopping_patience}")

                    # Jika patience exceed threshold dan bukan epoch terakhir, stop training
                    if patience_counter >= early_stopping_patience and epoch < num_epochs - 1:
                        print("Early stopping triggered (val loss stagnation).\n")
                        break
                
                # --- ANTRIAN PEMBERSIHAN: Tandai folder ini untuk dihapus di iterasi berikutnya ---
                # Disabled: keep combo directories intact (opsional untuk cleanup)
                LAST_SUCCESSFUL_COMBO_DIR = combo_ckpt_dir 
                # --- SELESAI ---

                # === SIMPAN HASIL KOMBINASI INI ===
                # Append hasil training kombinasi ke list untuk report final
                training_results.append({
                    'learning_rate': lr,
                    'batch_size': batch_size,
                    'num_epochs': num_epochs,
                    'weight_decay': weight_decay,
                    'best_bleu': best_combo_bleu,
                    'final_bleu': history['bleu_scores'][-1] if history['bleu_scores'] else 0.0,
                    'train_losses': history['train_losses'],
                    'val_losses': history['val_losses'],
                    'bleu_scores': history['bleu_scores'],
                    'meteor_scores': history['meteor_scores'],
                    'best_epoch': (history['bleu_scores'].index(best_combo_bleu)+1) if history['bleu_scores'] else None,
                    'final_train_loss': history['train_losses'][-1] if history['train_losses'] else None,
                    'final_val_loss': history['val_losses'][-1] if history['val_losses'] else None
                })
                # Simpan BLEU score untuk hasil akhir
                bleu_scores[f"lr={lr}, bs={batch_size}, epochs={num_epochs}, wd={weight_decay}"] = best_combo_bleu

# ============================================================
# GRID SEARCH FINISHED - PRINT FINAL RESULTS
# ============================================================
print("\nBLEU Scores for each combination:")
print("--------------------------------")
# Rank every hyperparameter combination by its best BLEU, highest first.
sorted_scores = sorted(bleu_scores.items(), key=lambda x: x[1], reverse=True)
for params, score in sorted_scores:
    print(f"{params} | BLEU Score: {score:.2f}")

# === LOAD BEST MODEL FROM DISK ===
# Collect the subfolders of best_model_dir. The directory is checked first because
# os.listdir raises FileNotFoundError when it does not exist (e.g. nothing was saved).
best_subdirs = []
if os.path.isdir(best_model_dir):
    best_subdirs = [
        d for d in os.listdir(best_model_dir)
        if os.path.isdir(os.path.join(best_model_dir, d))
    ]

best_model_loaded = None
if best_subdirs:
    # os.listdir returns entries in arbitrary order, so "first" is not meaningful.
    # Pick the most recently modified subfolder instead.
    # NOTE(review): presumably the newest subfolder holds the latest global best —
    # confirm against how save_best_model lays out its output.
    selected = max(
        best_subdirs,
        key=lambda d: os.path.getmtime(os.path.join(best_model_dir, d)),
    )
    path = os.path.join(best_model_dir, selected)
    print(f"Best model folder: {path}")
    # Load the model weights from disk.
    best_model_loaded = AutoModelForSeq2SeqLM.from_pretrained(path, use_safetensors=True)
    # Move the model to the GPU when one is available.
    if torch.cuda.is_available():
        best_model_loaded = best_model_loaded.to('cuda')
else:
    print("No best model directory found yet.")

In [None]:
import gc

import torch

# Free up GPU memory.
# Run the garbage collector first: empty_cache() only returns cached blocks that no
# live tensor uses, so objects that are unreachable but not yet collected would
# otherwise keep their GPU memory allocated.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Training Results Analysis

In [None]:
# Build a summary table with one row per hyperparameter combination.
# Batch size and weight decay are included so that rows of the grid stay distinguishable.
results_df = pd.DataFrame([
    {
        'Learning Rate': res['learning_rate'],
        'Batch Size': res['batch_size'],
        'Weight Decay': res['weight_decay'],
        'Epochs': res['num_epochs'],
        'Best Epoch': res['best_epoch'],
        'Final Train Loss': res['final_train_loss'],
        'Final Val Loss': res['final_val_loss'],
        'Best BLEU Score': res['best_bleu'],
        'Final BLEU Score': res['final_bleu']
    }
    for res in training_results
])

if results_df.empty:
    print("Tidak ada hasil training.")
else:
    # Rank combinations by their best BLEU score. sort_values keeps the original
    # index labels, so each label is still that row's position in training_results.
    results_df = results_df.sort_values('Best BLEU Score', ascending=False)

    # Show the table.
    print("\nTraining Results Summary:")
    print("------------------------")
    display(results_df)

    # Top-ranked row, and the full training record it was built from.
    # The record is fetched by position instead of re-matching on lr/epochs/BLEU,
    # which could select a different combination that shares those values.
    best_row = results_df.iloc[0]
    best_result = training_results[int(results_df.index[0])]

    fig, (loss_ax, bleu_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # Plot 1: training and validation loss per epoch.
    loss_ax.plot(range(1, len(best_result['train_losses'])+1), best_result['train_losses'], 'b-', label='Train Loss')
    loss_ax.plot(range(1, len(best_result['val_losses'])+1), best_result['val_losses'], 'r-', label='Val Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title(f'Learning Curves (LR={best_result["learning_rate"]})')
    loss_ax.legend()

    # Plot 2: BLEU progression per epoch.
    bleu_ax.plot(range(1, len(best_result['bleu_scores'])+1), best_result['bleu_scores'], 'g-', label='BLEU Score')
    bleu_ax.set_xlabel('Epoch')
    bleu_ax.set_ylabel('BLEU Score')
    bleu_ax.set_title('BLEU Score Progression')
    bleu_ax.legend()

    fig.tight_layout()
    plt.show()

    print("\nBest Model Parameters (from global best BLEU):")
    print("--------------------------------------------")
    # best_params is only set once some run beats the global best, so it may be missing or empty.
    if 'best_params' in globals() and best_params:
        for param, value in best_params.items():
            print(f"{param}: {value}")
    else:
        print("best_params belum tersedia atau kosong.")

    # Extra recap of every combination, ordered by score.
    print("\nDaftar BLEU per kombinasi (urut skor):")
    for (k, v) in sorted(bleu_scores.items(), key=lambda x: x[1], reverse=True):
        print(f"{k} -> {v:.2f}")
    print(f"\nGlobal Best BLEU: {best_bleu:.2f}")

In [None]:
# === EXPORT TRAINING RESULTS WITH PER-EPOCH DETAIL ===
# Writes two CSV files into root_dir: one row per combination (summary) and one row
# per combination/epoch (detailed), then prints a preview and a few statistics.
import time
from datetime import datetime

print("\n" + "=" * 100)
print("EXPORTING TRAINING RESULTS TO CSV (dengan detail per epoch dan durasi)")
print("=" * 100)

# === 1. CREATE SUMMARY PER COMBINATION ===
# Columns are passed explicitly so the frame keeps its schema (and can be sorted and
# saved) even when training_results is empty.
summary_columns = [
    'Learning Rate', 'Batch Size', 'Weight Decay', 'Epochs Configured',
    'Best Epoch', 'Total Epochs Run', 'Final Train Loss', 'Final Val Loss',
    'Best BLEU Score', 'Final BLEU Score',
]
summary_df = pd.DataFrame(
    [
        {
            'Learning Rate': res['learning_rate'],
            'Batch Size': res['batch_size'],
            'Weight Decay': res['weight_decay'],
            'Epochs Configured': res['num_epochs'],
            'Best Epoch': res['best_epoch'],
            'Total Epochs Run': len(res['train_losses']),
            'Final Train Loss': res['final_train_loss'],
            'Final Val Loss': res['final_val_loss'],
            'Best BLEU Score': res['best_bleu'],
            'Final BLEU Score': res['final_bleu'],
        }
        for res in training_results
    ],
    columns=summary_columns,
)

# Sort by Best BLEU Score (descending)
summary_df = summary_df.sort_values('Best BLEU Score', ascending=False).reset_index(drop=True)

# Save to CSV
summary_csv_path = os.path.join(root_dir, "training_summary.csv")
summary_df.to_csv(summary_csv_path, index=False)
print(f"\n✅ Saved summary to: {summary_csv_path}")
print(f"   Total combinations: {len(summary_df)}")

# === 2. CREATE DETAILED PER-EPOCH CSV ===
detailed_columns = [
    'Combination Index', 'Learning Rate', 'Batch Size', 'Weight Decay',
    'Epochs Configured', 'Epoch', 'Train Loss', 'Val Loss',
    'BLEU Score', 'METEOR Score', 'Is Best Epoch',
]
detailed_rows = []

# The per-epoch lists are read straight from `res` rather than copied into
# notebook-level names: a name such as `bleu_scores` would overwrite the global
# dict of per-combination scores that the analysis cell reads.
for combo_idx, res in enumerate(training_results, 1):
    for epoch_idx, train_loss in enumerate(res['train_losses']):
        epoch_num = epoch_idx + 1
        detailed_rows.append({
            'Combination Index': combo_idx,
            'Learning Rate': res['learning_rate'],
            'Batch Size': res['batch_size'],
            'Weight Decay': res['weight_decay'],
            'Epochs Configured': res['num_epochs'],
            'Epoch': epoch_num,
            'Train Loss': train_loss,
            'Val Loss': res['val_losses'][epoch_idx],
            'BLEU Score': res['bleu_scores'][epoch_idx],
            'METEOR Score': res['meteor_scores'][epoch_idx],
            # Epochs are 1-based here, matching how best_epoch is stored.
            'Is Best Epoch': epoch_num == res['best_epoch'],
        })

# Create DataFrame from detailed rows
detailed_df = pd.DataFrame(detailed_rows, columns=detailed_columns)

# Save to CSV
detailed_csv_path = os.path.join(root_dir, "training_detailed_epochs.csv")
detailed_df.to_csv(detailed_csv_path, index=False)
print(f"\n✅ Saved detailed per-epoch results to: {detailed_csv_path}")
print(f"   Total rows (all combinations × epochs): {len(detailed_df)}")

# === 3. PRINT PREVIEW ===
# One constant drives both the heading and the slice so they cannot disagree.
summary_preview_rows = 10
print("\n" + "=" * 100)
print(f"SUMMARY TABLE (Top {summary_preview_rows} Best Combinations):")
print("=" * 100)
print(summary_df.head(summary_preview_rows).to_string())

print("\n" + "=" * 100)
print("DETAILED EPOCHS TABLE (First 20 rows - Sample):")
print("=" * 100)
print(detailed_df.head(20).to_string())

print("\n" + "=" * 100)
print("STATISTICS:")
print("=" * 100)
print(f"Total Combinations: {len(summary_df)}")
print(f"Total Epoch Records: {len(detailed_df)}")
if summary_df.empty:
    # Without any rows there is no best combination to report.
    print("No training results available; statistics skipped.")
else:
    print(f"Best BLEU Score Overall: {summary_df['Best BLEU Score'].max():.2f}")
    print(f"Average BLEU Score (per combo best): {summary_df['Best BLEU Score'].mean():.2f}")
    print("\nBest Hyperparameter Combination:")
    # summary_df is sorted by best BLEU (descending), so row 0 is the winner.
    best_combo = summary_df.iloc[0]
    print(f"  - Learning Rate: {best_combo['Learning Rate']}")
    print(f"  - Batch Size: {best_combo['Batch Size']}")
    print(f"  - Weight Decay: {best_combo['Weight Decay']}")
    print(f"  - Best Epoch: {best_combo['Best Epoch']}/{best_combo['Epochs Configured']}")
    print(f"  - BLEU Score: {best_combo['Best BLEU Score']:.2f}")

print("\n✅ CSV files saved successfully!")


In [None]:
# === ADVANCED: EPOCH TIMING OVERVIEW (Optional) ===
print("\n" + "=" * 100)
print("BONUS: EPOCH TIMING ANALYSIS")
print("=" * 100)

# The entries of training_results carry losses and scores only, no wall-clock
# measurements, so the time per epoch printed below is a fixed rough guess chosen
# from the number of epochs that ran. It is not derived from measured data.
if len(training_results) > 0:
    print("\nEstimated Timing Per Epoch (rough heuristic, not measured):")
    print("-" * 100)

    for combo_idx, res in enumerate(training_results, 1):
        combo_lr = res['learning_rate']
        num_epochs_cfg = res['num_epochs']
        num_epochs_run = len(res['train_losses'])

        if num_epochs_run == 0:
            # Nothing ran for this combination, so there is nothing to estimate.
            avg_epoch_time_estimate = "N/A"
        elif num_epochs_run > 1:
            avg_epoch_time_estimate = "~1-3 minutes"
        else:
            avg_epoch_time_estimate = "~1-2 minutes"

        print(f"\n  Combo {combo_idx} (LR={combo_lr:.0e}):")
        print(f"    - Epochs Run: {num_epochs_run}/{num_epochs_cfg}")
        print(f"    - Est. Time/Epoch: {avg_epoch_time_estimate}")
        print(f"    - Best BLEU: {res['best_bleu']:.2f}")

print("\n" + "=" * 100)
print("\n✅ All CSV files are ready for analysis!")
print(f"\nFiles saved in: {root_dir}")
print(f"  1. training_summary.csv - Summary per combination")
print(f"  2. training_detailed_epochs.csv - Detailed per-epoch metrics")


In [None]:
# === RELOAD THE EXPORTED CSV FILES AND PRINT THEM ===
banner = "=" * 100
print("\n" + banner)
print("READING SAVED CSV FILES")
print(banner)

# Locations of the two exports inside root_dir.
summary_csv_path = os.path.join(root_dir, "training_summary.csv")
detailed_csv_path = os.path.join(root_dir, "training_detailed_epochs.csv")

# --- Summary file: one row per combination, printed in full ---
if not os.path.exists(summary_csv_path):
    print(f"❌ File not found: {summary_csv_path}")
else:
    summary_loaded = pd.read_csv(summary_csv_path)
    print(f"\n✅ Loaded: {summary_csv_path}")
    print(f"   Shape: {summary_loaded.shape}")
    print(f"\nContent:\n{summary_loaded.to_string(index=True)}")

# --- Detailed file: one row per combination and epoch ---
if not os.path.exists(detailed_csv_path):
    print(f"❌ File not found: {detailed_csv_path}")
else:
    detailed_loaded = pd.read_csv(detailed_csv_path)
    print(f"\n✅ Loaded: {detailed_csv_path}")
    print(f"   Shape: {detailed_loaded.shape}")
    print(f"\nFirst 10 rows:\n{detailed_loaded.head(10).to_string(index=True)}")

    # Per-combination ranges of BLEU, METEOR and training loss.
    rule = "-" * 100
    print("\n" + rule)
    print("Summary Statistics per Combination:")
    print(rule)
    combo_column = detailed_loaded['Combination Index']
    for combo_idx in combo_column.unique():
        combo_data = detailed_loaded[combo_column == combo_idx]
        bleu_col = combo_data['BLEU Score']
        meteor_col = combo_data['METEOR Score']
        loss_col = combo_data['Train Loss']
        print(f"\nCombination {int(combo_idx)}:")
        print(f"  Learning Rate: {combo_data['Learning Rate'].iloc[0]:.0e}")
        print(f"  Total Epochs: {len(combo_data)}")
        print(f"  BLEU Range: {bleu_col.min():.2f} - {bleu_col.max():.2f}")
        print(f"  METEOR Range: {meteor_col.min():.4f} - {meteor_col.max():.4f}")
        print(f"  Loss Range: {loss_col.min():.6f} - {loss_col.max():.6f}")

print("\n" + banner)


In [None]:
# Test the best model: translate one sample caption and print the result.
# best_model_loaded stays None when no best-model folder was found, so fail early
# with a clear message instead of an AttributeError on `.generate`.
if best_model_loaded is None:
    raise ValueError(
        "best_model_loaded is not available. Run the training cell so that a best "
        "model is saved and loaded before running this test."
    )

test_text = "this bird has wings that are brown and has a big bill"
inputs = tokenizer(test_text, return_tensors="pt")

# Put the input tensors on whatever device the model is on, so the two always match.
inputs = {k: v.to(best_model_loaded.device) for k, v in inputs.items()}

# NOTE(review): this forces ">>id<<" as the first generated token. Confirm that this
# matches how the model was trained; if the token is not in the vocabulary,
# convert_tokens_to_ids presumably falls back to the unknown-token id.
translated_tokens = best_model_loaded.generate(**inputs, forced_bos_token_id=tokenizer.convert_tokens_to_ids(">>id<<"))
translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

print("\nTest Translation with Best Model:")
print(f"Original: {test_text}")
print(f"Translated: {translated_text}")

# Save Best Model

In [None]:
# Export the best model: load the in-memory best state_dict into a freshly
# instantiated base model, then write model, tokenizer and hyperparameters to disk.
if best_model is None:
    raise ValueError("best_model belum tersedia. Jalankan training section terlebih dahulu.")

_export_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, use_safetensors=True)
_export_model.load_state_dict(best_model)

# Output directory for the exported artifacts (created if missing).
save_dir = "../../data/bestmodel_opus-mt"
os.makedirs(save_dir, exist_ok=True)
for _artifact in (_export_model, tokenizer):
    _artifact.save_pretrained(save_dir)

# Store the winning hyperparameters and scores next to the weights.
hyperparameters_path = os.path.join(save_dir, 'hyperparameters.json')
with open(hyperparameters_path, 'w') as f:
    json.dump(best_params, f, indent=4)

print(f"Best model saved to: {save_dir}")
print(f"Best BLEU score: {best_params['bleu_score']:.2f}")
print("Best hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

# Drop the temporary references so the extra model copy can be freed.
del _artifact
del _export_model