In [1]:
!pip install -q transformers datasets evaluate sentencepiece wandb accelerate

In [2]:
!pip install hf_transfer



In [3]:
import os
import wandb
import torch
from datasets import load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
from evaluate import load



In [4]:
# Authenticate with Weights & Biases without embedding the secret in the notebook.
# wandb.login() with no arguments uses the WANDB_API_KEY environment variable or an
# existing netrc entry, and prompts interactively when neither is available.
# NOTE(review): an API key was previously committed in this cell; treat it as leaked
# and revoke it in the W&B account settings.
wandb.login()

# ✅ Initialize your W&B project and run name
# The config dict is only metadata logged with the run; it does not drive training.
wandb.init(
    project="malay-english-translation",
    name="mt5_translation_run_rtx5090",
    config={
        "model": "google/mt5-base",
        "batch_size": 20,
        "epochs": 5,
        "learning_rate": 2e-5,
        "max_length": 256,
        "dataset_size":"5.8M"
    }
)

print("✅ W&B login successful and project initialized!")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkclin01[0m ([33mkoihaha[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


✅ W&B login successful and project initialized!


In [5]:
# Base checkpoint and on-disk locations used by the mT5 cells below.
MODEL_NAME = "google/mt5-base"
DATA_PATH = "/workspace/malaysian_english"
OUTPUT_DIR = "/workspace/mt5_model"

# Tokenizer truncation lengths for source and target text.
MAX_SOURCE_LENGTH = 256
MAX_TARGET_LENGTH = 256
BATCH_SIZE = 20                     # per-device batch size
LEARNING_RATE = 2e-5                    
EPOCHS = 5                              
LOGGING_STEPS = 500                 # not referenced by the training arguments, which set logging_steps=2000 directly
SAVE_STEPS = 5000
EVAL_STEPS = 5000
GRADIENT_ACCUMULATION_STEPS = 3     # BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS = effective batch of 60
WARMUP_STEPS = 5000                     
LR_SCHEDULER_TYPE = "cosine"            

In [6]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
# Load the cleaned dataset from disk
print("🔹 Loading dataset from disk...")
dataset = load_from_disk(DATA_PATH)

# A single Dataset (not already a dict of splits) gets a 1% held-out test split;
# the fixed seed keeps the split reproducible across runs.
if not isinstance(dataset, dict):
    dataset = dataset.train_test_split(test_size=0.01, seed=42)
    print("✅ Split into train/test sets:", dataset)


🔹 Loading dataset from disk...


Loading dataset from disk:   0%|          | 0/30 [00:00<?, ?it/s]

✅ Split into train/test sets: DatasetDict({
    train: Dataset({
        features: ['src', 'tgt', 'prefix'],
        num_rows: 5761747
    })
    test: Dataset({
        features: ['src', 'tgt', 'prefix'],
        num_rows: 58200
    })
})


In [8]:
# Same checkpoint as the tokenizer loaded earlier; re-created so this cell can run on its own.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    """Tokenize a batch of parallel sentences for seq2seq training.

    Args:
        examples: batched mapping with ``src`` and ``tgt`` columns, as passed
            by ``Dataset.map(batched=True)``.

    Returns:
        The tokenizer output for the sources, with the target token ids
        stored under ``labels``.

    Sources are truncated to ``MAX_SOURCE_LENGTH`` but not padded here. The
    ``DataCollatorForSeq2Seq`` used for training pads each batch to its own
    longest sequence, so short sentences no longer occupy the full source
    length in the cached dataset or in the encoder.
    """
    # NOTE(review): the dataset also has a `prefix` column that is not used
    # here - confirm `src` already contains the task prefix.
    inputs = [str(x) for x in examples["src"]]
    targets = [str(x) for x in examples["tgt"]]

    # No padding argument: input_ids / attention_mask keep their natural length.
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_SOURCE_LENGTH,
        truncation=True,
    )

    # NOTE(review): targets are still padded to MAX_TARGET_LENGTH with the pad
    # token id, so padded positions are scored by the loss. Masking them with
    # -100 is the better end state, but compute_metrics must then map -100
    # back to the pad id before decoding - change both together.
    labels = tokenizer(
        targets,
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split with 8 worker processes; the original text columns are
# dropped so only the tokenizer outputs and labels remain.
print("🔹 Tokenizing dataset (this may take a while for 5.8M samples)...")
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=8,
    remove_columns=dataset["train"].column_names
)
print("✅ Tokenization complete!")

🔹 Tokenizing dataset (this may take a while for 5.8M samples)...
✅ Tokenization complete!


In [9]:
def print_gpu_memory():
    """Print allocated vs. total memory of CUDA device 0 in GB; does nothing without CUDA."""
    if not torch.cuda.is_available():
        return
    bytes_per_gb = 1024**3
    used_gb = torch.cuda.memory_allocated() / bytes_per_gb
    capacity_gb = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    print(f"💾 GPU Memory: {used_gb:.2f} GB / {capacity_gb:.2f} GB")

# Baseline reading before the model is loaded.
print("🔹 Loading model with memory optimizations...")
print_gpu_memory()

🔹 Loading model with memory optimizations...
💾 GPU Memory: 0.00 GB / 31.37 GB


In [10]:
import torch
import gc
import os

# Clear memory before training
torch.cuda.empty_cache()
gc.collect()

# Reduce memory fragmentation
# NOTE(review): earlier cells already query torch.cuda (print_gpu_memory), so CUDA has
# presumably been initialised by now; confirm the allocator still honours this variable
# when it is set this late, or move the assignment above the first torch.cuda call.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [11]:
# Load the model and build the data collator
# NOTE(review): device_map="auto" lets the loader decide weight placement; the trainer
# later reports the model as already placed on devices. Confirm this is intended for training.
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
)

# High-accuracy optimizations
model.gradient_checkpointing_enable()  
model.config.use_cache = False         
print_gpu_memory()

# Pads each batch at collation time; sequence lengths are rounded up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, 
    model=model,
    pad_to_multiple_of=8  
)

💾 GPU Memory: 2.17 GB / 31.37 GB


In [12]:
!pip install sacrebleu



In [13]:
# Evaluation metrics
bleu = load("sacrebleu")

def compute_metrics(eval_pred):
    """Decode generated ids and reference ids and score them with sacreBLEU.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays handed
            over by the trainer (one reference per example).

    Returns:
        ``{"bleu": <corpus-level sacreBLEU score>}``.
    """
    import numpy as np  # local import: numpy is not imported at the top of this part of the notebook

    predictions, labels = eval_pred
    # Some models return a tuple (generated ids, extra outputs); keep the ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Negative ids (-100) mark ignored/padded positions and cannot be decoded,
    # so map them to the pad token, which skip_special_tokens then drops.
    pad_id = tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions < 0, pad_id, predictions)
    labels = np.where(labels < 0, pad_id, labels)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # sacreBLEU expects a list of references per prediction.
    decoded_labels = [[ref] for ref in tokenizer.batch_decode(labels, skip_special_tokens=True)]

    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [14]:
# NOTE(review): transformers was already imported by an earlier cell, so this upgrade only
# takes effect after a kernel restart (see the note printed below); consider moving it into
# the first install cell so a fresh Run All uses one consistent version.
pip install transformers --upgrade

Note: you may need to restart the kernel to use updated packages.


In [15]:
from transformers import Seq2SeqTrainingArguments

# Training configuration for the mT5 run.
# NOTE(review): evaluation generates translations for the whole test split every
# EVAL_STEPS steps; with ~58k test rows this is likely slow - consider a smaller eval subset.
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,           # equal to EVAL_STEPS, so every checkpoint has an eval result
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    # bf16 instead of fp16: T5/mT5 activations are known to overflow the fp16 range,
    # which shows up as NaN/inf losses. bf16 keeps the fp32 exponent range.
    # NOTE(review): requires bf16-capable hardware; the run name suggests an RTX 5090.
    bf16=True,
    tf32=True,
    warmup_steps=WARMUP_STEPS,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    save_total_limit=2,  
    predict_with_generate=True,
    # Without an explicit limit, evaluation falls back to the model's generation
    # config (presumably a short default), truncating translations before BLEU is computed.
    generation_max_length=MAX_TARGET_LENGTH,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to=["wandb"],
    run_name="mt5_translation_run_rtx5090",
    dataloader_num_workers=4,        
    dataloader_prefetch_factor=1,    
    dataloader_pin_memory=False,     
    logging_steps=2000,              
    remove_unused_columns=True,
    resume_from_checkpoint=False,  
    # NOTE(review): checkpointing was already enabled directly on the model when it
    # was loaded (model.gradient_checkpointing_enable()); confirm this flag should stay False.
    gradient_checkpointing=False,    
    optim="adamw_torch_fused",       
    logging_first_step=False,
    dataloader_drop_last=True,
    skip_memory_metrics=True,        
)

In [16]:
# Trainer setup: wires the model, training arguments, tokenized train/test splits,
# collator and BLEU metric together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

The model is already on multiple devices. Skipping the move to device specified in `args`.


In [None]:
# Summarise the run configuration, then start training.
train_samples = len(tokenized_dataset["train"])
effective_batch_size = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
steps_per_epoch = train_samples // effective_batch_size
total_steps = steps_per_epoch * EPOCHS

summary_lines = [
    "🎯 HIGH ACCURACY TRAINING CONFIGURATION:",
    f"   • Dataset: {train_samples:,} samples",
    f"   • Batch Size: {BATCH_SIZE} (per device)",
    f"   • Effective Batch: {effective_batch_size} (accumulated)",
    f"   • Learning Rate: {LEARNING_RATE} (optimal for stability)",
    f"   • Epochs: {EPOCHS}",
    f"   • Total Steps: ~{total_steps:,}",
    f"   • Warmup: {WARMUP_STEPS} steps",
    f"   • Evaluation: Every {EVAL_STEPS} steps",
]
for summary_line in summary_lines:
    print(summary_line)

print_gpu_memory()

trainer.train()

🎯 HIGH ACCURACY TRAINING CONFIGURATION:
   • Dataset: 5,761,747 samples
   • Batch Size: 20 (per device)
   • Effective Batch: 60 (accumulated)
   • Learning Rate: 2e-05 (optimal for stability)
   • Epochs: 5
   • Total Steps: ~480,145
   • Warmup: 5000 steps
   • Evaluation: Every 5000 steps
💾 GPU Memory: 2.17 GB / 31.37 GB


Step,Training Loss,Validation Loss


In [None]:
# Persist the trained weights and the tokenizer side by side, then close the W&B run.
final_model_dir = f"{OUTPUT_DIR}/high_accuracy_model"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print(f"📁 Model saved to: {final_model_dir}")
wandb.finish()

In [None]:
import os
import wandb
import torch
import sacrebleu
import numpy as np
from datasets import load_dataset, load_from_disk, DatasetDict
from sacrebleu import BLEU
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from evaluate import load

# Set environment variables for memory optimization
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Checkpoint and paths for the small-model experiment.
# NOTE(review): OUTPUT_DIR is the same directory the mT5 run writes to, and the training
# arguments below set overwrite_output_dir=True - confirm the two runs should share it.
MODEL_NAME = "mesolitica/nanot5-small-malaysian-cased"  
DATA_PATH = "/workspace/malaysian_english_stage2_part1"  
OUTPUT_DIR = "/workspace/mt5_model"

# Tokenizer truncation lengths and optimisation hyper-parameters.
MAX_SOURCE_LENGTH = 64
MAX_TARGET_LENGTH = 64
BATCH_SIZE = 16
GRADIENT_ACCUMULATION_STEPS = 2   # effective batch = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS = 32
LEARNING_RATE = 5e-5
EPOCHS = 1
LOGGING_STEPS = 50
EVAL_STEPS = 200
SAVE_STEPS = 500                  # not used by the training arguments below, which need a multiple of EVAL_STEPS
          
# Authenticate with Weights & Biases without embedding the secret in the notebook.
# wandb.login() with no arguments uses the WANDB_API_KEY environment variable or an
# existing netrc entry, and prompts interactively when neither is available.
# NOTE(review): an API key was previously committed in this cell; treat it as leaked
# and revoke it in the W&B account settings.
wandb.login()

# ✅ Initialize your W&B project and run name
# The run name matches the run_name passed to the training arguments, and the logged
# max_length comes from the constant that actually drives tokenization.
wandb.init(
    project="malay-english-translation",
    name="nanot5-small-stable-run",
    config={
        "model": MODEL_NAME,
        "batch_size": BATCH_SIZE,
        "epochs": EPOCHS,
        "learning_rate": LEARNING_RATE,
        "max_length": MAX_SOURCE_LENGTH,
        "training": "seq2seq_trainer"
    }
)

print("✅ W&B login successful and project initialized!")

from transformers import TrainerCallback  # ← ADD THIS IMPORT

class DebugCallback(TrainerCallback):
    """Trainer callback that echoes evaluation metrics and training loss to stdout."""

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Print eval loss and BLEU for the current global step when metrics are present."""
        if not metrics:
            return
        print(f"🔍 EVAL DEBUG - Step {state.global_step}:")
        for label, key in (("Loss", "eval_loss"), ("BLEU", "eval_bleu")):
            print(f"   • {label}: {metrics.get(key, 'N/A')}")
    
    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print the training loss whenever a log entry carries a 'loss' value."""
        if not logs or 'loss' not in logs:
            return
        print(f"📈 TRAIN - Step {state.global_step}: Loss = {logs['loss']:.4f}")

# === DATA LOADING ===
def load_fresh_dataset():
    """Load the stage2-part1 training split from Hugging Face and prefix every source.

    At most the first 60,000 rows are kept, or fewer if the split is shorter.
    Each row's ``src`` gets a task prefix chosen by a character-range check
    on the source text.

    Returns:
        The processed ``datasets.Dataset``. If loading or processing raises,
        the error is printed and ``create_fallback_dataset()`` is returned instead.
    """
    print("📥 Loading fresh dataset from Hugging Face...")
    max_samples = 60000
    
    try:
        # Load directly from the source
        dataset = load_dataset(
            "mesolitica/Malaysian-Translation", 
            "stage2-part1",
            split='train'
        )
        
        print(f"📊 Original dataset: {len(dataset):,} samples")
        
        # Add prefixes (since we need this preprocessing)
        def add_prefix(example):
            """Prepend a translation-task prefix to ``example['src']``."""
            src = str(example['src'])
            # Simple language detection for prefix
            if any('\u4e00' <= char <= '\u9fff' for char in src):
                prefix = 'terjemah Cina ke Bahasa Melayu: '
            elif any('\u0b80' <= char <= '\u0bff' for char in src):
                prefix = 'terjemah Tamil ke Bahasa Melayu: '
            elif any(char in 'ڽڬڠݢۏڔڎڃ' for char in src):
                prefix = 'terjemah Jawi ke Bahasa Melayu: '
            else:
                prefix = 'terjemah Inggeris ke Bahasa Melayu: '
            
            example['src'] = prefix + src
            return example
        
        # Apply prefixes to the first 60k samples for faster processing. The bound is
        # clamped to the dataset size so a shorter split does not raise inside
        # select() and silently divert training onto the toy fallback data.
        print("🏷️ Adding language prefixes...")
        subset_size = min(max_samples, len(dataset))
        dataset = dataset.select(range(subset_size)).map(add_prefix)
        
        print(f"✅ Processed dataset: {len(dataset):,} samples")
        return dataset
        
    except Exception as e:
        # NOTE(review): the fallback is a tiny synthetic dataset; a run that ends up
        # here is only a smoke test, not a real training run.
        print(f"❌ Error loading from HF: {e}")
        return create_fallback_dataset()

def create_fallback_dataset():
    """Build a small in-memory English-to-Malay dataset for when HF loading fails.

    Fifteen hand-written sentence pairs are repeated 100 times, giving 1,500
    rows with ``src`` (prefixed English) and ``tgt`` (Malay) columns.
    """
    print("🔄 Creating fallback dataset...")

    from datasets import Dataset

    task_prefix = "terjemah Inggeris ke Bahasa Melayu: "
    # (English source, Malay target) pairs for immediate training
    sentence_pairs = [
        ("Hello, how are you?", "Hello, apa khabar?"),
        ("Good morning everyone", "Selamat pagi semua"),
        ("What is your name?", "Siapa nama awak?"),
        ("Thank you very much", "Terima kasih banyak-banyak"),
        ("Where is the restaurant?", "Mana restoran?"),
        ("I would like to order food", "Saya nak order makanan"),
        ("How much does this cost?", "Berapa harga ini?"),
        ("Can you help me please?", "Boleh tolong saya?"),
        ("What time is it now?", "Pukul berapa sekarang?"),
        ("I need to go to the hospital", "Saya perlu pergi ke hospital"),
        ("The weather is very nice today", "Cuaca hari ini sangat cantik"),
        ("How do I get to the airport?", "Macam mana nak pergi ke lapangan terbang?"),
        ("What is this for?", "Ini untuk apa?"),
        ("I don't understand", "Saya tidak faham"),
        ("Please speak slowly", "Tolong cakap perlahan-lahan"),
    ]
    repeats = 100  # 15 pairs x 100 = 1500 samples total

    # Repeating the whole list keeps the pairs in their original cyclic order.
    dataset = Dataset.from_dict({
        "src": [task_prefix + english for english, _ in sentence_pairs] * repeats,
        "tgt": [malay for _, malay in sentence_pairs] * repeats,
    })

    print(f"✅ Fallback dataset: {len(dataset):,} samples")
    return dataset

# Load the dataset
dataset = load_fresh_dataset()

# Split into train/test (10% held out, fixed seed for reproducibility)
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)

# Use appropriate sizes: cap at 50k train / 5k test, or fewer if the split is smaller
train_size = min(50000, len(split_dataset["train"]))
test_size = min(5000, len(split_dataset["test"]))

# Take the first train_size / test_size rows of each split.
train_subset = split_dataset["train"].select(range(train_size))
test_subset = split_dataset["test"].select(range(test_size))

final_dataset = DatasetDict({
    "train": train_subset,
    "test": test_subset
})

print(f"🎯 Final dataset: {len(train_subset):,} train, {len(test_subset):,} test")

print("🔧 Loading model and tokenizer...")

# Clear GPU memory first
torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the weights in full precision. This is the model object handed to the
# Seq2SeqTrainer below, and that trainer runs with fp16=False / bf16=False.
# Half-precision weights here would therefore be trained in pure fp16, with no
# mixed-precision handling, which is a common source of unstable or NaN losses.
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    dtype=torch.float32,
)

print("✅ Model and tokenizer loaded successfully!")

def check_training_data():
    """Print the first training pair (truncated to 100 characters) and its character lengths."""
    print("🔍 Checking training data samples...")

    first_row = final_dataset["train"][0]
    source, target = first_row["src"], first_row["tgt"]

    for label, text in (("📝 Input", source), ("🎯 Target", target)):
        print(f"{label}: {text[:100]}...")
    print(f"📊 Input length: {len(source)}, Target length: {len(target)}")

check_training_data()

def preprocess_function(examples):
    """Tokenize a batch of ``src``/``tgt`` pairs without padding.

    Args:
        examples: batched mapping with ``src`` and ``tgt`` columns, as passed
            by ``Dataset.map(batched=True)``.

    Returns:
        The tokenizer output for the sources, with the target token ids
        stored under ``labels``.

    Sequences are only truncated here. Padding is left to the
    ``DataCollatorForSeq2Seq`` defined below, which pads each batch to its own
    longest sequence and fills label padding with -100, so padded target
    positions are ignored by the loss instead of being trained on.
    ``compute_metrics`` in this cell maps -100 back to the pad id before decoding.
    """
    inputs = [str(x) for x in examples["src"]]
    targets = [str(x) for x in examples["tgt"]]
    
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_SOURCE_LENGTH,
        truncation=True,
    )

    labels = tokenizer(
        targets,
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both splits; the raw text columns are dropped so only tokenizer
# outputs and labels remain.
print("🔹 Tokenizing dataset...")
tokenized_dataset = final_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=final_dataset["train"].column_names
)

print("✅ Tokenization completed!")

# === DATA COLLATOR ===
# Pads each batch at collation time; sequence lengths are rounded up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8
)

def compute_metrics(eval_pred):
    """Compute corpus BLEU and average generation length for an evaluation pass.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays from the trainer.

    Returns:
        ``{"bleu": float, "gen_len": float}``. Both are 0.0 when there is no
        usable reference or when scoring raises (the error is printed).

    Empty predictions are kept and scored against their reference. Dropping
    them would remove exactly the examples the model failed on and make BLEU
    look better than it is.
    """
    predictions, labels = eval_pred
    # Some models return a tuple (generated ids, extra outputs); keep the ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    
    try:
        pad_id = tokenizer.pad_token_id

        # Negative ids (-100 padding) become the pad token; the upper clip keeps
        # any id beyond the tokenizer vocabulary decodable.
        predictions = np.where(predictions < 0, pad_id, predictions)
        predictions = np.clip(predictions, 0, tokenizer.vocab_size - 1)
        
        # Replace -100 with pad token id in labels
        labels = np.where(labels != -100, labels, pad_id)
        
        # Decode predictions and labels
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        
        # Keep every pair that has a reference; an empty prediction stays in
        # and simply contributes no n-gram matches.
        filtered_preds = []
        filtered_refs = []
        for pred, ref in zip(decoded_preds, decoded_labels):
            ref_clean = ref.strip()
            if ref_clean:
                filtered_preds.append(pred.strip())
                filtered_refs.append(ref_clean)
        
        if not filtered_refs:
            return {"bleu": 0.0, "gen_len": 0.0}
        
        # Compute BLEU
        bleu_scorer = BLEU(tokenize='13a')
        bleu_result = bleu_scorer.corpus_score(filtered_preds, [filtered_refs])
        
        # Compute average generation length (whitespace-separated words)
        prediction_lens = [len(pred.split()) for pred in filtered_preds]
        avg_gen_len = float(np.mean(prediction_lens)) if prediction_lens else 0.0
        
        print(f"✅ BLEU: {bleu_result.score:.4f}, Samples: {len(filtered_preds)}, Avg Len: {avg_gen_len:.1f}")
        
        return {"bleu": bleu_result.score, "gen_len": avg_gen_len}
        
    except Exception as e:
        # Best effort: a metric failure must not abort training.
        # NOTE(review): with metric_for_best_model="bleu" a swallowed error
        # looks like a BLEU of 0 - watch the log for this message.
        print(f"❌ BLEU calculation failed: {e}")
        return {"bleu": 0.0, "gen_len": 0.0}

def check_model_outputs():
    """Greedy-decode one fixed prompt on the GPU and report whether the output looks usable.

    Returns:
        True when the decoded text is non-empty and differs from the prompt,
        False otherwise.
    """
    print("🔍 Checking model outputs...")
    
    # The model has to be on the GPU before generating
    model.to("cuda")
    print("✅ Model moved to GPU")
    
    # One short, fixed prompt is enough for a smoke test
    test_input = "terjemah Inggeris ke Bahasa Melayu: Hello world"
    encoded = tokenizer(test_input, return_tensors="pt", max_length=128, truncation=True)
    inputs = encoded.to("cuda")
    
    with torch.no_grad():
        # num_beams=1 -> greedy search, the fastest option for a quick test
        generated_ids = model.generate(inputs.input_ids, max_length=50, num_beams=1)
    
    prediction = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print(f"📝 Test input: {test_input}")
    print(f"🎯 Model output: {prediction}")
    
    # An empty output or a verbatim echo of the prompt counts as a failure
    if not prediction.strip() or prediction == test_input:
        print("❌ Model output issue detected")
        return False

    print("✅ Model is producing different outputs")
    return True

# Run the check
check_model_outputs()

# Training configuration for the nanoT5-small run.
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    
    # TRAINING - ULTRA STABLE
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=8,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,           # Mild regularization
    # NOTE(review): with at most 50,000 training rows and an effective batch of 32,
    # one epoch is roughly 1,560 optimizer steps, so 1000 warmup steps cover most
    # of the run - confirm this is intended.
    warmup_steps=1000,           # More warmup
    max_grad_norm=0.5,           # Conservative gradient clipping
    
    # Evaluation
    eval_strategy="steps",  
    eval_steps=EVAL_STEPS,       # 200
    predict_with_generate=True,
    # Without an explicit limit, evaluation falls back to the model's generation
    # config (presumably a short default), truncating translations before BLEU is computed.
    generation_max_length=MAX_TARGET_LENGTH,
    
    # Saving - save_steps must be a multiple of eval_steps for load_best_model_at_end,
    # so it is derived from EVAL_STEPS instead of being a separate literal.
    save_strategy="steps",  
    save_steps=2 * EVAL_STEPS,   # 400
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
    
    # Logging
    logging_strategy="steps",  
    logging_steps=LOGGING_STEPS,
    report_to=["wandb"],
    run_name="nanot5-small-stable-run",
    
    # Optimization - STABILITY FIRST
    fp16=False,                 # NO mixed precision
    bf16=False,                 # NO mixed precision  
    dataloader_num_workers=2,
    dataloader_pin_memory=False,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    
    push_to_hub=False,
)

# Build the trainer. The tokenizer is passed as `processing_class`, the same keyword
# the mT5 trainer earlier in this notebook uses; the older `tokenizer=` keyword is
# deprecated in recent transformers releases.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[DebugCallback()]  # Add debug callback
)

# Derive the approximate number of optimizer steps and print the run summary.
train_samples = len(tokenized_dataset["train"])
effective_batch_size = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
steps_per_epoch = train_samples // effective_batch_size
total_steps = steps_per_epoch * EPOCHS

config_lines = [
    "🎯 FINAL TRAINING CONFIGURATION:",
    f"   • Model: {MODEL_NAME}",
    f"   • Dataset: {train_samples:,} train, {len(tokenized_dataset['test']):,} test",
    f"   • Batch Size: {BATCH_SIZE} (effective: {effective_batch_size})",
    f"   • Learning Rate: {LEARNING_RATE}",
    f"   • Epochs: {EPOCHS}",
    f"   • Total Steps: ~{total_steps:,}",
    f"   • Evaluation: Every {EVAL_STEPS} steps",
    "   • Target BLEU: >10.0",
]
for config_line in config_lines:
    print(config_line)

# Clear everything first
import torch
import gc
torch.cuda.empty_cache()
gc.collect()

# NOTE(review): `trainer` was constructed above with the previously loaded model object.
# Rebinding `tokenizer` and `model` here does not change what the trainer trains, so the
# model loaded below is only exercised by quick_stability_check(). If this reload is meant
# to be the trained model, it has to happen before the trainer is built.
print("🔄 Loading nanot5-small with stability checks...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load in FP32 for maximum stability
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    dtype=torch.float32,  # FP32 ONLY for stability
)

print("✅ nanot5-small loaded successfully!")
print(f"📊 Model parameters: {model.num_parameters():,}")

def quick_stability_check():
    """Run one forward pass with labels and one greedy generation as a smoke test.

    Returns:
        True when the forward loss is a float strictly between 0 and 1000.
        False otherwise, including when any step raises (the error is printed).
    """
    print("🔍 Quick stability check for T5...")
    
    try:
        # The model must live on the GPU for this check
        model.to("cuda")
        
        # A fixed source/target pair (the forward pass needs labels to produce a loss)
        test_input = "terjemah Inggeris ke Bahasa Melayu: Hello world"
        test_target = "Halo dunia"
        
        # Both sides are tokenized with identical settings
        tokenize_kwargs = dict(return_tensors="pt", max_length=64, truncation=True, padding="max_length")
        inputs = tokenizer(test_input, **tokenize_kwargs)
        labels = tokenizer(test_target, **tokenize_kwargs)
        
        # Drop token_type_ids if the tokenizer produced them
        for encoding in (inputs, labels):
            encoding.pop('token_type_ids', None)
        
        # Move tensors to the GPU and attach the target ids as labels
        inputs = {name: tensor.to("cuda") for name, tensor in inputs.items()}
        inputs['labels'] = labels['input_ids'].to("cuda")
        
        # Forward pass
        with torch.no_grad():
            outputs = model(**inputs)
            if outputs.loss is not None:
                loss = outputs.loss.item()
            else:
                loss = "N/A"
        
        print(f"📊 Forward loss: {loss}")
        
        # Generation only needs the input ids
        with torch.no_grad():
            generated = model.generate(inputs['input_ids'], max_length=32, num_beams=1)
            prediction = tokenizer.decode(generated[0], skip_special_tokens=True)
        
        print(f"🎯 Test generation: {prediction}")
        
        # Anything other than a finite, positive, moderate loss is flagged
        loss_is_sane = isinstance(loss, float) and 0 < loss < 1000
        if not loss_is_sane:
            print("❌ Loss issue detected")
            return False

        print("✅ Stability check passed!")
        return True
            
    except Exception as e:
        print(f"❌ Error during stability check: {e}")
        return False

# Run the check
stability_ok = quick_stability_check()
if stability_ok:
    print("🎉 Ready for stable training!")
else:
    print("🚨 Stability issues detected!")

import pandas as pd
from datasets import Dataset, DatasetDict

dataset_path = "/workspace/malaysian_english_stage2_part1"

# Rebuild a save_to_disk-style dataset next to the original directory from the
# parquet file inside it; any failure is reported and does not stop the notebook.
print("🔄 Attempting to recreate from parquet file...")
try:
    parquet_path = os.path.join(dataset_path, "malaysian_translation_stage2_part1.parquet")
    
    if not os.path.exists(parquet_path):
        # Missing file: show what the directory contains to help locate it
        print(f"❌ Parquet file not found at: {parquet_path}")
        print("📁 Available files:")
        for file in os.listdir(dataset_path):
            print(f"   - {file}")
    else:
        print(f"📖 Loading parquet file: {parquet_path}")
        df = pd.read_parquet(parquet_path)
        print(f"✅ Loaded parquet: {len(df):,} rows, columns: {list(df.columns)}")
        
        # Wrap the frame as a Hugging Face dataset
        full_dataset = Dataset.from_pandas(df)
        print(f"✅ Created dataset: {len(full_dataset):,} samples")
        
        # Written under a "_fixed" suffix so the source directory stays untouched
        print("💾 Saving as proper dataset...")
        full_dataset.save_to_disk(dataset_path + "_fixed")
        print("✅ Dataset saved successfully!")
            
except Exception as e:
    print(f"❌ Error recreating from parquet: {e}")

print("🚀 STARTING TRAINING...")
torch.cuda.empty_cache()

# Train exactly once: a single call runs the full schedule. A second train() call
# would start another complete run on top of the already-trained weights.
training_result = trainer.train()

# === SAVE MODEL ===
# save_model() with no argument writes to the trainer's output_dir (OUTPUT_DIR).
print("💾 Saving model...")
trainer.save_model()

print(f"📁 Model saved to: {OUTPUT_DIR}")

# === FINAL EVALUATION ===
print("🔍 Running final evaluation...")
final_metrics = trainer.evaluate()
print(f"📊 Final BLEU score: {final_metrics.get('eval_bleu', 0):.2f}")

wandb.finish()
print("✅ TRAINING COMPLETED!")

Next step: fix loading of the full dataset and remove the debug code.