In [7]:
import os
import wandb
import torch
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from sacrebleu import BLEU
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    TrainerCallback
)
from evaluate import load

In [8]:
# Set environment variables for memory optimization
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [9]:
MODEL_NAME = "mesolitica/nanot5-small-malaysian-cased"  
DATA_PATH = "/workspace/malaysian_english_stage2_part1"  
OUTPUT_DIR = "/workspace/mt5_model"

MAX_SOURCE_LENGTH = 128
MAX_TARGET_LENGTH = 128
BATCH_SIZE = 64
GRADIENT_ACCUMULATION_STEPS = 1
LEARNING_RATE = 3e-4
EPOCHS = 3
LOGGING_STEPS = 50
EVAL_STEPS = 200
SAVE_STEPS = 500
          

In [10]:
wandb.login(key="6afe11f68615fd2c34a35aa78d4b43e89001c527")

# ✅ Initialize your W&B project and run name
wandb.init(
    project="malay-english-translation",
    name="mt5_translation_run_rtx5090",
    config={
        "model": MODEL_NAME,
        "batch_size": BATCH_SIZE,
        "epochs": EPOCHS,
        "learning_rate": LEARNING_RATE,
        "max_length": 128,
        "training": "seq2seq_trainer"
    }
)

print("✅ W&B login successful and project initialized!")



✅ W&B login successful and project initialized!


In [11]:
class LoggingCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics:
            print(f"📊 EVAL - Step {state.global_step}:")
            print(f"   • Loss: {metrics.get('eval_loss', 'N/A')}")
            print(f"   • BLEU: {metrics.get('eval_bleu', 'N/A')}")
    
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and 'loss' in logs:
            print(f"📈 TRAIN - Step {state.global_step}: Loss = {logs['loss']:.4f}")

In [12]:
print("📥 Creating dataset directly from parquet...")
parquet_path = os.path.join(DATA_PATH, "malaysian_translation_stage2_part1.parquet")

try:
    # Load the parquet file
    df = pd.read_parquet(parquet_path)
    print(f"✅ Loaded parquet: {len(df):,} rows, columns: {list(df.columns)}")
    
    # Create dataset directly without saving to disk
    full_dataset = Dataset.from_pandas(df)
    print(f"✅ Created dataset: {len(full_dataset):,} samples")
    
except Exception as e:
    print(f"❌ Error loading parquet: {e}")
    print("📁 Available files:")
    for file in os.listdir(DATA_PATH):
        print(f"   - {file}")
    raise

print(f"📊 Original dataset: {len(full_dataset):,} samples")

# Split into train/validation
split_dataset = full_dataset.train_test_split(test_size=0.1, seed=42)

final_dataset = DatasetDict({
    "train": split_dataset["train"],
    "test": split_dataset["test"]
})

print(f"🎯 Final dataset: {len(final_dataset['train']):,} train, {len(final_dataset['test']):,} test")

📥 Creating dataset directly from parquet...
✅ Loaded parquet: 173,799 rows, columns: ['src', 'tgt']
✅ Created dataset: 173,799 samples
📊 Original dataset: 173,799 samples
🎯 Final dataset: 156,419 train, 17,380 test


In [14]:
print("🔧 Loading model and tokenizer...")
torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

print("✅ Model and tokenizer loaded successfully!")

🔧 Loading model and tokenizer...
✅ Model and tokenizer loaded successfully!


In [None]:
#tokenization before trainning
def preprocess_function(examples):
    inputs = [str(x) for x in examples["src"]]
    targets = [str(x) for x in examples["tgt"]]
    
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_SOURCE_LENGTH,
        truncation=True,
        padding="max_length"
    )

    labels = tokenizer(
        targets,
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("🔹 Tokenizing dataset...")
tokenized_dataset = final_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=final_dataset["train"].column_names
)

print("✅ Tokenization completed!")

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8
)

🔹 Tokenizing dataset...


Map:   0%|          | 0/156419 [00:00<?, ? examples/s]

Map:   0%|          | 0/17380 [00:00<?, ? examples/s]

✅ Tokenization completed!


In [None]:
#BLUE = measure how clsoe to a machine translation sentense to human
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    
    try:
        # Clip predictions to valid token ID range
        predictions = np.clip(predictions, 0, tokenizer.vocab_size - 1)
        
        # Replace -100 with pad token id in labels
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        
        #Token id turn back to human readable sentences
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        
        # Filter and clean
        filtered_preds = []
        filtered_refs = []
        
        for pred, ref in zip(decoded_preds, decoded_labels):
            pred_clean = pred.strip()
            ref_clean = ref.strip()
            
            if pred_clean and ref_clean:
                filtered_preds.append(pred_clean)
                filtered_refs.append(ref_clean)
        
        if not filtered_preds:
            return {"bleu": 0.0, "gen_len": 0.0}
        
        # Compute BLEU
        bleu_scorer = BLEU(tokenize='13a')
        bleu_result = bleu_scorer.corpus_score(filtered_preds, [filtered_refs])
        
        # Compute average generation length
        prediction_lens = [len(pred.split()) for pred in filtered_preds]
        avg_gen_len = np.mean(prediction_lens) if prediction_lens else 0.0
        
        return {"bleu": bleu_result.score, "gen_len": avg_gen_len}
        
    except Exception as e:
        print(f"❌ BLEU calculation failed: {e}")
        return {"bleu": 0.0, "gen_len": 0.0}

In [None]:
# === TRAINING ARGUMENTS - OPTIMIZED FOR GPU UTILIZATION ===
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    
    # Training - OPTIMIZED FOR GPU
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=32,
    learning_rate=LEARNING_RATE,
    #prevent overfitting
    weight_decay=0.01,
    #start with stable training
    warmup_steps=500,
    max_grad_norm=1.0,
    
    # Mixed Precision for Better GPU Usage
    fp16=True,
    
    # Evaluation
    eval_strategy="steps",  
    eval_steps=EVAL_STEPS,  # 200
    predict_with_generate=True,
    
    save_strategy="steps",  
    save_steps=400,  
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
    
    # Logging
    logging_strategy="steps",  
    logging_steps=LOGGING_STEPS,
    report_to=["wandb"],
    run_name="nanot5-gpu-optimized",
    
    # Optimization - GPU FOCUSED
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    remove_unused_columns=True,
    label_names=["labels"],
    
    push_to_hub=False,
)

In [19]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[LoggingCallback()]
)

  trainer = Seq2SeqTrainer(


In [20]:
train_samples = len(tokenized_dataset["train"])
effective_batch_size = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
steps_per_epoch = train_samples // effective_batch_size
total_steps = steps_per_epoch * EPOCHS

print(f"🎯 TRAINING CONFIGURATION:")
print(f"   • Dataset: {train_samples:,} train samples")
print(f"   • Batch Size: {BATCH_SIZE} (effective: {effective_batch_size})")
print(f"   • Learning Rate: {LEARNING_RATE}")
print(f"   • Epochs: {EPOCHS}")
print(f"   • Total Steps: ~{total_steps:,}")

🎯 TRAINING CONFIGURATION:
   • Dataset: 156,419 train samples
   • Batch Size: 64 (effective: 64)
   • Learning Rate: 0.0003
   • Epochs: 3
   • Total Steps: ~7,332


In [21]:
print("🚀 STARTING TRAINING...")
torch.cuda.empty_cache()

training_result = trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1, 'pad_token_id': 1}.


🚀 STARTING TRAINING...


Step,Training Loss,Validation Loss,Bleu,Gen Len
200,3.214,3.008575,0.195091,12.564557
400,2.4364,2.366275,0.193646,13.697353
600,2.1199,2.076819,0.226611,13.719448
800,1.9612,1.935414,0.270594,14.12641
1000,1.892,1.83795,0.26806,14.171692
1200,1.7828,1.769829,0.267691,13.963291
1400,1.7431,1.722468,0.282387,14.043843
1600,1.71,1.683757,0.262629,13.844074
1800,1.6463,1.649525,0.298375,14.088435
2000,1.6386,1.612836,0.289047,14.06939


📈 TRAIN - Step 50: Loss = 9.2281
📈 TRAIN - Step 100: Loss = 5.2174
📈 TRAIN - Step 150: Loss = 3.7644
📈 TRAIN - Step 200: Loss = 3.2140
📊 EVAL - Step 200:
   • Loss: 3.008574962615967
   • BLEU: 0.19509063968246593
📈 TRAIN - Step 250: Loss = 2.8953
📈 TRAIN - Step 300: Loss = 2.6861
📈 TRAIN - Step 350: Loss = 2.5537
📈 TRAIN - Step 400: Loss = 2.4364
📊 EVAL - Step 400:
   • Loss: 2.3662750720977783
   • BLEU: 0.19364599542687866
📈 TRAIN - Step 450: Loss = 2.3615
📈 TRAIN - Step 500: Loss = 2.2530
📈 TRAIN - Step 550: Loss = 2.1508
📈 TRAIN - Step 600: Loss = 2.1199
📊 EVAL - Step 600:
   • Loss: 2.0768187046051025
   • BLEU: 0.22661080403300918
📈 TRAIN - Step 650: Loss = 2.0528
📈 TRAIN - Step 700: Loss = 2.0468
📈 TRAIN - Step 750: Loss = 1.9769
📈 TRAIN - Step 800: Loss = 1.9612
📊 EVAL - Step 800:
   • Loss: 1.9354138374328613
   • BLEU: 0.270594482571157
📈 TRAIN - Step 850: Loss = 1.9357
📈 TRAIN - Step 900: Loss = 1.9100
📈 TRAIN - Step 950: Loss = 1.8958
📈 TRAIN - Step 1000: Loss = 1.8920
📊 E

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


In [22]:
print("💾 Saving model...")
trainer.save_model()
print(f"📁 Model saved to: {OUTPUT_DIR}")

💾 Saving model...
📁 Model saved to: /workspace/mt5_model


In [23]:
print("🔍 Running final evaluation...")
final_metrics = trainer.evaluate()
print(f"📊 Final BLEU score: {final_metrics.get('eval_bleu', 0):.2f}")

wandb.finish()
print("✅ TRAINING COMPLETED!")

🔍 Running final evaluation...


📊 EVAL - Step 7335:
   • Loss: 1.4450196027755737
   • BLEU: 0.3273690020174198
📊 Final BLEU score: 0.33


0,1
eval/bleu,▁▁▃▅▅▅▆▅▆▆▆▅█▇▆▇▆▇▇▆▇█▇▇▇▆▇▇▇▇█▆▇▇▇▇█
eval/gen_len,▁▆▆██▇▇▇███▇█▇▇▇▇█▇▇▇█▇▇▇▇▇▇▇▇▇▇▇▇▇▇█
eval/loss,█▅▄▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▂▄▂▃▁▆▃▃▂▄▄▄▃▃▃▃▇▆█▅▅▃▃▅▂▂▂▅▅▆▆▇▅▆▅▅▆
eval/samples_per_second,▇▅▇▆█▃▆▆▇▅▅▅▆▆▆▆▂▃▁▄▄▅▆▄▇▇▇▄▄▃▃▂▄▃▄▄▃
eval/steps_per_second,▇▅▇▆█▃▆▆▇▅▅▅▆▆▆▆▂▃▁▄▄▆▆▄▇▇▇▄▄▃▃▂▄▃▄▄▃
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇████
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇█
train/grad_norm,█▂▁▁▂▁▁▂▁▁▁▁▁▁▁▁▂▂▁▁▁▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▃▇█▇▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁

0,1
eval/bleu,0.32737
eval/gen_len,14.0618
eval/loss,1.44502
eval/runtime,143.8056
eval/samples_per_second,120.858
eval/steps_per_second,3.783
total_flos,2.6340507751809024e+16
train/epoch,3
train/global_step,7335
train/grad_norm,5.86013


✅ TRAINING COMPLETED!
