# Tudlo: Cebuano Math Chatbot (Upgraded)

This notebook implements the **Tudlo** system for a Cebuano Math Chatbot, based on the research paper *"Tudlo: Parameter-Efficient Fine-tuning of a Multilingual Transformer for Cebuano Mathematics Education"*.

## Key Upgrades Implemented:
1.  **Model**: Upgraded from `mT5-small` to **`mT5-base`** for enhanced reasoning capabilities.
2.  **Data Quality**: Implemented an **Automated Back-Translation Loop** using **GoogleTrans** (English -> Cebuano -> English) and **Semantic Similarity Filtering** to ensure high-quality training data.
3.  **Evaluation**: Added **Numerical Exact Match** metric alongside standard text generation.

## 1. Dependencies
Installing necessary libraries for QLoRA, translation, and training.
**Note**: We use `googletrans==4.0.0-rc1` for API stability.

In [None]:
!pip install -q transformers peft bitsandbytes accelerate datasets sentencepiece sacremoses sentence-transformers googletrans==4.0.0-rc1

## 2. Data Pipeline (The Upgrade)
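We load the first 500 training examples of the GSM8K dataset and translate them with **GoogleTrans**. Each question and answer is translated English -> Cebuano -> English, and a pair is kept only when both back-translations score above 0.75 **Semantic Similarity** (cosine) against the original English text.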
**Note**: Since we are using an external API, we process sequentially with delays to avoid rate limiting.

In [None]:
from datasets import load_dataset, Dataset
from googletrans import Translator
from sentence_transformers import SentenceTransformer, util
import torch
from tqdm.auto import tqdm
import time
import random

# 1. Load GSM8K (Scaled up to 500 examples)
# Only the first 500 rows of the "main" config's train split are loaded; each
# row is read later through its 'question' and 'answer' fields.
gsm8k = load_dataset("gsm8k", "main", split="train[:500]")

# 2. Initialize Google Translator
# A single shared client; translate_google() uses it for every request.
translator = Translator()

# 3. Load Semantic Similarity Model
# Run the embedding model on the GPU when one is available, otherwise on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Loading Semantic Similarity Model...")
# Sentence encoder behind get_semantic_similarity(); in this notebook it only
# ever compares an English original with its English back-translation.
sim_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', device=device)

def translate_google(text, src, dest, retries=3):
    """Translate ``text`` from ``src`` to ``dest`` with the shared ``translator``.

    Every attempt is preceded by a randomized 0.5-1.5 s pause to stay under
    the API rate limit, and a failed attempt is followed by a further 2 s
    wait before the next one.

    Args:
        text: The string to translate.
        src: Source language code (e.g. ``'en'``).
        dest: Destination language code (e.g. ``'ceb'``).
        retries: Maximum number of attempts.

    Returns:
        The translated text, or ``None`` when ``text`` is empty/blank or when
        every attempt failed. Callers treat any falsy result as "skip".
    """
    # Nothing to translate: avoid the sleep and the network round trip.
    if not text or not text.strip():
        return None

    for attempt in range(retries):
        # Add delay to avoid rate limiting
        time.sleep(0.5 + random.random())
        try:
            result = translator.translate(text, src=src, dest=dest)
            return result.text
        except Exception as e:
            # Deliberately broad: this is a best-effort call and any failure
            # of the translation request should simply trigger a retry.
            if attempt == retries - 1:
                print(f"Failed to translate: {e}")
                return None
            time.sleep(2)  # Wait longer before retry
    return None

def get_semantic_similarity(text1, text2):
    """Return the cosine similarity of the two texts' ``sim_model`` embeddings.

    Both texts are encoded to tensors (``text1`` first, then ``text2``) and
    the single similarity value is returned as a Python float.
    """
    vec_a, vec_b = (
        sim_model.encode(passage, convert_to_tensor=True)
        for passage in (text1, text2)
    )
    similarity = util.pytorch_cos_sim(vec_a, vec_b)
    return similarity.item()

# 4. Back-Translation Loop (Sequential)
# For every GSM8K example the question and the answer are each translated
# English -> Cebuano -> English. A pair is kept only when both round trips
# stay semantically close to the original English text.
cebuano_data = []
similarity_threshold = 0.75

print("Starting Back-Translation Loop (this may take a while due to API delays)...")

for example in tqdm(gsm8k):
    question_en = example['question']
    answer_en = example['answer']

    # Translate Question: En -> Ceb -> En
    q_ceb = translate_google(question_en, src='en', dest='ceb')
    if not q_ceb:
        continue

    q_back = translate_google(q_ceb, src='ceb', dest='en')
    if not q_back:
        continue

    # Score the question before touching the answer: a rejected question
    # makes the answer irrelevant, and skipping it saves two rate-limited
    # API calls. `not (x > t)` keeps a NaN score rejected, exactly like the
    # combined `score_q > t and score_a > t` test would.
    score_q = get_semantic_similarity(question_en, q_back)
    if not score_q > similarity_threshold:
        continue

    # Translate Answer: En -> Ceb -> En
    a_ceb = translate_google(answer_en, src='en', dest='ceb')
    if not a_ceb:
        continue

    a_back = translate_google(a_ceb, src='ceb', dest='en')
    if not a_back:
        continue

    score_a = get_semantic_similarity(answer_en, a_back)
    if not score_a > similarity_threshold:
        continue

    formatted_prompt = f"Ipasabot ang solusyon sa matematika sa Cebuano: {q_ceb}"
    cebuano_data.append({
        "input_text": formatted_prompt,
        "target_text": a_ceb,
        "original_q_score": score_q
    })

print(f"Retained {len(cebuano_data)}/{len(gsm8k)} high-quality pairs.")

# A 90/10 split needs at least one row on each side. Fail here with an
# actionable message instead of letting train_test_split raise on an empty
# or single-row dataset.
if len(cebuano_data) < 2:
    raise RuntimeError(
        f"Only {len(cebuano_data)} pair(s) retained; at least 2 are needed for a "
        "train/test split. Check API connectivity or lower threshold."
    )

# Create HF Dataset
dataset = Dataset.from_list(cebuano_data)
# Fixed seed so the held-out test set (and the reported accuracy) is the same
# on every run.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

# Data Inspection
print("\n--- Data Inspection ---")
preview_count = min(3, len(dataset['train']))
for i in range(preview_count):
    ex = dataset['train'][i]
    print(f"Example {i+1}:")
    print(f"Input: {ex['input_text']}")
    print(f"Target: {ex['target_text']}")
    print("-"*20)

## 3. QLoRA Configuration
Configuring `mT5-base` with 4-bit quantization and LoRA adapters.

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

model_id = "google/mt5-base"

# mT5 is prone to overflowing in float16, which surfaces as NaN/inf losses.
# Use bfloat16 for the 4-bit matmuls when the GPU supports it and fall back
# to float32 (slower, but numerically safe) otherwise.
bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
compute_dtype = torch.bfloat16 if bf16_supported else torch.float32

# 4-bit Quantization Config: NF4 weights with double quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True
)

# Load Model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)

# Must run before the adapters are attached: prepares the quantized base
# model for training.
model = prepare_model_for_kbit_training(model)

# LoRA Config (from paper: r=8, alpha=16, dropout=0.05)
# Adapters are attached to the modules named "q" and "v".
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

## 4. Training
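Setting up the `Seq2SeqTrainer` with the hyperparameters from the paper (batch size 4, gradient accumulation 4, 10 epochs), except for the learning rate, which is lowered from 2e-4 to 1e-4 to reduce overfitting on the small dataset.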

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

# Tokenize Dataset
def preprocess_function(examples):
    """Tokenize a batch of prompt/answer pairs for seq2seq training.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)`` holding
            parallel ``"input_text"`` and ``"target_text"`` lists.

    Returns:
        The tokenized inputs with the tokenized targets stored under
        ``"labels"``. Sequences are truncated to 512 tokens but NOT padded.

    Padding is intentionally left to ``DataCollatorForSeq2Seq``: it pads each
    batch only to its longest member and fills label padding with -100 so the
    loss ignores it. Padding labels here with the tokenizer's pad id would
    make the model spend most of its loss on predicting padding.
    """
    model_inputs = tokenizer(
        examples["input_text"], max_length=512, truncation=True
    )
    labels = tokenizer(
        text_target=examples["target_text"], max_length=512, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

# mT5 is prone to overflowing under float16 mixed precision, which shows up
# as NaN/inf losses. Train with bf16 autocast when the GPU supports it and
# otherwise stay in full precision.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# Training Arguments
# Paper: Batch 4, Grad Accum 4, LR 2e-4, 10 Epochs
# Tuned: LR 1e-4 to prevent overfitting on small data
training_args = Seq2SeqTrainingArguments(
    output_dir="./tudlo_checkpoints",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    num_train_epochs=10,
    logging_steps=10,
    optim="paged_adamw_32bit",
    bf16=use_bf16,
    fp16=False,
    save_strategy="epoch",
    eval_strategy="epoch",
    predict_with_generate=True
)

# Pads inputs and labels per batch.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

print("Starting Training...")
trainer.train()

## 5. Inference & Evaluation
Generating answers and performing Numerical Exact Match evaluation.

In [None]:
import re

# An optionally signed number: thousands-grouped ("1,234"), plain integer, or
# decimal ("3.14", ".5"). The leading lookbehind stops a binary minus such as
# the one in "10-5" from being read as a sign; the lookahead after the grouped
# form rejects partial groupings such as "1,2345".
_NUMBER_RE = re.compile(
    r"(?<!\d)[-+]?(?:(?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)(?:\.\d+)?|\.\d+)"
)


def extract_number(text):
    """Return the last number that appears in ``text`` as a float.

    This is a heuristic for GSM8K-style solutions, where the final answer is
    usually the last number in the text. Signs are kept for integers as well
    as decimals, and thousands separators are removed before conversion.

    Args:
        text: The generated or reference answer; may be empty or ``None``.

    Returns:
        The last number found, or ``None`` when there is none.
    """
    if not text:
        return None
    matches = _NUMBER_RE.findall(text)
    if not matches:
        return None
    return float(matches[-1].replace(",", ""))

def generate_answer(question_text, max_new_tokens=128):
    """Generate the model's answer for one already-formatted prompt.

    Args:
        question_text: The full prompt string to feed to the model.
        max_new_tokens: Upper bound on generated tokens. Raise it if long
            solutions are cut off before their final number.

    Returns:
        The decoded generation with special tokens stripped.

    Side effects:
        Puts the global ``model`` in eval mode so dropout is disabled while
        generating.
    """
    model.eval()
    # Truncate to the same 512-token limit used when tokenizing for training,
    # and place the tensors on the model's own device rather than assuming it
    # matches the global `device` string.
    inputs = tokenizer(
        question_text, return_tensors="pt", truncation=True, max_length=512
    ).to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Running Evaluation on Test Set...")
correct = 0
total = 0

# Generation needs the raw prompt text, so evaluate on the untokenized test
# split; it has the same rows as the tokenized one.
test_data = dataset["test"]

if len(test_data) > 0:
    for example in test_data:
        input_text = example["input_text"]
        target_text = example["target_text"]

        generated_text = generate_answer(input_text)

        # Numerical Exact Match: compare the last number of the generation
        # with the last number of the reference.
        pred_num = extract_number(generated_text)
        ref_num = extract_number(target_text)

        # A missing number on either side counts as a miss.
        is_match = (pred_num is not None) and (ref_num is not None) and (abs(pred_num - ref_num) < 1e-6)
        if is_match:
            correct += 1
        total += 1

        print(f"Q: {input_text}")
        print(f"Gen: {generated_text}")
        print(f"Ref: {target_text}")
        print(f"Match: {is_match} (Pred: {pred_num}, Ref: {ref_num})")
        print("-"*20)

    print(f"Final Accuracy: {correct/total:.2%}")
else:
    print("Test set is empty. Cannot evaluate.")