In [None]:
# 1. SETUP
!pip install -q transformers datasets accelerate sentencepiece torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset
from huggingface_hub import notebook_login
import torch

# 2. LOGIN
# Authenticate with the Hugging Face Hub; the trained model is uploaded later.
notebook_login()

# 3. CONFIGURATION (Using the robust Base model)
MODEL_CHECKPOINT = "google-t5/t5-base"  # pretrained checkpoint to fine-tune
OUTPUT_DIR = "t5-base-sql-custom"  # passed to the trainer as output_dir
HUB_MODEL_ID = "hmyunis/t5-base-sql-custom"  # target repository on the Hub

# 4. LOAD & PREPARE
# Training examples come from a local JSON file.
dataset = load_dataset("json", data_files="train.json", split="train")
# Model and tokenizer are both loaded from the same checkpoint.
model = T5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)
tokenizer = T5Tokenizer.from_pretrained(MODEL_CHECKPOINT)

def preprocess_function(examples):
    """Tokenize a batch of question/context/answer rows for seq2seq training.

    Args:
        examples: A batch dict (as supplied by ``dataset.map(..., batched=True)``)
            holding parallel lists under ``'question'``, ``'context'`` and
            ``'answer'``.

    Returns:
        The tokenizer output for the prompts, with the tokenized answers
        attached under ``"labels"``. Sequences are truncated but NOT padded.
    """
    # Prompt layout: task prefix, the question, a "</s>" separator, then the
    # schema context.
    # NOTE(review): "</s>" is presumably T5's end-of-sequence token reused as a
    # separator -- confirm this is intended. The sanity-check prompt further
    # down the file uses the same layout, so the two must change together.
    inputs = [
        f"translate English to SQL: {q} </s> {c}"
        for q, c in zip(examples['question'], examples['context'])
    ]
    targets = examples['answer']

    # Truncate only. Padding is left to the DataCollatorForSeq2Seq handed to
    # the trainer, which pads each batch to its own longest sequence and fills
    # label padding with -100 so it is ignored by the loss. Padding every row
    # to 256/128 tokens here would waste compute on mostly-pad batches.
    model_inputs = tokenizer(inputs, max_length=256, truncation=True)
    labels = tokenizer(targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the whole dataset, one batch of rows per preprocess_function call.
tokenized_datasets = dataset.map(preprocess_function, batched=True)
# Split 90% train, 10% test. The fixed seed keeps the held-out rows identical
# across runs, so eval losses from different runs are comparable.
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

# 5. TRAIN (Carefully tuned hyperparameters)
args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    learning_rate=3e-4, # Slightly higher for T5-base to kickstart it
    per_device_train_batch_size=4, # Smaller batch for stability
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=12,
    predict_with_generate=True,
    push_to_hub=True,
    # Do not upload on every checkpoint save: hold everything back until the
    # explicit trainer.push_to_hub() call, so the sanity check that follows
    # training really does run before anything is published.
    hub_strategy="end",
    hub_model_id=HUB_MODEL_ID,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU
    # instead of failing when the arguments are validated.
    # NOTE(review): T5 checkpoints are often reported as numerically fragile
    # under fp16 -- if the loss turns NaN, try bf16 or full precision.
    fp16=torch.cuda.is_available(),
    logging_steps=10,
)

# Batch collator for seq2seq examples, built from the tokenizer and the model.
collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Wire the model, hyperparameters and both dataset splits into the trainer.
trainer = Seq2SeqTrainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    data_collator=collator,
    eval_dataset=tokenized_datasets["test"],
    train_dataset=tokenized_datasets["train"],
)

print("Starting Training...")
trainer.train()

# 6. VALIDATE BEFORE PUSHING (Sanity Check)
print("Running Sanity Check...")
# Same prompt layout as preprocess_function: prefix, question, "</s>", schema.
input_text = "translate English to SQL: Show me all customers </s> api_customer: id, name"
# Make sure dropout is off so the generated sample is not perturbed.
model.eval()
# Send the inputs to whichever device the model is on instead of assuming
# "cuda", so the check also works on CPU-only or differently-placed setups.
encoded = tokenizer(input_text, return_tensors="pt").to(model.device)
input_ids = encoded.input_ids
outputs = model.generate(
    input_ids,
    attention_mask=encoded.attention_mask,
    max_length=100,
)
print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True))

# 7. PUSH
# Upload the trained model to the Hub repository configured in the arguments.
trainer.push_to_hub()
print(f"Success! Model pushed to: https://huggingface.co/{HUB_MODEL_ID}")