# Answer Generation

In [None]:
!pip install transformers torch sentencepiece datasets

In [None]:
import random
import time
import torch
import numpy as np
from transformers import (
    T5ForConditionalGeneration,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from datasets import load_dataset, Dataset

# --- Reproducibility ---------------------------------------------------------
# One seed is pushed into every random number generator the notebook uses.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

_cuda_available = torch.cuda.is_available()
if _cuda_available:
    torch.cuda.manual_seed_all(SEED)

# --- Device ------------------------------------------------------------------
DEVICE = torch.device("cuda") if _cuda_available else torch.device("cpu")
print(f"Using device: {DEVICE}")

# --- Training hyper-parameters -----------------------------------------------
BATCH_SIZE = 8
QA_EPOCHS, BOOL_EPOCHS = 3, 3
QA_LR, BOOL_LR = 3e-5, 2e-5

# --- Sequence lengths and dataset size ---------------------------------------
MAX_INPUT_LEN, MAX_OUTPUT_LEN = 512, 32
MAX_TRAIN_SAMPLES = 10000  # subsample for Colab feasibility

In [None]:
# Load the "squad_v2" dataset and report how many rows each split holds.
squad = load_dataset("squad_v2")
print(squad)

_n_train_rows = len(squad["train"])
print(f"Train examples: {_n_train_rows}")
_n_validation_rows = len(squad["validation"])
print(f"Validation examples: {_n_validation_rows}")

# Fine-Tune T5-Base (Extractive QA)

In [None]:
# The tokenizer and the seq2seq model start from the same pretrained checkpoint.
_qa_base_checkpoint = "t5-base"
qa_tokenizer = AutoTokenizer.from_pretrained(_qa_base_checkpoint)
qa_model_ft = T5ForConditionalGeneration.from_pretrained(_qa_base_checkpoint)

# Seeded shuffle, then keep at most MAX_TRAIN_SAMPLES training rows ...
_train_split = squad["train"]
_train_keep = min(MAX_TRAIN_SAMPLES, len(_train_split))
train_data = _train_split.shuffle(seed=SEED).select(range(_train_keep))

# ... and at most a fifth of that many validation rows.
_val_split = squad["validation"]
_val_keep = min(MAX_TRAIN_SAMPLES // 5, len(_val_split))
val_data = _val_split.shuffle(seed=SEED).select(range(_val_keep))

def preprocess_qa(examples):
    """Tokenize a batch of question/context/answers rows for seq2seq training.

    Every row is rendered as ``"question: <q>  context: <c>"``. The target is
    the first entry of the row's ``answers["text"]`` list, or an empty string
    when that list is empty. Inputs are truncated/padded to MAX_INPUT_LEN and
    targets to MAX_OUTPUT_LEN.

    Args:
        examples: Batched mapping with "question", "context" and "answers"
            columns.

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry in
        which every padding token id has been replaced by -100 so that those
        positions are ignored in the loss.
    """
    prompts = []
    for question, context in zip(examples["question"], examples["context"]):
        prompts.append(f"question: {question}  context: {context}")

    answer_texts = []
    for answer in examples["answers"]:
        candidates = answer["text"]
        answer_texts.append(candidates[0] if len(candidates) > 0 else "")

    model_inputs = qa_tokenizer(
        prompts,
        max_length=MAX_INPUT_LEN,
        truncation=True,
        padding="max_length",
    )
    encoded_targets = qa_tokenizer(
        answer_texts,
        max_length=MAX_OUTPUT_LEN,
        truncation=True,
        padding="max_length",
    )

    # Mask out padding positions in the labels.
    pad_id = qa_tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in encoded_targets["input_ids"]
    ]
    return model_inputs


# Tokenize both splits; the original columns are dropped so that only the
# fields produced by preprocess_qa remain.
train_dataset = train_data.map(
    preprocess_qa,
    batched=True,
    remove_columns=train_data.column_names,
)
val_dataset = val_data.map(
    preprocess_qa,
    batched=True,
    remove_columns=val_data.column_names,
)

# Evaluate and checkpoint once per epoch, then reload the best checkpoint when
# training finishes. Mixed precision is enabled only when CUDA is available.
_qa_training_config = dict(
    output_dir="./t5_base_squad2_checkpoints",
    num_train_epochs=QA_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=QA_LR,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    load_best_model_at_end=True,
    seed=SEED,
    fp16=torch.cuda.is_available(),
)
training_args = TrainingArguments(**_qa_training_config)

trainer = Trainer(
    model=qa_model_ft,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

print("Starting T5-Base fine-tuning on SQuAD v2...")
trainer.train()

# Model weights first, then the tokenizer files, into the same directory.
for _qa_artifact in (qa_model_ft, qa_tokenizer):
    _qa_artifact.save_pretrained("./t5_base_squad2/")
print("T5-Base saved to ./t5_base_squad2/")

# Fine-Tune RoBERTa-Base (Boolean Classification)

In [None]:
# Build a binary (question, passage) dataset from the answerable training rows:
# label 1 pairs a question with its own passage, label 0 pairs the same
# question with a randomly drawn different passage.
answerable = [ex for ex in squad["train"] if len(ex["answers"]["text"]) > 0]
all_contexts = [ex["context"] for ex in answerable]
print(f"Answerable examples: {len(answerable)}")

# The negative sampler below redraws until it finds a passage that differs from
# the question's own one; with fewer than two distinct passages it would never
# terminate, so fail loudly instead.
if len(set(all_contexts)) < 2:
    raise ValueError(
        "Need at least two distinct contexts to build negative (label 0) pairs."
    )

true_pairs = []
false_pairs = []

# Draw the subsample at random (reproducible through the seeded `random`
# module) instead of taking the first rows, so it does not depend on the
# dataset's stored order (presumably grouped by source article - TODO confirm).
# This mirrors the seeded shuffle applied to the T5 subsample.
sampled = random.sample(answerable, min(MAX_TRAIN_SAMPLES, len(answerable)))

for ex in sampled:
    question = ex["question"]
    correct_context = ex["context"]

    true_pairs.append({"question": question, "passage": correct_context, "label": 1})

    # Rejection-sample a passage that is not the question's own.
    wrong_context = random.choice(all_contexts)
    while wrong_context == correct_context:
        wrong_context = random.choice(all_contexts)
    false_pairs.append({"question": question, "passage": wrong_context, "label": 0})

bool_data = true_pairs + false_pairs
random.shuffle(bool_data)

# Split 90/10 train/val
split_idx = int(len(bool_data) * 0.9)
_bool_train_rows = bool_data[:split_idx]
bool_train = Dataset.from_list(_bool_train_rows)
bool_val = Dataset.from_list(bool_data[split_idx:])

# Labels are 0/1, so the sum is the number of positives.
_n_true = sum(row["label"] for row in _bool_train_rows)
_n_false = len(_bool_train_rows) - _n_true

print(f"Boolean train size: {len(bool_train)}")
print(f"Boolean val size: {len(bool_val)}")
print(f"Label distribution (train): {_n_true} true, "
      f"{_n_false} false")

In [None]:
# The tokenizer and the two-label classification model start from the same
# pretrained checkpoint.
_bool_base_checkpoint = "roberta-base"
bool_tokenizer = AutoTokenizer.from_pretrained(_bool_base_checkpoint)
bool_model_ft = AutoModelForSequenceClassification.from_pretrained(
    _bool_base_checkpoint,
    num_labels=2,
)


def preprocess_bool(examples):
    """Tokenize a batch of (question, passage) pairs for the boolean classifier.

    Sequences are truncated to MAX_INPUT_LEN but deliberately NOT padded here.
    The Trainer for this model is given a ``DataCollatorWithPadding``, which
    pads each batch to its own longest sequence. Padding every row to
    MAX_INPUT_LEN at this stage would make that collator a no-op and force the
    model to process full-length inputs for every example.

    Args:
        examples: Batched mapping with "question" and "passage" columns.

    Returns:
        The tokenizer output for the question/passage pairs (variable-length
        sequences).
    """
    return bool_tokenizer(
        examples["question"],
        examples["passage"],
        max_length=MAX_INPUT_LEN,
        truncation=True,
    )


# Tokenize both splits. Only the raw text columns are removed; the "label"
# column is kept alongside the tokenizer output.
bool_train_tok = bool_train.map(
    preprocess_bool,
    batched=True,
    remove_columns=["question", "passage"],
)
bool_val_tok = bool_val.map(
    preprocess_bool,
    batched=True,
    remove_columns=["question", "passage"],
)

# Evaluate and checkpoint once per epoch, then reload the best checkpoint when
# training finishes. Mixed precision is enabled only when CUDA is available.
_bool_training_config = dict(
    output_dir="./roberta_base_boolq_checkpoints",
    num_train_epochs=BOOL_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=BOOL_LR,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    load_best_model_at_end=True,
    seed=SEED,
    fp16=torch.cuda.is_available(),
)
bool_training_args = TrainingArguments(**_bool_training_config)

# Batches are assembled by a padding collator built from the RoBERTa tokenizer.
_bool_collator = DataCollatorWithPadding(bool_tokenizer)
bool_trainer = Trainer(
    model=bool_model_ft,
    args=bool_training_args,
    train_dataset=bool_train_tok,
    eval_dataset=bool_val_tok,
    data_collator=_bool_collator,
)

print("Starting RoBERTa-Base fine-tuning on boolean pairs...")
bool_trainer.train()

# Model weights first, then the tokenizer files, into the same directory.
for _bool_artifact in (bool_model_ft, bool_tokenizer):
    _bool_artifact.save_pretrained("./roberta_base_boolq/")
print("RoBERTa-Base saved to ./roberta_base_boolq/")

# Answer Generation

In [None]:
# For each task the model weights and tokenizer files live in the same
# directory, so each pair of path variables points at one location.
qa_model_path = qa_tokenizer_path = './t5_base_squad2/'
bool_model_path = bool_tokenizer_path = './roberta_base_boolq/'

In [None]:
# Reload the saved QA model and tokenizer, placing the model on DEVICE.
qa_model = T5ForConditionalGeneration.from_pretrained(qa_model_path)
qa_model = qa_model.to(DEVICE)
qa_tokenizer = AutoTokenizer.from_pretrained(qa_tokenizer_path)

# Same for the boolean classifier.
bool_model = AutoModelForSequenceClassification.from_pretrained(bool_model_path)
bool_model = bool_model.to(DEVICE)
bool_tokenizer = AutoTokenizer.from_pretrained(bool_tokenizer_path)

# Both models are only used for inference from here on.
for _inference_model in (qa_model, bool_model):
    _inference_model.eval()
print("Fine-tuned models loaded successfully.")

In [None]:
def extract_answer(question, context, model=qa_model, tokenizer=qa_tokenizer):
    """Generate one answer to `question` from `context`.

    The prompt uses the same ``"question: <q>  context: <c>"`` layout as the
    fine-tuning data and is truncated to MAX_INPUT_LEN tokens. Generation uses
    the model's default decoding settings, capped at MAX_OUTPUT_LEN tokens.

    Args:
        question: Question text.
        context: Passage the answer should come from.
        model: Seq2seq model exposing ``generate``; defaults to the QA model
            loaded above.
        tokenizer: Tokenizer matching `model`.

    Returns:
        The decoded answer, stripped, with its first character upper-cased and
        the remaining characters left exactly as generated.
    """
    input_text = f"question: {question}  context: {context}"
    encoding = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=MAX_INPUT_LEN)
    # Follow the model's own device so that a caller-supplied `model` living
    # somewhere other than the global DEVICE still works.
    input_ids = encoding["input_ids"].to(model.device)
    attention_mask = encoding["attention_mask"].to(model.device)
    with torch.no_grad():
        output = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=MAX_OUTPUT_LEN)
    answer = tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True).strip()
    # str.capitalize() would also lower-case everything after the first
    # character ("Barack Obama" -> "Barack obama"); only touch the first one.
    return answer[:1].upper() + answer[1:]


def beam_search_decoding(question, context, model=qa_model, tokenizer=qa_tokenizer):
    """Generate three candidate answers with beam search.

    The prompt uses the ``"question: <q>  context: <c>"`` layout and is
    truncated to MAX_INPUT_LEN tokens. Decoding runs 10 beams, returns the
    3 best sequences, forbids repeated bigrams and is capped at
    MAX_OUTPUT_LEN tokens.

    Args:
        question: Question text.
        context: Passage the answer should come from.
        model: Seq2seq model exposing ``generate``; defaults to the QA model
            loaded above.
        tokenizer: Tokenizer matching `model`.

    Returns:
        A list of decoded answers, each stripped and with its first character
        upper-cased (the remaining characters are left as generated).
    """
    input_text = f"question: {question}  context: {context}"
    encoding = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=MAX_INPUT_LEN)
    # Follow the model's own device so that a caller-supplied `model` living
    # somewhere other than the global DEVICE still works.
    input_ids = encoding["input_ids"].to(model.device)
    attention_mask = encoding["attention_mask"].to(model.device)
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=MAX_OUTPUT_LEN,
            num_beams=10,
            num_return_sequences=3,
            no_repeat_ngram_size=2,
            early_stopping=True,
        )
    answers = [
        tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True).strip()
        for out in output
    ]
    # str.capitalize() would also lower-case everything after the first
    # character ("Barack Obama" -> "Barack obama"); only touch the first one.
    return [a[:1].upper() + a[1:] for a in answers]


def topkp_decoding(question, context, model=qa_model, tokenizer=qa_tokenizer):
    """Generate three candidate answers with top-k / top-p sampling.

    The prompt uses the ``"question: <q>  context: <c>"`` layout and is
    truncated to MAX_INPUT_LEN tokens. Sampling is restricted to the 40 most
    likely tokens and a nucleus of cumulative probability 0.80, forbids
    repeated bigrams and is capped at MAX_OUTPUT_LEN tokens. Results are
    stochastic unless the torch RNG is seeded by the caller.

    Args:
        question: Question text.
        context: Passage the answer should come from.
        model: Seq2seq model exposing ``generate``; defaults to the QA model
            loaded above.
        tokenizer: Tokenizer matching `model`.

    Returns:
        A list of three decoded answers, each stripped and with its first
        character upper-cased (the remaining characters are left as generated).
    """
    input_text = f"question: {question}  context: {context}"
    encoding = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=MAX_INPUT_LEN)
    # Follow the model's own device so that a caller-supplied `model` living
    # somewhere other than the global DEVICE still works.
    input_ids = encoding["input_ids"].to(model.device)
    attention_mask = encoding["attention_mask"].to(model.device)
    with torch.no_grad():
        # `early_stopping` is intentionally not passed: it only applies to
        # beam-based decoding and has no effect when sampling with one beam.
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=MAX_OUTPUT_LEN,
            do_sample=True,
            top_k=40,
            top_p=0.80,
            num_return_sequences=3,
            no_repeat_ngram_size=2,
        )
    answers = [
        tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True).strip()
        for out in output
    ]
    # str.capitalize() would also lower-case everything after the first
    # character ("Barack Obama" -> "Barack obama"); only touch the first one.
    return [a[:1].upper() + a[1:] for a in answers]


def classify_true_false(question, passage, model=bool_model, tokenizer=bool_tokenizer):
    """Classify a (question, passage) pair with the two-label classifier.

    The pair is tokenized together and truncated to MAX_INPUT_LEN tokens. The
    logits are turned into probabilities with a softmax; class index 1 is
    reported as "True" and any other index as "False".

    Args:
        question: Question text.
        passage: Passage text paired with the question.
        model: Sequence-classification model; defaults to the boolean model
            loaded above.
        tokenizer: Tokenizer matching `model`.

    Returns:
        A dict with "answer" ("True" or "False") and "confidence", the softmax
        probability of the predicted class rounded to three decimals.
    """
    # Follow the model's own device so that a caller-supplied `model` living
    # somewhere other than the global DEVICE still works.
    inputs = tokenizer(
        question, passage, return_tensors="pt", truncation=True, max_length=MAX_INPUT_LEN
    ).to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Single example in the batch, so take row 0 of the probabilities.
    probs = torch.softmax(logits, dim=-1)[0]
    label = torch.argmax(probs).item()
    confidence = probs[label].item()
    return {"answer": "True" if label == 1 else "False", "confidence": round(confidence, 3)}