## Cell 1 — Setup (install & imports)

In [7]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import T5TokenizerFast, TFT5ForConditionalGeneration, DataCollatorForSeq2Seq
import gradio as gr
import random
from tqdm import tqdm

print("✅ All libraries imported")
print("TensorFlow version:", tf.__version__)


✅ All libraries imported
TensorFlow version: 2.12.0


## Cell 2: Load dataset


In [8]:
# Load the MedQuad CSV dataset
data_path = "../data/medquad.csv"
df = pd.read_csv(data_path)

# Preview dataset
df = df.dropna(subset=["question", "answer"]).reset_index(drop=True)
print("Dataset shape:", df.shape)
df.head()


Dataset shape: (16407, 4)


Unnamed: 0,question,answer,source,focus_area
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma
1,What causes Glaucoma ?,"Nearly 2.7 million people have glaucoma, a lea...",NIHSeniorHealth,Glaucoma
2,What are the symptoms of Glaucoma ?,Symptoms of Glaucoma Glaucoma can develop in ...,NIHSeniorHealth,Glaucoma
3,What are the treatments for Glaucoma ?,"Although open-angle glaucoma cannot be cured, ...",NIHSeniorHealth,Glaucoma
4,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma


## Cell 3: Split dataset

In [None]:
# Split into train/test
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)


In [3]:
df = df.dropna(subset=['question', 'answer'])  # Remove missing values
df['question'] = df['question'].str.strip().str.lower()
df['answer'] = df['answer'].str.strip()

# Quick sanity check
print("Dataset size:", len(df))
df.head()

Dataset size: 16407


Unnamed: 0,question,answer,source,focus_area
0,what is (are) glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma
1,what causes glaucoma ?,"Nearly 2.7 million people have glaucoma, a lea...",NIHSeniorHealth,Glaucoma
2,what are the symptoms of glaucoma ?,Symptoms of Glaucoma Glaucoma can develop in ...,NIHSeniorHealth,Glaucoma
3,what are the treatments for glaucoma ?,"Although open-angle glaucoma cannot be cured, ...",NIHSeniorHealth,Glaucoma
4,what is (are) glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma


## Cell 4: Train/test split


In [4]:
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

medquad_dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})
print(medquad_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'source', 'focus_area', '__index_level_0__'],
        num_rows: 14766
    })
    test: Dataset({
        features: ['question', 'answer', 'source', 'focus_area', '__index_level_0__'],
        num_rows: 1641
    })
})


## Tokenizer and preprocessing for T5

In [5]:
model_checkpoint = "t5-small"  # You can upgrade later to t5-base
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint)

max_input_length = 128
max_target_length = 64

def preprocess_function(examples):
    inputs = ["question: " + q for q in examples["question"]]
    targets = examples["answer"]
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
        return_tensors="tf"
    )
    labels = tokenizer(
        targets,
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
        return_tensors="tf"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = medquad_dataset["train"].map(preprocess_function, batched=True)
tokenized_test = medquad_dataset["test"].map(preprocess_function, batched=True)

                                                                   

## Prepare TensorFlow datasets


In [None]:
# ===============================
# Cell 6: Prepare TensorFlow datasets
# ===============================

import tensorflow as tf
from transformers import T5TokenizerFast

# Assume train_df and test_df are already loaded from medquad.csv
# Columns: "question" (input), "answer" (target)

# Initialize T5 tokenizer
tokenizer = T5TokenizerFast.from_pretrained("t5-small")

# Max sequence lengths
max_input_length = 128
max_target_length = 64

# -------------------------------
# Tokenization function
# -------------------------------
def tokenize_function(examples):
    # Tokenize inputs
    model_inputs = tokenizer(
        examples["question"],
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
        return_tensors=None  # Return Python lists
    )
    # Tokenize targets
    labels = tokenizer(
        examples["answer"],
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
        return_tensors=None
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# -------------------------------
# Apply tokenization
# -------------------------------
tokenized_train = tokenize_function({
    "question": train_df["question"].tolist(),
    "answer": train_df["answer"].tolist()
})

tokenized_test = tokenize_function({
    "question": test_df["question"].tolist(),
    "answer": test_df["answer"].tolist()
})

# -------------------------------
# Convert lists to TensorFlow tensors
# -------------------------------
train_features = {
    "input_ids": tf.constant(tokenized_train["input_ids"], dtype=tf.int32),
    "attention_mask": tf.constant(tokenized_train["attention_mask"], dtype=tf.int32)
}
train_labels = tf.constant(tokenized_train["labels"], dtype=tf.int32)

test_features = {
    "input_ids": tf.constant(tokenized_test["input_ids"], dtype=tf.int32),
    "attention_mask": tf.constant(tokenized_test["attention_mask"], dtype=tf.int32)
}
test_labels = tf.constant(tokenized_test["labels"], dtype=tf.int32)

# -------------------------------
# Create TensorFlow Dataset objects
# -------------------------------
batch_size = 16
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_labels))

train_dataset = train_dataset.shuffle(buffer_size=len(train_df)).batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

print("✅ TensorFlow datasets prepared successfully!")
print("Train batches:", len(list(train_dataset)))
print("Test batches:", len(list(test_dataset)))


AttributeError: 'list' object has no attribute 'to_tensor'