## Cell 1: Install required packages


In [2]:
!pip install -q transformers==4.35 datasets==2.13 sacrebleu sentencepiece gradio kaggle tensorflow==2.12 pyarrow==11.0.0


## Cell 2: Imports and environment info


In [14]:
import json
import os
import random
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import sacrebleu
import tensorflow as tf
from datasets import Dataset as HFDataset
from datasets import load_dataset
from IPython.display import display
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import (
    T5TokenizerFast,
    TFT5ForConditionalGeneration,
    DataCollatorForSeq2Seq
)
# Notebook-wide configuration: dataset location, base checkpoint, RNG seed.
DATA_PATH = "../data/medquad.csv"
MODEL_NAME = "t5-small"
SEED = 42

# Seed the Python, NumPy and TensorFlow random generators in one call so that
# dataset shuffling and training are repeatable across kernel restarts.
tf.keras.utils.set_random_seed(SEED)

# Record the interpreter and framework versions alongside the results.
print("Python:", sys.version)
print("TensorFlow:", tf.__version__)


Python: 3.10.13 (main, Aug  2 2025, 15:00:03) [Clang 14.0.3 (clang-1403.0.22.14.1)]
TensorFlow: 2.12.0


## Cell 3: Load MedQuAD dataset


In [16]:

# Read the question/answer table from the CSV file configured in DATA_PATH,
# then report its size and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print(f"✅ Loaded dataset with {len(df)} rows.")
print(df.head(n=5))


✅ Loaded dataset with 16412 rows.
                                 question  \
0                What is (are) Glaucoma ?   
1                  What causes Glaucoma ?   
2     What are the symptoms of Glaucoma ?   
3  What are the treatments for Glaucoma ?   
4                What is (are) Glaucoma ?   

                                              answer           source  \
0  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   
1  Nearly 2.7 million people have glaucoma, a lea...  NIHSeniorHealth   
2  Symptoms of Glaucoma  Glaucoma can develop in ...  NIHSeniorHealth   
3  Although open-angle glaucoma cannot be cured, ...  NIHSeniorHealth   
4  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   

  focus_area  
0   Glaucoma  
1   Glaucoma  
2   Glaucoma  
3   Glaucoma  
4   Glaucoma  


## Cell 4: Clean and Reformat Dataset


In [17]:

# Build the cleaned frame in one chain:
#   1. drop rows missing a question or an answer,
#   2. coerce both text columns to str and trim surrounding whitespace,
#   3. derive the T5 source ("question: ...") and target (answer) columns.
df = (
    df.dropna(subset=["question", "answer"])
    .assign(
        question=lambda frame: frame["question"].astype(str).str.strip(),
        answer=lambda frame: frame["answer"].astype(str).str.strip(),
    )
    .assign(
        input_text=lambda frame: "question: " + frame["question"],
        target_text=lambda frame: frame["answer"],
    )
)

print("✅ Cleaned and formatted dataset sample:")
print(df.loc[:, ["input_text", "target_text"]].head())


✅ Cleaned and formatted dataset sample:
                                         input_text  \
0                question: What is (are) Glaucoma ?   
1                  question: What causes Glaucoma ?   
2     question: What are the symptoms of Glaucoma ?   
3  question: What are the treatments for Glaucoma ?   
4                question: What is (are) Glaucoma ?   

                                         target_text  
0  Glaucoma is a group of diseases that can damag...  
1  Nearly 2.7 million people have glaucoma, a lea...  
2  Symptoms of Glaucoma  Glaucoma can develop in ...  
3  Although open-angle glaucoma cannot be cured, ...  
4  Glaucoma is a group of diseases that can damag...  


## Cell 5: Split into Train and Test Sets


In [18]:

# Hold out 10% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
train_df, test_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
)

print(
    f"✅ Training samples: {len(train_df)}",
    f"✅ Testing samples: {len(test_df)}",
    sep="\n",
)



✅ Training samples: 14766
✅ Testing samples: 1641


## Cell 6: Initialize Tokenizer


In [19]:

# Load the fast T5 tokenizer that matches the checkpoint named in MODEL_NAME.
tokenizer = T5TokenizerFast.from_pretrained(
    MODEL_NAME,
)
print("✅ Tokenizer loaded.")


✅ Tokenizer loaded.


## Cell 7: Tokenization Function


In [20]:

def preprocess_function(examples):
    """Tokenize question/answer pairs into fixed-length T5 inputs and labels.

    Args:
        examples: Mapping with "input_text" and "target_text" entries. Each is
            a list of strings (as passed by ``Dataset.map(..., batched=True)``)
            or a single string.

    Returns:
        The tokenizer output for the input text, with an added "labels" entry
        holding the target token ids. Padding positions in the labels are
        replaced by -100.
    """
    # Label value used to mark positions that should not contribute to the
    # loss; padding must be marked this way or the model is trained to emit
    # pad tokens.
    ignore_index = -100

    model_inputs = tokenizer(
        examples["input_text"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    labels = tokenizer(
        examples["target_text"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(sequence):
        # Swap every pad token id for the ignore index, keep real tokens.
        return [ignore_index if token == pad_id else token for token in sequence]

    target_ids = labels["input_ids"]
    if isinstance(examples["target_text"], str):
        # Single (non-batched) example: input_ids is one flat list.
        model_inputs["labels"] = _mask_padding(target_ids)
    else:
        model_inputs["labels"] = [_mask_padding(sequence) for sequence in target_ids]
    return model_inputs

print("✅ Preprocessing function ready.")


✅ Preprocessing function ready.


## Cell 8: Convert to Hugging Face Dataset Format


In [21]:

# `HFDataset` is `datasets.Dataset`, imported under that alias in the imports
# cell; without that import this cell raises NameError on a fresh kernel.
train_dataset = HFDataset.from_pandas(train_df)
test_dataset = HFDataset.from_pandas(test_df)

# Tokenize in batches; preprocess_function adds input_ids, attention_mask and
# labels to every example.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

print("✅ Tokenization completed successfully.")


                                                                   

✅ Tokenization completed successfully.




## Convert to TensorFlow Datasets

In [29]:
from transformers import DataCollatorForSeq2Seq

# Both the collator and `prepare_tf_dataset` need a model instance, so load it
# here. The only other place `model` is created is the following cell, which
# means this cell would fail with NameError under Restart & Run All.
model = TFT5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# The collator batches the tokenized examples into TensorFlow tensors. The
# examples are already padded to a fixed length by preprocess_function, so no
# further dynamic padding is expected here.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

# Training batches are shuffled; evaluation batches keep dataset order.
train_dataset_tf = model.prepare_tf_dataset(
    tokenized_train,
    collate_fn=data_collator,
    shuffle=True,
    batch_size=8,
)

test_dataset_tf = model.prepare_tf_dataset(
    tokenized_test,
    collate_fn=data_collator,
    shuffle=False,
    batch_size=8,
)

print("✅ TensorFlow datasets ready for model training!")


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ TensorFlow datasets ready for model training!


## Cell 9: Load Model and Set Training Hyperparameters


In [30]:

from transformers import TFT5ForConditionalGeneration, create_optimizer
import math

# Fine-tuning hyperparameters.
EPOCHS = 3
BATCH_SIZE = 8
LEARNING_RATE = 5e-5

# Checkpoint to start from, then the pre-trained seq2seq model itself.
MODEL_PATH = "t5-small"
model = TFT5ForConditionalGeneration.from_pretrained(
    MODEL_PATH,
)
print("✅ Model loaded successfully.")


All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


✅ Model loaded successfully.


## Cell 10: Create Optimizer and Compile Model


In [31]:

# Total number of optimizer updates: whole batches per epoch times epochs.
steps_per_epoch = len(train_df) // BATCH_SIZE
num_train_steps = EPOCHS * steps_per_epoch

# Build the optimizer together with its learning-rate schedule (no warmup).
optimizer_settings = {
    "init_lr": LEARNING_RATE,
    "num_warmup_steps": 0,
    "num_train_steps": num_train_steps,
}
optimizer, schedule = create_optimizer(**optimizer_settings)

# No loss is passed to compile().
model.compile(optimizer=optimizer)
print("✅ Model compiled and ready for training.")


✅ Model compiled and ready for training.


## Cell 11: Train the Model


In [33]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# EPOCHS, BATCH_SIZE and LEARNING_RATE are defined once in the hyperparameter
# cell, and the model is already compiled there with the scheduled optimizer
# from `create_optimizer`. Recompiling here with a plain Adam optimizer would
# silently discard that learning-rate schedule, so this cell only trains.
checkpoint_path = "./models/t5_medqa_finetuned_best"

# Stop once validation loss has not improved for two consecutive epochs.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True
)

# Keep only the weights of the epoch with the lowest validation loss.
model_checkpoint = ModelCheckpoint(
    checkpoint_path,
    monitor="val_loss",
    save_best_only=True,
    save_weights_only=True
)

# Train, validating on the held-out split after every epoch.
history = model.fit(
    train_dataset_tf,
    validation_data=test_dataset_tf,
    epochs=EPOCHS,
    callbacks=[early_stopping, model_checkpoint]
)

print("✅ Training completed successfully. Best model saved at:", checkpoint_path)


Epoch 1/3
Epoch 2/3
Epoch 3/3
✅ Training completed successfully. Best model saved at: ./models/t5_medqa_finetuned_best


## Save the model

In [34]:
# Persist the fine-tuned model and its tokenizer side by side.
import os

# Output folder for the exported artifacts.
SAVE_DIR = "./models/t5_finetuned_helpdesk_v2"

# Make sure the folder exists before writing into it.
os.makedirs(SAVE_DIR, exist_ok=True)

# Write the model first, then the tokenizer, into the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(SAVE_DIR)

print(f"✅ Model and tokenizer successfully saved to: {SAVE_DIR}")


✅ Model and tokenizer successfully saved to: ./models/t5_finetuned_helpdesk_v2
