In [1]:
# from google.colab import drive
# drive.mount('/content/drive')  # Add My Drive/<>

# import os
# os.chdir('drive/My Drive/SCU/Fourth Quarter/NLP/Notebooks')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data Preprocessing

In [1]:
import pandas as pd
import re
import string

# === 1. Load the dataset ===
file_path = "datasets/mle_screening_dataset.csv"
df = pd.read_csv(file_path)

# === 2. Drop duplicates and nulls ===
df.drop_duplicates(inplace=True)
df.dropna(subset=["question", "answer"], inplace=True)

# # === 3. Clean the text ===
def clean_text(text):
    text = str(text)
    text = text.strip() 
    text = text.lower()  

    # Remove HTML tags or brackets
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"\[.*?\]", "", text)

    # Remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)

    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text)

    return text.strip()

df["question"] = df["question"].apply(clean_text)
df["answer"] = df["answer"].apply(clean_text)

# === 4. Optional: Combine similar Qs (if multiple answers for one Q) ===
grouped_df = df.groupby("question")["answer"].apply(lambda x: " ".join(set(x))).reset_index()

# === 5. Filter out too-short or too-long entries ===
grouped_df = grouped_df[grouped_df["answer"].str.split().str.len().between(10, 150)]

# === 6. Save processed data ===
grouped_df.to_csv("processed_medical_qa.csv", index=False)

print("✅ Data processing complete. Final shape:", grouped_df.shape)


✅ Data processing complete. Final shape: (7186, 2)


## Model Training

In [3]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import evaluate
import numpy as np
import torch


# STEP 1: Load dataset

df = grouped_df
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

train_df_sample = train_df.sample(n=500, random_state=42)
val_df_sample = val_df.sample(n=100, random_state=42)
test_df_sample = test_df.sample(n=100, random_state=42)


dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df_sample.reset_index(drop=True)),
    'validation': Dataset.from_pandas(val_df_sample.reset_index(drop=True)),
    'test': Dataset.from_pandas(test_df_sample.reset_index(drop=True)),
})



# STEP 2: Load Tokenizer and Model

model_name = "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


# STEP 3: Preprocessing Function

MAX_INPUT_LENGTH = 128
MAX_TARGET_LENGTH = 128



You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
def preprocess(batch):
    input_texts = ["question: " + q for q in batch["question"]]
    target_texts = batch["answer"]

    model_inputs = tokenizer(
        input_texts,
        max_length=MAX_INPUT_LENGTH,
        padding="max_length",
        truncation=True
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            target_texts,
            max_length=MAX_TARGET_LENGTH,
            padding="max_length",
            truncation=True
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize datasets
tokenized_datasets = dataset.map(
    preprocess,
    batched=True,
    remove_columns=["question", "answer"]  # remove index if present
)


Map: 100%|██████████| 500/500 [00:00<00:00, 766.06 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1418.02 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1667.75 examples/s]


In [19]:

# STEP 4: Training Arguments

training_args = TrainingArguments(
    output_dir="./flan-t5-medical-qa",
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=4,     
    per_device_eval_batch_size=2,     
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=False,
    metric_for_best_model=None,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none"
)



# STEP 5: Evaluation Metrics

rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
exact_match = evaluate.load("exact_match")

def compute_metrics(eval_preds):
    predictions, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # Exact Match
    em_score = exact_match.compute(predictions=decoded_preds, references=decoded_labels)["exact_match"]

    # ROUGE
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    # BLEU
    bleu_result = bleu.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])

    return {
        "exact_match": em_score,
        "rougeL": rouge_result["rougeL"],
        "bleu": bleu_result["bleu"]
    }


# STEP 6: Train Model

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics  


trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.9647,2.111054
2,2.0617,1.953027


TrainOutput(global_step=250, training_loss=3.134397277832031, metrics={'train_runtime': 2360.2467, 'train_samples_per_second': 0.424, 'train_steps_per_second': 0.106, 'total_flos': 46472626176000.0, 'train_loss': 3.134397277832031, 'epoch': 2.0})

## Model Evaluation

In [20]:

test_results = trainer.evaluate(tokenized_datasets["test"])
print("\n📊 Final Test Results:", test_results)


📊 Final Test Results: {'eval_loss': 1.8731110095977783, 'eval_runtime': 41.6072, 'eval_samples_per_second': 2.403, 'eval_steps_per_second': 1.202, 'epoch': 2.0}


In [21]:
def predict_sample(idx=0):
    input_text = "question: " + test_df_sample.iloc[idx]["question"]
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=128)
    print("User Question:", test_df_sample.iloc[idx]["question"])
    print("Expected Answer:", test_df_sample.iloc[idx]["answer"])
    print("Model Answer:", tokenizer.decode(output_ids[0], skip_special_tokens=True))

predict_sample(0)
predict_sample(1)
predict_sample(2)


User Question: is retroperitoneal fibrosis inherited
Expected Answer: most cases of retroperitoneal fibrosis are sporadic which means that they occur in people with no apparent history of the disorder in their family in rare cases the condition has been reported to occur in a few members of the same family but the inheritance pattern is unknown
Model Answer: retroperitoneal fibrosis is inherited in the family a inherited condition is inherited in the family a inherited condition is inherited in the family a inherited condition is inherited in the family a inherited condition is inherited in the family a inherited condition is inherited in the family a inherited condition is inherited in the family a inherited condition is inherited in the family a inherited condition is inherited in the family a inherited condition is inherited in the family a inherited condition is inherited in the family a inherited
User Question: what are the treatments for fragile xassociated tremorataxia syndrome


## Testing saved model

In [2]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Replace with the actual path where you saved the model
model_path = "./flan-t5-qa-final"

tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
def predict_sample(idx=0):
    input_text = "question: " + train_df_sample.iloc[idx]["question"]
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=128)
    print("User Question:", train_df_sample.iloc[idx]["question"])
    print("Expected Answer:", train_df_sample.iloc[idx]["answer"])
    print("Model Answer:", tokenizer.decode(output_ids[0], skip_special_tokens=True))

predict_sample(6)
predict_sample(4)
predict_sample(5)


User Question: what are the genetic changes related to pyridoxinedependent epilepsy
Expected Answer: mutations in the aldh7a1 gene cause pyridoxinedependent epilepsy the aldh7a1 gene provides instructions for making an enzyme called aminoadipic semialdehyde aasa dehydrogenase also known as antiquitin this enzyme is involved in the breakdown of the protein building block amino acid lysine in the brain when antiquitin is deficient a molecule that interferes with vitamin b6 function builds up in various tissues pyridoxine plays a role in many processes in the body such as the breakdown of amino acids and the productions of chemicals that transmit signals in the brain neurotransmitters it is unclear how a lack of pyridoxine causes the seizures that are characteristic of this condition some individuals with pyridoxinedependent epilepsy do not have identified mutations in the aldh7a1 gene in these cases the cause of the condition is unknown
Model Answer: pyridoxine
User Question: what are th