# Doctor Response Generation GPT-2

This notebook explores the main goal of the project: generating a text response to a patient query.

Model: https://huggingface.co/openai-community/gpt2

In [32]:
# gather all imports

from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2TokenizerFast, GPT2LMHeadModel, AdamW, get_scheduler
import torch
from torch.utils.data import Dataset, DataLoader
from torch.amp import autocast, GradScaler
import json
import numpy as np
from tqdm import tqdm
import json
import itertools
from datetime import datetime
import os
import platform
import subprocess

As before, let's load in the data.

In [33]:
def load_and_prepare(file_path):
    """
    Read a JSON file of Q&A records and reshape it into question/response pairs.

    @PARAMS:
        - file_path -> path of the JSON file to read

    @RETURNS:
        - a list of {"question": ..., "response": ...} dicts, one per record
          (taken from each record's "Question" and "Answer" fields), or an
          empty list if reading or reshaping the file raised any exception
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            records = json.load(handle)

        # keep only the two fields we need, renamed to lower-case keys
        pairs = [
            {"question": record["Question"], "response": record["Answer"]}
            for record in records
        ]

        print(f"Loaded {len(pairs)} Q&A pairs from {file_path}!")
        return pairs

    except Exception as e:
        # best effort: report the problem and hand back an empty dataset
        print(f"Error loading in file...\n{e}")
        return []

In [34]:
# read every split from its processed JSON file
## TRAIN ##
train_data = load_and_prepare("processed_data/train.json")

## VAL ##
val_data = load_and_prepare("processed_data/validation.json")

## TEST ##
test_data = load_and_prepare("processed_data/test.json")

# sanity check: show the first Q&A pair of each split, in train/val/test order
for split in (train_data, val_data, test_data):
    print(split[0])

Loaded 18749 Q&A pairs from processed_data/train.json!
Loaded 2344 Q&A pairs from processed_data/validation.json!
Loaded 2344 Q&A pairs from processed_data/test.json!
{'question': 'will eating late evening meals increase my cholesterol?', 'response': 'no. it is what you are eating (as well as your genetics) not when you eat it. it depends on the kinds of foods that you eat. make sure that you are eating healthy foods in order to not gain great amount of cholesterol. you have to always watch what you eat in order to have a healthy skin and body. you may check out www. clearclinic. com for great ideas to achieve an acne free skin.'}
{'question': 'who is affected by arthritis?', 'response': 'arthritis sufferers include men and women children and adults. approximately 350 million people worldwide have arthritis. nearly 40 million people in the united states are affected by arthritis including over a quarter million children! more than 27 million americans have osteoarthritis. approximately

Now let's build the GPT text generation model!

In [35]:
class GPTBaseline:
    """
    Simple baseline GPT-2 model without fine-tuning.
    Used to see how well it can respond to medical queries "out of the box"
    """
    def __init__(self, model_name="gpt2"):
        """
        Initializer function to establish the gpt model

        @PARAMS:
            - model_name -> which model to initialize
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # load model and tokenizer
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name).to(self.device)

        # set pad token to eos token (needed for GPT-2)
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def generate_response(self, question, max_length=200):
        """
        Generates a response off the baseline gpt model.

        @PARAMS:
            - question   -> user query
            - max_length -> response maximum length, in newly generated tokens
                            (the prompt is not counted against this budget)

        @RETURNS:
            - the generated answer text, with the prompt and any follow-up
              "Question:" / "Q:" continuation removed
        """
        # get prompt formatting
        prompt = f"Question: {question}\nAnswer:"

        # tokenize the input
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            padding=True
        ).to(self.device)

        # Sample a continuation. Unpacking `inputs` forwards the attention mask
        # together with the input ids. `max_new_tokens` keeps the limit about the
        # response itself, so a long question can no longer use up the whole budget.
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_length,
            num_return_sequences=1,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            no_repeat_ngram_size=3,
            pad_token_id=self.tokenizer.eos_token_id,
            eos_token_id=self.tokenizer.eos_token_id
        )

        # decode the response (prompt + continuation)
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # keep only what follows the last "Answer:" label
        response = response.split("Answer:")[-1].strip()

        # remove any continuation where the model starts inventing a new question
        for marker in ("Question:", "Q:"):
            response = response.split(marker)[0].strip()

        return response
    

Now let's test the model on the questions from the test set.

In [39]:
# Initialize model
baseline = GPTBaseline()

# pull the questions and reference answers for every example in the test set
test_questions = [point['question'] for point in test_data]
test_responses = [point['response'] for point in test_data]

print("Generating Medical Responses:\n")
with open('gpt2_baseline_results.txt', "w", encoding='utf-8') as results_file:
    for question, response in zip(test_questions, test_responses):
        model_response = baseline.generate_response(question)
        # one record per question, closed by a separator line
        results_file.writelines([
            f"Question: {question}\n",
            f"Baseline Response: {model_response}\n",
            f"\nExpected Response: {response}\n",
            "=" * 80 + "\n",
        ])

Generating Medical Responses:



Okay, sweet, we are getting results! However, they are not what we are looking for - at all. In fact, reading through some of these is pretty funny, as they are just nonsense. For example, when asked:

*who manufactures actos? my mother can’t afford it & sometimes skips her dose. do they offer an assistance program?*

the first sentence in the response is: there are many, many products available that will help you find the right product.

Clearly there is some work to do. The goal now is to fine-tune on the training data to get more doctor-like responses. The goal isn't to replace a doctor, but rather to develop a model that can speak like one and hopefully give medically accurate advice.

## Fine-Tuning GPT-2

Now we will fine-tune the model based on the training data.

In [8]:
# confirm that PyTorch can see a CUDA device before starting the fine-tuning run
cuda_ready = torch.cuda.is_available()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    # report the name of device 0
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.5.1+cu118
CUDA available: True
CUDA device: NVIDIA GeForce RTX 4070 Laptop GPU


In [None]:
# aid from https://pytorch.org/docs/stable/torch.html

class GPTFineTuned:
    """Class to define the Fine-Tuned version of GPT-2."""
    def __init__(self, model_name="gpt2"):
        """
        Initializer function to make a GPT-2 model that will be fine-tuned.

        @PARAMS:
            - model_name -> which model to start with
        """
        # make sure to run on CUDA since this will be computationally expensive
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        # load tokenizer and model
        self.tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name).to(self.device)
        
        # set the pad token
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model.config.pad_token_id = self.tokenizer.eos_token_id
        
        # give a summary of the model size 
        model_size = sum(p.numel() for p in self.model.parameters()) / 1000000
        print(f"Model size: {model_size:.2f}M parameters")

    def prepare_data(self, conversations, batch_size, is_training=True):
        """
        Function to read in the conversation data and convert it to a question/answer format.

        @PARAMS:
            - conversations -> the patient doctor conversation
            - batch_size    -> parameter for the dataloader 
            - is_training   -> parameter used for shuffling the data to preserve randomness
        """
        # re-format the conversations
        texts = [f"Question: {conv['question']}\nAnswer: {conv['response']}" 
                for conv in conversations]
        
        # tokenize the texts
        encodings = self.tokenizer(
            texts,
            truncation=True,
            padding='max_length',
            max_length=256,
            return_tensors="pt"
        )
        
        # necessary formatting
        input_ids = encodings['input_ids']
        attention_masks = encodings['attention_mask']
        
        # create the dataset in a dictionary format
        dataset_dict = {
            'input_ids': input_ids,
            'attention_mask': attention_masks,
            'labels': input_ids.clone()
        }
        
        # load the dictionary data into the correct format
        data_loader = DataLoader(
            TensorDictDataset(dataset_dict),
            batch_size=batch_size,
            shuffle=is_training
        )
        
        return data_loader
    
    def train_model(self, num_epochs=3, learning_rate=5e-5, warmup_ratio=0.1, save_path=None, gradient_accumulation_steps=2, batch_size=8):
        """ 
        MAIN TRAINING FUNCTION.
        
        @PARAMS:
            - [num_epochs, learning_rate, warmup_ratio, save_path, gradient_accumulation_steps, batch_size] -> hyperparameters to tune for better training results
        """
        print("Starting training...")
        print(f"Batch size: {batch_size}")
        print(f"Gradient accumulation steps: {gradient_accumulation_steps}")
        print(f"Effective batch size: {batch_size * gradient_accumulation_steps}")
        
        # aid from https://pytorch.org/docs/stable/notes/amp_examples.html use gradiant scaler
        # initialize gradient scaler for faster training while preserving accuracy
        scaler = GradScaler()
        
        # optimizer with weight decay 
        optimizer = AdamW(self.model.parameters(), lr=learning_rate)
        
        # scheduler to adjust learning rate
        num_training_steps = len(self.train_loader) * num_epochs
        num_warmup_steps = int(num_training_steps * warmup_ratio)
        scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )
        
        # values to store intermediate steps
        best_val_loss = float('inf')
        total_steps = 0
        
        # now run each epoch - one full pass through the data each!
        for epoch in range(num_epochs):
            print(f"\nEpoch {epoch + 1}/{num_epochs}")
            
            # reset values and start training
            self.model.train()
            train_loss = 0
            optimizer.zero_grad()
            
            # update terminal output
            progress_bar = tqdm(enumerate(self.train_loader), total=len(self.train_loader))
            
            # run the batch on the GPU! I attempted this on CPU and took forever (as expected) so dramatically speed up with batch processing!
            for batch_idx, batch in progress_bar:
                # move the batch to the GPU - this will dramatically save time!
                batch = {k: v.to(self.device) for k, v in batch.items()}
                
                # forward pass on GPU
                with autocast(device_type='cuda' if torch.cuda.is_available() else 'cpu'):
                    outputs = self.model(**batch)
                    loss = outputs.loss / gradient_accumulation_steps
                
                # backward pass
                scaler.scale(loss).backward()
                train_loss += loss.item() * gradient_accumulation_steps
                
                # update the weights, clip gradients, update learning rate!
                if (batch_idx + 1) % gradient_accumulation_steps == 0:
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                    scaler.step(optimizer)
                    scaler.update()
                    scheduler.step()
                    optimizer.zero_grad()
                    total_steps += 1
                
                # update terminal output
                progress_bar.set_description(f"Loss: {train_loss/(batch_idx+1):.4f}")
                
                # save a checkpoint every 1000 steps in case of crashes
                if total_steps > 0 and total_steps % 1000 == 0:
                    self.model.save_pretrained(f"{save_path}_step_{total_steps}")
                    print(f"\nCheckpoint saved at step {total_steps}")
            
            # update the training loss
            avg_train_loss = train_loss / len(self.train_loader)
            
            # now we need to evaluate the model!
            self.model.eval()
            val_loss = 0
            
            # now run the validation data through the model
            print("\nRunning validation...")
            with torch.no_grad():
                for batch in tqdm(self.val_loader):
                    batch = {k: v.to(self.device) for k, v in batch.items()}
                    outputs = self.model(**batch)
                    val_loss += outputs.loss.item()
            
            # get the valdidation loss
            avg_val_loss = val_loss / len(self.val_loader)
            
            print(f"Average training loss: {avg_train_loss:.4f}")
            print(f"Average validation loss: {avg_val_loss:.4f}")
            
            # save the model!!! but we really only care if it performed better than the previous, so only save if it is the best one seen (so far)
            if save_path and avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                self.model.save_pretrained(save_path)
                self.tokenizer.save_pretrained(save_path)
                print(f"Best model saved to {save_path}")

    def generate_response(self, text, max_length=200):
        """
        Function to return model output based on a user query!

        @PARAMS:
            - text       -> query to run off of
            - max_length -> response limit
        """
        self.model.eval()
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            padding=True
        ).to(self.device)
        
        with torch.no_grad():
            outputs = self.model.generate(
                inputs["input_ids"],
                max_length=max_length,
                num_return_sequences=1,
                do_sample=True,
                # we don't want the model to be too creative as this is usually info that needs to be accurate, therefore make the temperature medium level for expected tokens more likely
                temperature=0.5,
                top_p=0.9,
                no_repeat_ngram_size=3,
                early_stopping=True,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id
            )
        
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response

class TensorDictDataset(Dataset):
    """
    Dataset wrapper around a dict of indexable containers, so it can be fed to
    pytorch's DataLoader (which needs both __getitem__ and __len__).

    Item `idx` is a dict with the same keys, each holding that container's
    `idx`-th entry. The length is taken from the first value in the dict
    (assumes every value has the same length).
    """
    def __init__(self, tensor_dict):
        self.tensor_dict = tensor_dict

    def __len__(self):
        first_field = next(iter(self.tensor_dict.values()))
        return len(first_field)

    def __getitem__(self, idx):
        sample = {}
        for name, values in self.tensor_dict.items():
            sample[name] = values[idx]
        return sample

class MedicalGPT(GPTFineTuned):
    """
    Fine-tunable GPT-2 specialised for the medical Q&A task: builds the train and
    validation loaders and generates cleaned-up answers to patient questions.
    """
    def prepare_separate_datasets(self, train_data, val_data, batch_size=8):
        """
        Prepare separate training and validation datasets

        @PARAMS:
            - train_data -> conversations used for training (shuffled)
            - val_data   -> conversations used for validation (kept in order)
            - batch_size -> batch size for both data loaders

        @RETURNS:
            - (train_loader, val_loader), also stored on self for train_model
        """
        print(f"Processing {len(train_data)} training examples...")
        self.train_loader = self.prepare_data(
            train_data,
            batch_size=batch_size,
            is_training=True
        )

        print(f"Processing {len(val_data)} validation examples...")
        self.val_loader = self.prepare_data(
            val_data,
            batch_size=batch_size,
            is_training=False
        )

        print("Data preparation complete!")
        return self.train_loader, self.val_loader

    def generate_medical_response(self, question, max_length=200):
        """
        Generate a medical response with improved parameters for better quality

        @PARAMS:
            - question   -> user query
            - max_length -> maximum length of response, in newly generated tokens
                            (the prompt is not counted against this budget)

        @RETURNS:
            - the cleaned answer text, or a fixed apology message if cleaning fails
        """
        self.model.eval()
        prompt = f"Question: {question}\nAnswer:"

        # Tokenize input
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            padding=True
        ).to(self.device)

        # Generate with tuned parameters. Unpacking `inputs` forwards the attention
        # mask along with the ids; `max_new_tokens` keeps the limit about the answer
        # itself so a long question cannot use up the whole budget.
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_length,
                num_return_sequences=1,
                do_sample=True,
                # focus the responses - make the next token be one that is more likely. We don't want the model to be too creative with this task.
                temperature=0.3,
                top_p=0.85,
                top_k=50,
                no_repeat_ngram_size=4,
                min_length=50,
                repetition_penalty=1.2,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id
            )

        # Decode the response (prompt + continuation)
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        try:
            # Remove the original question and prompt
            response = response.split("Answer:")[-1].strip()

            # Remove any follow-up question the model started to invent
            for marker in ("Question:", "Q:"):
                response = response.split(marker)[0].strip()

            # Clean up repetitive phrases
            response = self.clean_repetitive_text(response)

        except Exception as e:
            # keep the user-facing fallback, but do not hide what went wrong
            print(f"Warning: could not clean the generated response: {e}")
            response = "I apologize, but I couldn't generate a proper medical response. Please consult a healthcare professional for medical advice."

        return response

    def clean_repetitive_text(self, text):
        """
        Clean up repetitive phrases and improve formatting

        Sentences are lower-cased, de-duplicated (first occurrence wins),
        capitalized and re-joined with ". "; a final period is added if missing.

        @PARAMS:
            - text -> raw answer text

        @RETURNS:
            - the cleaned text ("" when the input holds no sentences)
        """
        import re  # local import: the notebook's import cell does not bring in `re`

        # Split into sentences on periods, but leave a decimal point between two
        # digits alone so values such as "2.5 mg" are not torn into "2. 5 mg".
        sentences = re.split(r'(?<!\d)\.|\.(?!\d)', text)

        # Remove duplicate sentences
        seen_sentences = set()
        cleaned_sentences = []

        for sentence in sentences:
            sentence = sentence.strip().lower()
            if sentence and sentence not in seen_sentences:
                seen_sentences.add(sentence)
                cleaned_sentences.append(sentence)

        # Capitalize first letters and rejoin
        cleaned_text = '. '.join(s.capitalize() for s in cleaned_sentences)

        # Add final period if missing
        if cleaned_text and not cleaned_text.endswith('.'):
            cleaned_text += '.'

        return cleaned_text

In [None]:
# prevent the computer from sleeping when running!
# Each platform maps to the command(s) to launch and the message to print afterwards.
# NOTE(review): the Windows and Linux commands look like machine-wide settings that
# are never restored afterwards, and the macOS helper is never stopped — confirm
# this is intended before running on a shared machine.
_SLEEP_GUARDS = {
    'Windows': (
        [
            ['powercfg', '-change', '-standby-timeout-ac', '0'],
            ['powercfg', '-change', '-monitor-timeout-ac', '0'],
        ],
        "Sleep prevention activated for Windows",
    ),
    'Darwin': (  # macOS
        [['caffeinate', '-i']],
        "Sleep prevention activated for macOS",
    ),
    'Linux': (
        [['systemctl', 'mask', 'sleep.target', 'suspend.target',
          'hibernate.target', 'hybrid-sleep.target']],
        "Sleep prevention activated for Linux",
    ),
}

try:
    _guard = _SLEEP_GUARDS.get(platform.system())
    # unknown platforms are silently skipped
    if _guard is not None:
        _commands, _message = _guard
        for _command in _commands:
            # fire and forget: the command is started but its result is not checked
            subprocess.Popen(_command)
        print(_message)
except Exception as e:
    print(f"Warning: Could not prevent sleep mode: {e}")

# build a fresh model to fine-tune
model = MedicalGPT()

# turn the raw Q&A pairs into the train / validation data loaders
train_loader, val_loader = model.prepare_separate_datasets(train_data, val_data, batch_size=4)

# hyperparameters for the fine-tuning run
training_config = {
    "num_epochs": 3,
    "learning_rate": 5e-5,
    "save_path": "./medical_gpt2_model",
    "gradient_accumulation_steps": 4,
    "batch_size": 4,
}

# train the model, with a gpu this will take roughly 22 minutes
model.train_model(**training_config)

# quick smoke test with an arbitrary input
question = "What can I do to lower my blood pressure naturally?"
response = model.generate_medical_response(question)
print(response)

Sleep prevention activated for Windows
Model size: 124.44M parameters
Processing 18749 training examples...
Processing 2344 validation examples...




Data preparation complete!
Starting training...
Batch size: 4
Gradient accumulation steps: 4
Effective batch size: 16

Epoch 1/3


Loss: 2.1611:  85%|████████▌ | 4000/4688 [06:17<02:13,  5.16it/s]


Checkpoint saved at step 1000


Loss: 2.1612:  85%|████████▌ | 4001/4688 [06:17<03:20,  3.43it/s]


Checkpoint saved at step 1000


Loss: 2.1611:  85%|████████▌ | 4002/4688 [06:18<04:24,  2.60it/s]


Checkpoint saved at step 1000


Loss: 2.1611:  85%|████████▌ | 4004/4688 [06:19<04:21,  2.61it/s]


Checkpoint saved at step 1000


Loss: 2.1131: 100%|██████████| 4688/4688 [07:23<00:00, 10.57it/s]



Running validation...


100%|██████████| 586/586 [00:30<00:00, 19.04it/s]


Average training loss: 2.1131
Average validation loss: 1.7759
Best model saved to ./medical_gpt2_model

Epoch 2/3


Loss: 1.7930:  71%|███████   | 3311/4688 [05:11<02:06, 10.89it/s]


Checkpoint saved at step 2000


Loss: 1.7927:  71%|███████   | 3313/4688 [05:12<06:44,  3.40it/s]


Checkpoint saved at step 2000


Loss: 1.7929:  71%|███████   | 3314/4688 [05:13<08:41,  2.64it/s]


Checkpoint saved at step 2000


Loss: 1.7928:  71%|███████   | 3316/4688 [05:14<08:38,  2.65it/s]


Checkpoint saved at step 2000


Loss: 1.7893: 100%|██████████| 4688/4688 [07:22<00:00, 10.59it/s]



Running validation...


100%|██████████| 586/586 [00:30<00:00, 19.01it/s]


Average training loss: 1.7893
Average validation loss: 1.7106
Best model saved to ./medical_gpt2_model

Epoch 3/3


Loss: 1.7216:  56%|█████▌    | 2624/4688 [04:06<06:45,  5.09it/s]


Checkpoint saved at step 3000


Loss: 1.7216:  56%|█████▌    | 2625/4688 [04:07<10:01,  3.43it/s]


Checkpoint saved at step 3000


Loss: 1.7216:  56%|█████▌    | 2626/4688 [04:08<13:06,  2.62it/s]


Checkpoint saved at step 3000


Loss: 1.7213:  56%|█████▌    | 2628/4688 [04:09<12:52,  2.67it/s]


Checkpoint saved at step 3000


Loss: 1.7086: 100%|██████████| 4688/4688 [07:22<00:00, 10.60it/s]



Running validation...


100%|██████████| 586/586 [00:30<00:00, 19.00it/s]


Average training loss: 1.7086
Average validation loss: 1.6788
Best model saved to ./medical_gpt2_model




you can reduce your blood pressure by: taking a blood pressure medication such as prednisone. using a blood thinning medication such a metronidazole. using an over-the-counter medication such aspirin. taking a medication that contains caffeine. taking an antihistamine such as cetirizine. taking medications that contain a substance called a diuretic. taking antihistamines such as naproxen. taking anticonvulsants such as metroniramate. taking metronitrate. taking diuretics such as ibuprofen.


In [31]:
# Load the fine-tuned model! Training saved both the weights and the tokenizer to
# this folder, so it can be passed straight to the constructor instead of loading
# base GPT-2 first and then overwriting it.
model = MedicalGPT("./medical_gpt2_model")

# get all testing data results:
questions = [point['question'] for point in test_data]
responses = [point['response'] for point in test_data]


# Generate responses
print("Generating Medical Responses:\n")
with open('gpt2_fine_tuned_results.txt', "w", encoding='utf-8') as f:
    for question, response in zip(questions, responses):
        # Use the prompt-aware generator: it wraps the question in the same
        # "Question: ...\nAnswer:" format the model was fine-tuned on and returns
        # only the cleaned answer.
        model_response = model.generate_medical_response(question)
        # same record layout as the baseline results file, so the two can be compared
        f.write(f"Question: {question}\n")
        f.write(f"Fine-Tuned Response: {model_response}\n")
        f.write(f"\nExpected Response: {response}\n")
        # response separator
        f.write("=" * 80 + "\n")

Model size: 124.44M parameters
Generating Medical Responses:

