Use Microsoft Copilot to annotate the dataset for answers to each of the 4 questions that will be asked so there is data to finetune the model with. The following was a prompt used to gather the annotated data:

#### Use this file and find the symptoms of the patient that are described in the transcription column. Report the words related to the symptoms in a new column called "symptoms_true". FIND SYMPTOMS IN AS MANY RECORDS AS POSSIBLE. If no complaints found in the transcript, report "no symptoms listed"

In [None]:
!#pip install transformers datasets torch evaluate

In [None]:
import torch
import pandas as pd
import evaluate
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments

In [None]:
print(f"CUDA available: {torch.cuda.is_available()}")

CUDA available: True


In [None]:
# Load dataset
data_path = "/content/drive/MyDrive/UNH Data Science Courses/DSCI 6004 - NLP/Bonus Assignment/updated_transcription.csv"
data_path= "/content/updated_transcription.csv"
data = pd.read_csv(data_path)

In [None]:
# Define the questions and map them to the respective columns
questions = {
    "How old is the patient?": "age_true",
    "Does the patient have any complaints?": "complaints_true",
    "What is the reason for this consultation?": "reason_true",
    "What other symptoms does the patient have?": "symptoms_true",
}

# Create a list to store the reformatted data
qa_data = []

# Iterate over each row to structure the data
for _, row in data.iterrows():
    context = row["transcription"]
    for question, answer_col in questions.items():
        qa_data.append({
            "context": context,
            "question": question,
            "answer": row[answer_col]
        })

# Convert to a Hugging Face Dataset format
qa_df = pd.DataFrame(qa_data)
train_data, eval_data = train_test_split(qa_df, test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_data)
eval_dataset = Dataset.from_pandas(eval_data)

In [None]:
# Initialize the tokenizer for deepset/tinyroberta-squad2
model_name = "deepset/tinyroberta-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    questions = examples["question"]
    contexts = examples["context"]
    answers = examples["answer"]

    # Tokenize the questions and contexts
    inputs = tokenizer(questions, contexts, max_length=512, truncation=True, padding="max_length")

    # Find the start and end positions of the answer in the context
    start_positions = []
    end_positions = []
    for i, context in enumerate(contexts):
        answer = answers[i]

        # Find the start and end indices of the answer in the context
        start_idx = context.find(answer)
        if start_idx != -1:
            end_idx = start_idx + len(answer) - 1
            # Tokenize the answer to get token indices
            start_position = tokenizer.encode(context[:start_idx], add_special_tokens=False)
            end_position = tokenizer.encode(context[:end_idx + 1], add_special_tokens=False)

            start_positions.append(len(start_position))  # Start position is the length of the tokens before the answer
            end_positions.append(len(end_position) - 1)  # End position is the length of tokens before the end of the answer
        else:
            start_positions.append(0)  # Default to 0 if the answer is not found
            end_positions.append(0)  # Default to 0 if the answer is not found

    # Add the positions to the inputs
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

# Preprocess the datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/15891 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1034 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/3973 [00:00<?, ? examples/s]

In [None]:
# Load the evaluation metrics (F1 and Exact Match)
metric = evaluate.load("squad")

def compute_metrics(eval_pred):
    # Unpack the tuple containing logits and labels
    logits, labels = eval_pred

    # Extract start and end logits and labels
    start_logits, end_logits = logits
    start_labels, end_labels = labels

    # Ensure the logits are PyTorch tensors (if they are not)
    start_logits = torch.tensor(start_logits) if not isinstance(start_logits, torch.Tensor) else start_logits
    end_logits = torch.tensor(end_logits) if not isinstance(end_logits, torch.Tensor) else end_logits

    # Convert logits to predictions (argmax)
    start_preds = torch.argmax(start_logits, dim=-1).numpy()  # Convert to numpy
    end_preds = torch.argmax(end_logits, dim=-1).numpy()      # Convert to numpy

    # Create span predictions by combining start and end predictions
    pred_spans = [{"id": str(i), "prediction_text": f"Start: {start} End: {end}"} for i, (start, end) in enumerate(zip(start_preds, end_preds))]

    # Create the references with text and answer_start
    true_spans = [{"id": str(i), "answers": [{"text": f"Start: {start} End: {end}", "answer_start": start}]} for i, (start, end) in enumerate(zip(start_labels, end_labels))]

    # Compute results for the full span (start and end combined)
    results = metric.compute(predictions=pred_spans, references=true_spans)

    return results

In [None]:
# Load the pre-trained model
model = AutoModelForQuestionAnswering.from_pretrained(model_name).to('cuda')

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Output directory for saving the model
    evaluation_strategy="epoch",     # Evaluate after each epoch
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=64,   # Batch size for training
    per_device_eval_batch_size=64,    # Batch size for evaluation
    num_train_epochs=5,              # Number of training epochs
    weight_decay=0.01,               # Weight decay
    push_to_hub=False,               # Don't push to Hugging Face Hub
    fp16=True,                       # Enable mixed precision for faster training
    report_to="none"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Exact Match,F1
1,No log,1.10365,75.358671,89.164359
2,No log,0.802555,77.019884,90.89479
3,1.394500,0.607937,82.230053,92.530833
4,1.394500,0.526496,83.916436,93.355147
5,0.687900,0.498918,84.218475,93.487289


TrainOutput(global_step=1245, training_loss=0.9440525162172126, metrics={'train_runtime': 1125.55, 'train_samples_per_second': 70.592, 'train_steps_per_second': 1.106, 'total_flos': 1.038104206304256e+16, 'train_loss': 0.9440525162172126, 'epoch': 5.0})

In [None]:
# Save the fine-tuned model
model.save_pretrained("fine_tuned_medical_qa_model")
tokenizer.save_pretrained("fine_tuned_medical_qa_model")

('fine_tuned_medical_qa_model/tokenizer_config.json',
 'fine_tuned_medical_qa_model/special_tokens_map.json',
 'fine_tuned_medical_qa_model/vocab.json',
 'fine_tuned_medical_qa_model/merges.txt',
 'fine_tuned_medical_qa_model/added_tokens.json',
 'fine_tuned_medical_qa_model/tokenizer.json')

In [None]:
# Evaluate the model
eval_results = trainer.evaluate()

# Print the evaluation results
print(eval_results)

{'eval_loss': 0.498918354511261, 'eval_exact_match': 84.21847470425371, 'eval_f1': 93.48728920211427, 'eval_runtime': 18.0129, 'eval_samples_per_second': 220.564, 'eval_steps_per_second': 3.497, 'epoch': 5.0}
