In [15]:
from typing import List, Tuple
from difflib import SequenceMatcher
from nltk import ngrams
from collections import Counter
import re
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

In [11]:
medical_terms = [
    # Diseases and Conditions
    "diabetes", "hypertension", "asthma", "pneumonia", "arthritis", "stroke", "cancer", 
    "migraine", "obesity", "anemia", "depression", "anxiety", "eczema", "allergies", 
    "bronchitis", "sinusitis", "gout", "ulcer", "psoriasis", "hypothyroidism", "hyperthyroidism", 
    "chronic pain", "fibromyalgia", "osteoporosis", "tuberculosis", "chronic kidney disease", 
    "liver disease", "hepatitis", "HIV", "heart disease", "atrial fibrillation", "acid reflux", 
    "gastroesophageal reflux disease (GERD)", "irritable bowel syndrome (IBS)", "constipation", 
    "diarrhea", "celiac disease", "gluten intolerance", "insomnia", "sleep apnea", "anemia", 
    "polycystic ovary syndrome (PCOS)", "endometriosis", "menopause", "erectile dysfunction", 
    "benign prostatic hyperplasia (BPH)", "urinary tract infection (UTI)", "gallstones", 
    "kidney stones", "osteomyelitis", "skin infection", "acne", "vitamin deficiency", 
    "malnutrition", "chronic fatigue syndrome", "common cold", "influenza", "conjunctivitis",
    
    # Symptoms
    "fever", "cough", "shortness of breath", "fatigue", "nausea", "vomiting", "headache", 
    "rash", "dizziness", "chills", "sweating", "palpitations", "chest pain", "abdominal pain", 
    "joint pain", "back pain", "muscle pain", "sore throat", "runny nose", "stuffy nose", 
    "loss of taste", "loss of smell", "itching", "swelling", "weakness", "tingling", 
    "numbness", "burning sensation", "blurred vision", "double vision", "hearing loss", 
    "ringing in the ears", "frequent urination", "painful urination", "blood in urine", 
    "weight loss", "weight gain", "appetite loss", "constipation", "diarrhea", "bloody stools", 
    "hair loss", "skin discoloration", "easy bruising", "frequent infections", "difficulty swallowing", 
    "heartburn", "difficulty sleeping", "night sweats", "anxiety", "depression", "confusion", 
    "memory loss", "slurred speech", "difficulty walking", "seizures", "fainting", 

    # Medications
    "aspirin", "ibuprofen", "acetaminophen", "metformin", "lisinopril", "amlodipine", "atorvastatin", 
    "simvastatin", "omeprazole", "pantoprazole", "ranitidine", "warfarin", "clopidogrel", "losartan", 
    "furosemide", "hydrochlorothiazide", "spironolactone", "prednisone", "dexamethasone", "albuterol", 
    "salbutamol", "insulin", "metoprolol", "propranolol", "carvedilol", "atenolol", "amoxicillin", 
    "cephalexin", "azithromycin", "doxycycline", "ciprofloxacin", "levofloxacin", "trimethoprim", 
    "sulfamethoxazole", "clindamycin", "fluconazole", "ketoconazole", "acyclovir", "valacyclovir", 
    "gabapentin", "pregabalin", "duloxetine", "venlafaxine", "sertraline", "fluoxetine", "paroxetine", 
    "citalopram", "escitalopram", "bupropion", "trazodone", "mirtazapine", "olanzapine", "risperidone", 
    "quetiapine", "aripiprazole", "lithium", "haloperidol", "lorazepam", "alprazolam", "diazepam", 
    "clonazepam", "zolpidem", "eszopiclone", "modafinil", "lactulose", "mirabegron", "oxybutynin", 

    # Procedures and Diagnostics
    "blood test", "urine test", "X-ray", "CT scan", "MRI", "ultrasound", "biopsy", "colonoscopy", 
    "endoscopy", "echocardiogram", "electrocardiogram (ECG)", "stress test", "pulmonary function test", 
    "spirometry", "allergy test", "skin biopsy", "pap smear", "mammogram", "bone density test", 
    "cholesterol test", "blood sugar test", "thyroid test", "liver function test", "kidney function test", 
    "hematocrit", "hemoglobin", "complete blood count (CBC)", "prostate exam", "eye exam", 
    "hearing test", "mental health screening", "vaccination", "flu shot", "COVID-19 test", 

    # Body Parts and General Terms
    "heart", "lungs", "liver", "kidneys", "stomach", "intestines", "brain", "spinal cord", 
    "nerves", "muscles", "bones", "joints", "skin", "eyes", "ears", "nose", "throat", "teeth", 
    "gums", "tongue", "esophagus", "pancreas", "spleen", "bladder", "prostate", "ovaries", 
    "uterus", "cervix", "testicles", "penis", "vagina", "urethra", "blood vessels", "arteries", 
    "veins", "lymph nodes", "lymphatic system", "immune system", "endocrine system", 
    "nervous system", "respiratory system", "digestive system", "reproductive system", 

    # Medical Terms and Concepts
    "diagnosis", "treatment", "prognosis", "symptom", "syndrome", "acute", "chronic", "remission", 
    "relapse", "inflammation", "infection", "immunity", "antibody", "antigen", "vaccine", "therapy", 
    "surgery", "medication", "prescription", "dosage", "side effect", "adverse reaction", "contraindication", 
    "allergy", "anesthesia", "sedation", "rehabilitation", "physical therapy", "occupational therapy", 
    "counseling", "psychotherapy", "diet", "exercise", "hydration", "rest", "smoking cessation", 
    "alcohol abstinence", "monitoring", "follow-up", "screening", "preventive care", "primary care", 
    "specialist", "referral", "emergency", "urgent care", "hospitalization", "outpatient", "clinic"
]

In [22]:
def extract_medical_concepts(text, medical_terms):
    text = text.lower()
    found_concepts = set()
    for term in medical_terms:
        if re.search(r'\b' + re.escape(term) + r'\b', text):
            found_concepts.add(term)
    return found_concepts

def compute_metrics(ground_truth, predicted):
    y_true = [1 if term in ground_truth else 0 for term in medical_terms]
    y_pred = [1 if term in predicted else 0 for term in medical_terms]
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return precision, recall, f1

def medical_concepts(transcript, whisper, medical_terms):
    ground_truth_concepts = extract_medical_concepts(transcript, medical_terms)
    predicted_concepts = extract_medical_concepts(whisper, medical_terms)
    precision, recall, f1 = compute_metrics(ground_truth_concepts, predicted_concepts)
    return precision, recall, f1

In [23]:
def compute_wer(reference: str, hypothesis: str) -> float:
    """
    Compute Word Error Rate (WER).
    """
    ref_words = reference.split()
    hyp_words = hypothesis.split()
    matcher = SequenceMatcher(None, ref_words, hyp_words)
    edit_operations = sum(op[0] != 'equal' for op in matcher.get_opcodes())
    
    wer = edit_operations / len(ref_words) if ref_words else 0
    return wer

def compute_cer(reference: str, hypothesis: str) -> float:
    """
    Compute Character Error Rate (CER).
    """
    matcher = SequenceMatcher(None, reference, hypothesis)
    edit_operations = sum(op[0] != 'equal' for op in matcher.get_opcodes())
    
    cer = edit_operations / len(reference) if reference else 0
    return cer

def compute_ngram_similarity(reference: str, hypothesis: str, n: int) -> float:
    """
    Compute n-gram similarity.
    """
    ref_ngrams = Counter(ngrams(reference.split(), n))
    hyp_ngrams = Counter(ngrams(hypothesis.split(), n))
    
    common = sum((ref_ngrams & hyp_ngrams).values())  # Intersection of n-grams
    total = sum(ref_ngrams.values())
    
    similarity = common / total if total else 0
    return similarity

def analyze_transcripts(reference: str, hypothesis: str):
    """
    Analyze WER, CER, and n-gram similarities between two transcripts.
    """
    wer = compute_wer(reference, hypothesis)
    cer = compute_cer(reference, hypothesis)
    unigram_similarity = compute_ngram_similarity(reference, hypothesis, 1)
    bigram_similarity = compute_ngram_similarity(reference, hypothesis, 2)
    
    print("Word Error Rate (WER):", round(wer, 4))
    print("Character Error Rate (CER):", round(cer, 4))
    print("Unigram Similarity:", round(unigram_similarity, 4))
    print("Bigram Similarity:", round(bigram_similarity, 4))

## Load Data

In [16]:
df = pd.read_csv('./data/LoganReview.csv')
all_ehrs = []
all_notes = []
all_conversations = []
for i in range(5):
    conversation = df['conversation'][i]
    note = df['note'][i]
    ehr = df['ehr'][i]
    all_ehrs.append(ehr)
    all_notes.append(note)
    all_conversations.append(conversation)

In [34]:
all_whispers = []
for i in range(5):
    
    file_path = f"./data/whisper_transcripts/transcript_{i}.txt"
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    all_whispers.append(content)

In [48]:
all_gpts = []
for i in range(5):
    
    file_path = f"./data/whisper_transcripts/transcript_{i}_gpt.txt"
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    all_gpts.append(content)

## Medical concepts Note -> EHR

In [27]:
average_precision = 0
average_recall = 0
average_f1 = 0

for i in range(5):
    note = all_notes[i]
    ehr = all_ehrs[i]
    precision, recall, f1 = medical_concepts(note, ehr, medical_terms)
    print(i)
    print(precision, recall, f1)
    print()
    average_precision += precision
    average_recall += recall
    average_f1 += f1

print(average_precision/5, average_recall/5, average_f1/5)

0
0.5714285714285714 0.5714285714285714 0.5714285714285714

1
0.6153846153846154 0.5 0.5517241379310345

2
0.6363636363636364 0.6363636363636364 0.6363636363636364

3
0.25 0.3333333333333333 0.2857142857142857

4
0.5555555555555556 0.625 0.5882352941176471

0.5257464757464758 0.5332251082251083 0.526693185111035


## Medical concepts Transcript

In [49]:
average_precision = 0
average_recall = 0
average_f1 = 0

for i in [0,2,4]:
    conversation = all_conversations[i].replace("[doctor]", "").replace("[patient]", "")
    transcript = all_whispers[i]
    gpt_transcript = all_gpts[i].replace("Speaker 1", "").replace("Speaker 2", "").replace("Doctor", "").replace("Patient", "").replace("Sophia", "")
    print(i)
    precision, recall, f1 = medical_concepts(conversation, transcript, medical_terms)
    print(precision, recall, f1)
    precision, recall, f1 = medical_concepts(conversation, gpt_transcript, medical_terms)
    print(precision, recall, f1)
    print()
    average_precision += precision
    average_recall += recall
    average_f1 += f1

print(average_precision/3, average_recall/3, average_f1/3)

0
0.75 0.8181818181818182 0.782608695652174
0.9 0.8181818181818182 0.8571428571428571

2
0.8461538461538461 0.7857142857142857 0.8148148148148148
0.9230769230769231 0.8571428571428571 0.8888888888888888

4
0.75 1.0 0.8571428571428571
0.8888888888888888 0.8888888888888888 0.8888888888888888

0.903988603988604 0.8547378547378548 0.8783068783068783


## WER, CER Transcript -> Whisper

In [50]:
for i in [0, 2, 4]:
    conversation = all_conversations[i].replace("[doctor]", "").replace("[patient]", "")
    transcript = all_whispers[i]
    gpt_transcript = all_gpts[i].replace("Speaker 1", "").replace("Speaker 2", "").replace("Doctor", "").replace("Patient", "").replace("Sophia", "")
    print(i)
    analyze_transcripts(conversation, transcript)
    print()
    analyze_transcripts(conversation, gpt_transcript)
    print()

0
Word Error Rate (WER): 0.059
Character Error Rate (CER): 0.0009
Unigram Similarity: 0.4209
Bigram Similarity: 0.127

Word Error Rate (WER): 0.0608
Character Error Rate (CER): 0.0016
Unigram Similarity: 0.2958
Bigram Similarity: 0.1127

2
Word Error Rate (WER): 0.0475
Character Error Rate (CER): 0.0029
Unigram Similarity: 0.4272
Bigram Similarity: 0.1209

Word Error Rate (WER): 0.0867
Character Error Rate (CER): 0.005
Unigram Similarity: 0.3426
Bigram Similarity: 0.1333

4
Word Error Rate (WER): 0.0763
Character Error Rate (CER): 0.0008
Unigram Similarity: 0.423
Bigram Similarity: 0.1167

Word Error Rate (WER): 0.0874
Character Error Rate (CER): 0.0017
Unigram Similarity: 0.2913
Bigram Similarity: 0.0889

