In [1]:
import re
import json

def generate_clinical_note(symptoms, diagnosis, treatment):
    return {
        "Symptoms": symptoms if symptoms else [{"note": "None identified."}],
        "Diagnosis": diagnosis if diagnosis else [{"note": "None identified."}],
        "Treatment": treatment if treatment else [{"note": "None identified."}]
    }

### Keywords dict

In [2]:
SYMPTOM_KEYWORDS = {
    "General": [
        "fever", "chills", "fatigue", "weak", "exhausted", "lethargic",
        "unwell", "weight loss", "weight gain", "sweating", "sore stomach"
    ],
    "Pain‐related": [
        "pain", "ache", "discomfort", "hurt", "hurting",
        "headache", "joint pain", "muscle pain", "back pain", "chest tightness"
    ],
    "Respiratory": [
        "cough", "shortness of breath", "wheezing",
        "congestion", "runny nose", "sneezing", "phlegm"
    ],
    "Gastrointestinal": [
        "nausea", "vomiting", "diarrhea", "constipation",
        "bloating", "cramping", "heartburn", "loss of appetite"
    ],
    "Neurological": [
        "dizzy", "confusion", "fainting", "seizure",
        "tremor", "numbness", "tingling", "vision problems", "hearing loss", "stiffness"
    ],
    "Dermatological": [
        "rash", "itching", "redness", "bumps", "blisters", "swelling"
    ]
}

DIAGNOSIS_KEYWORDS = {
    "General Conditions": [
        "infection", "condition", "disease", "syndrome", "disorder", "medicine"
        "abnormality", "deficiency", "injury", "trauma", "obstruction"
    ],
    "Specific Diagnoses": [
        "pneumonia", "bronchitis", "asthma", "diabetes", "hypertension",
        "migraine", "arthritis", "gastritis", "anemia", "thyroid",
        "cancer", "allergy", "inflammation"
    ],
    "Modifiers/Status": [
        "diagnosed", "likely", "appears to be", "consistent with",
        "indicative of", "suspected", "confirmed", "ruled out",
        "chronic", "acute", "recurrent", "persistent"
    ]
}

TREATMENT_KEYWORDS = {
    "Medications": [
        "prescribe", "medication", "antibiotics", "analgesics", "antihistamines",
        "steroids", "insulin", "beta-blockers", "antacids", "inhaler",
        "inhaled", "cream", "ointment", "chemotherapy", "radiation"
    ],
    "Procedures": [
        "surgery", "imaging", "x-ray", "ultrasound", "biopsy",
        "endoscopy", "referral", "monitor", "follow-up", "rehabilitation"
    ],
    "Lifestyle/Advice": [
        "rest", "hydrate", "diet", "exercise", "sleep", "fluids", "avoid",
        "stop smoking", "physical therapy", "counseling"
    ],
    "Dosages/Instructions": [
        "take", "continue", "adjust", "apply"
    ]
}

In [13]:
# Common negation words
NEGATION_WORDS = {
    "no", "not", "denies", "without", "deny", "denied",
    "don't", "doesn't", "ain't", "don’t", "doesn’t", "struggle", "struggling", "struggles", "struggled"
}

# Synonyms
SYMPTOM_SYNONYMS = {
  "shortness of breath": ["Dyspnea", "Breathlessness", "Labored breathing", "Difficulty breathing"],
  "fever": ["Pyrexia", "High temperature"],
  "pain": ["Ache", "Soreness"]
}

DIAGNOSIS_SYNONYMS = {
    "diagnosis": ["assessment", "evaluation", "identification"],
    "infection": ["contagion", "infestation", "sepsis"],
    "cancer": ["malignancy", "neoplasm", "carcinoma", "tumor"],
}
TREATMENT_SYNONYMS = {
    "treatment": ["therapy", "intervention", "management"],
    "medication": ["drug", "pharmaceutical", "pharmacotherapy"],
    "steroid": ["corticosteroid", "glucocorticoid"],
    "surgery": ["operation", "operative procedure", "surgical intervention"],
}

### Examples

In [4]:
dialogue_2 = [
    {
        "utterance_id": "utterance_0",
        "speaker": "SPEAKER_00",
        "start": 4.76,
        "end": 6.42,
        "text": "Hello, Mr. McKay."
    },
    {
        "utterance_id": "utterance_1",
        "speaker": "SPEAKER_00",
        "start": 7.28,
        "end": 9.02,
        "text": "What brings you here today?"
    },
    {
        "utterance_id": "utterance_2",
        "speaker": "SPEAKER_01",
        "start": 9.74,
        "end": 12.76,
        "text": "I have a fever, a sore stomach, and a shortness of breath. But I don't have diarrhea."
    },
    {
        "utterance_id": "utterance_3",
        "speaker": "SPEAKER_00",
        "start": 13.22,
        "end": 14.16,
        "text": "Okay, Tony."
    },
    {
        "utterance_id": "utterance_4",
        "speaker": "SPEAKER_00",
        "start": 15.28,
        "end": 19.0,
        "text": "I see your temperature is 104 degrees."
    },
    {
        "utterance_id": "utterance_5",
        "speaker": "SPEAKER_00",
        "start": 19.74,
        "end": 21.3,
        "text": "That's very high."
    },
    {
        "utterance_id": "utterance_6",
        "speaker": "SPEAKER_01",
        "start": 22.24,
        "end": 25.5,
        "text": "Yes, I feel very dizzy and nauseous."
    },
    {
        "utterance_id": "utterance_7",
        "speaker": "SPEAKER_00",
        "start": 25.8,
        "end": 27.2,
        "text": "Did you get sick?"
    },
    {
        "utterance_id": "utterance_8",
        "speaker": "SPEAKER_01",
        "start": 28.34,
        "end": 31.38,
        "text": "Yes, I vomited twice this morning."
    },
    {
        "utterance_id": "utterance_9",
        "speaker": "SPEAKER_00",
        "start": 31.78,
        "end": 33.44,
        "text": "Did you have any diarrhea?"
    },
    {
        "utterance_id": "utterance_10",
        "speaker": "SPEAKER_01",
        "start": 34.3,
        "end": 36.36,
        "text": "Yes, a little bit."
    },
    {
        "utterance_id": "utterance_11",
        "speaker": "SPEAKER_00",
        "start": 36.72,
        "end": 39.76,
        "text": "Did you take any medicine to treat your symptoms?"
    },
    {
        "utterance_id": "utterance_12",
        "speaker": "SPEAKER_01",
        "start": 40.42,
        "end": 42.86,
        "text": "No, doctor. I didn't take anything."
    },
    {
        "utterance_id": "utterance_13",
        "speaker": "SPEAKER_00",
        "start": 43.38,
        "end": 47.22,
        "text": "Okay, sounds like you may have some food poisoning."
    },
    {
        "utterance_id": "utterance_14",
        "speaker": "SPEAKER_01",
        "start": 47.9,
        "end": 48.9,
        "text": "Oh, no."
    },
    {
        "utterance_id": "utterance_15",
        "speaker": "SPEAKER_00",
        "start": 49.84,
        "end": 55.24,
        "text": "Take this medicine now and again every six hours until it's finished."
    },
    {
        "utterance_id": "utterance_16",
        "speaker": "SPEAKER_00",
        "start": 55.5,
        "end": 58.6,
        "text": "You'll be okay. You'll be okay in about 24 hours."
    },
    {
        "utterance_id": "utterance_17",
        "speaker": "SPEAKER_01",
        "start": 59.2,
        "end": 60.12,
        "text": "That's a relief."
    },
    {
        "utterance_id": "utterance_18",
        "speaker": "SPEAKER_01",
        "start": 60.52,
        "end": 62.36,
        "text": "Thank you very much, doctor. Thank you, doctor."
    }
]

In [5]:
dialogue_3 = [
    {
        "utterance_id": "utterance_0",
        "speaker": "SPEAKER_00",
        "start": 0.00,
        "end": 2.50,
        "text": "Good morning, Ms. Lee."
    },
    {
        "utterance_id": "utterance_1",
        "speaker": "SPEAKER_00",
        "start": 2.70,
        "end": 4.10,
        "text": "What brings you in today?"
    },
    {
        "utterance_id": "utterance_2",
        "speaker": "SPEAKER_01",
        "start": 4.50,
        "end": 7.20,
        "text": "Doctor, I've been feeling very fatigued and weak for the last three days."
    },
    {
        "utterance_id": "utterance_3",
        "speaker": "SPEAKER_00",
        "start": 7.60,
        "end": 9.00,
        "text": "Do you have any fever or chills?"
    },
    {
        "utterance_id": "utterance_4",
        "speaker": "SPEAKER_01",
        "start": 9.40,
        "end": 12.00,
        "text": "I’ve had low-grade fever, around 100°F last night, but no chills."
    },
    {
        "utterance_id": "utterance_5",
        "speaker": "SPEAKER_00",
        "start": 12.50,
        "end": 14.20,
        "text": "Any cough or shortness of breath?"
    },
    {
        "utterance_id": "utterance_6",
        "speaker": "SPEAKER_01",
        "start": 14.60,
        "end": 17.10,
        "text": "I cough sometimes, coughing often, and don't cough usually, but I don’t have any chest pain or tightness."
    },
    {
        "utterance_id": "utterance_7",
        "speaker": "SPEAKER_00",
        "start": 17.50,
        "end": 19.00,
        "text": "Do you feel dizzy or have any headaches?"
    },
    {
        "utterance_id": "utterance_8",
        "speaker": "SPEAKER_01",
        "start": 19.40,
        "end": 22.00,
        "text": "Yes, I’ve been a bit dizzy, and my head feels heavy at times."
    },
    {
        "utterance_id": "utterance_9",
        "speaker": "SPEAKER_00",
        "start": 22.40,
        "end": 24.10,
        "text": "Have you had any nausea, vomiting, or diarrhea?"
    },
    {
        "utterance_id": "utterance_10",
        "speaker": "SPEAKER_01",
        "start": 24.50,
        "end": 27.00,
        "text": "No vomiting, but I did have two episodes of mild diarrhea yesterday."
    },
    {
        "utterance_id": "utterance_11",
        "speaker": "SPEAKER_00",
        "start": 27.40,
        "end": 29.20,
        "text": "Any abdominal pain or cramping?"
    },
    {
        "utterance_id": "utterance_12",
        "speaker": "SPEAKER_01",
        "start": 29.60,
        "end": 32.10,
        "text": "My stomach feels a bit crampy, but it’s mostly discomfort rather than sharp pain."
    },
    {
        "utterance_id": "utterance_13",
        "speaker": "SPEAKER_00",
        "start": 32.50,
        "end": 34.00,
        "text": "Do you have any skin changes—rash, itching, or swelling?"
    },
    {
        "utterance_id": "utterance_14",
        "speaker": "SPEAKER_01",
        "start": 34.40,
        "end": 36.00,
        "text": "No rash, but I noticed some redness and mild swelling around my ankles."
    },
    {
        "utterance_id": "utterance_15",
        "speaker": "SPEAKER_00",
        "start": 36.40,
        "end": 38.20,
        "text": "Do you take any medications currently?"
    },
    {
        "utterance_id": "utterance_16",
        "speaker": "SPEAKER_01",
        "start": 38.60,
        "end": 41.00,
        "text": "I take metformin for diabetes, and I was on ibuprofen for my back pain until yesterday."
    },
    {
        "utterance_id": "utterance_17",
        "speaker": "SPEAKER_00",
        "start": 41.40,
        "end": 43.10,
        "text": "Understood. I suspect you may have early pneumonia."
    },
    {
        "utterance_id": "utterance_18",
        "speaker": "SPEAKER_00",
        "start": 43.50,
        "end": 45.20,
        "text": "Let's order a chest X-ray and a CBC to rule out infection."
    },
    {
        "utterance_id": "utterance_19",
        "speaker": "SPEAKER_00",
        "start": 45.60,
        "end": 48.00,
        "text": "In the meantime, start amoxicillin 500 mg every eight hours for five days."
    },
    {
        "utterance_id": "utterance_20",
        "speaker": "SPEAKER_00",
        "start": 48.40,
        "end": 50.00,
        "text": "Also, rest, stay hydrated with plenty of fluids, and use a humidifier at night."
    },
    {
        "utterance_id": "utterance_21",
        "speaker": "SPEAKER_01",
        "start": 50.40,
        "end": 52.00,
        "text": "Should I avoid going outside?"
    },
    {
        "utterance_id": "utterance_22",
        "speaker": "SPEAKER_00",
        "start": 52.40,
        "end": 54.00,
        "text": "Yes, avoid any strenuous activity and limit exposure to cold air."
    },
    {
        "utterance_id": "utterance_23",
        "speaker": "SPEAKER_01",
        "start": 54.40,
        "end": 56.00,
        "text": "Thank you, doctor. I’ll follow your instructions."
    }
]

### Common

In [6]:
# Flattened lists for regex building
_ALL_SYMPTOMS = [kw for sublist in SYMPTOM_KEYWORDS.values() for kw in sublist]
_ALL_DIAGNOSES = [kw for sublist in DIAGNOSIS_KEYWORDS.values() for kw in sublist]
_ALL_TREATMENTS = [kw for sublist in TREATMENT_KEYWORDS.values() for kw in sublist]

In [7]:
COMMON_SUFFIXES = r"(?:s|es|ing|ed|ful|less|ness|y)?"

In [14]:
# -------------------------------------------------------------------
# 1. Build regex pattern to match keywords + common suffixes
# -------------------------------------------------------------------

def keyword_to_regex(kw: str) -> str:
    if " " in kw:
        return re.escape(kw)                # multi-word: match exactly
    else:
        base = re.escape(kw)
        return rf"{base}{COMMON_SUFFIXES}"  # single-word + common suffixes

def build_keyword_pattern(keyword_list):
    subs = [keyword_to_regex(kw) for kw in keyword_list]
    subs.sort(key=lambda pat: len(pat), reverse=True)
    combined = r"\b(" + "|".join(subs) + r")\b"
    return re.compile(combined, flags=re.IGNORECASE)

SYMPTOM_PATTERN   = build_keyword_pattern(_ALL_SYMPTOMS)
DIAGNOSIS_PATTERN = build_keyword_pattern(_ALL_DIAGNOSES)
TREATMENT_PATTERN = build_keyword_pattern(_ALL_TREATMENTS)

DOSAGE_PATTERN = re.compile(r"\b(\d+\s?(?:mg|g|mcg|ml|units))\b", flags=re.IGNORECASE)

def is_negated(full_text: str, match_start: int):
    """
    Look back up to 50 characters before match_start. Tokenize (allowing straight or curly apostrophes),
    and check the last 5 tokens for any NEGATION_WORDS. If found, return the absolute index of that negation.
    Otherwise return None.
    """
    window_start = max(0, match_start - 50)
    window = full_text[window_start:match_start]

    # Tokenize so that contractions with ’ or ' remain intact
    tokens = []
    for m in re.finditer(r"\b[\w'’]+\b", window.lower()):
        tok = m.group(0)
        start_rel = m.start()
        tokens.append((tok, start_rel))

    # Check last 5 tokens
    for tok, start_rel in tokens[-5:]:
        if tok in NEGATION_WORDS:
            return window_start + start_rel

    return None

# --------------------------------------------------
# 2. Categorization function (captures negation)
# --------------------------------------------------

def categorize_sentences(transcript, doctor_id, patient_id):
    symptoms_extracted   = []
    diagnosis_extracted  = []
    treatment_extracted  = []

    for turn in transcript:
        speaker       = turn["speaker"]
        original_text = turn["text"]
        text          = original_text.lower()
        utt_id        = turn["utterance_id"]

        # -----------------------
        #  Patient → Symptoms
        # -----------------------
        if speaker == patient_id:
            for match in SYMPTOM_PATTERN.finditer(text):
                neg_start = is_negated(text, match.start())
                if neg_start is not None:
                    # capture from negation word through matched keyword
                    kw_phrase = original_text[neg_start : match.end()]
                    is_neg    = True
                else:
                    kw_phrase = match.group(1)
                    is_neg    = False

                # Find the right category by checking if base_kw appears in kw_phrase
                for cat, kw_list in SYMPTOM_KEYWORDS.items():
                    for base_kw in kw_list:
                        if base_kw.lower() in kw_phrase.lower():
                            syns = SYMPTOM_SYNONYMS.get(base_kw.lower(), [])
                            symptoms_extracted.append({
                                "category": cat,
                                "keyword": kw_phrase,
                                "synonyms": syns.copy(),
                                "negated": is_neg,
                                "utterance_id": utt_id,
                                "utterance_text": original_text
                            })
                            break
                    else:
                        continue
                    break

        # -----------------------
        #  Doctor → Diagnosis
        # -----------------------
        if speaker == doctor_id:
            for match in DIAGNOSIS_PATTERN.finditer(text):
                neg_start = is_negated(text, match.start())
                if neg_start is not None:
                    kw_phrase = original_text[neg_start : match.end()]
                    is_neg    = True
                else:
                    kw_phrase = match.group(1)
                    is_neg    = False

                for cat, kw_list in DIAGNOSIS_KEYWORDS.items():
                    for base_kw in kw_list:
                        if base_kw.lower() in kw_phrase.lower():
                            syns = DIAGNOSIS_SYNONYMS.get(base_kw.lower(), [])
                            diagnosis_extracted.append({
                                "category": cat,
                                "keyword": kw_phrase,
                                "synonyms": syns.copy(),
                                "negated": is_neg,
                                "utterance_id": utt_id,
                                "utterance_text": original_text
                            })
                            break
                    else:
                        continue
                    break

            # -----------------------
            #  Doctor → Treatment
            # -----------------------
            for match in TREATMENT_PATTERN.finditer(text):
                neg_start = is_negated(text, match.start())
                if neg_start is not None:
                    kw_phrase = original_text[neg_start : match.end()]
                    is_neg    = True
                else:
                    kw_phrase = match.group(1)
                    is_neg    = False

                for cat, kw_list in TREATMENT_KEYWORDS.items():
                    for base_kw in kw_list:
                        if base_kw.lower() in kw_phrase.lower():
                            syns = TREATMENT_SYNONYMS.get(base_kw.lower(), [])
                            treatment_extracted.append({
                                "category": cat,
                                "keyword": kw_phrase,
                                "synonyms": syns.copy(),
                                "negated": is_neg,
                                "utterance_id": utt_id,
                                "utterance_text": original_text
                            })
                            break
                    else:
                        continue
                    break

            # 2) Dosage‐style (e.g., "500 mg")
            for match in DOSAGE_PATTERN.finditer(text):
                neg_start = is_negated(text, match.start())
                if neg_start is not None:
                    kw_phrase = original_text[neg_start : match.end()]
                    is_neg    = True
                else:
                    kw_phrase = match.group(1)
                    is_neg    = False

                treatment_extracted.append({
                    "category": "Dosage",
                    "keyword": kw_phrase,
                    "negated": is_neg,
                    "utterance_id": utt_id,
                    "utterance_text": original_text
                })

    return symptoms_extracted, diagnosis_extracted, treatment_extracted

def process_transcript_json(transcript, file_path=""):
    doctor_id  = "SPEAKER_00"
    patient_id = "SPEAKER_01"
    symptoms, diagnosis, treatment = categorize_sentences(transcript, doctor_id, patient_id)
    return generate_clinical_note(symptoms, diagnosis, treatment)

In [15]:
ner_transcript = process_transcript_json(dialogue_2)
print(json.dumps(ner_transcript, indent=4, ensure_ascii=False))

{
    "Symptoms": [
        {
            "category": "General",
            "keyword": "fever",
            "synonyms": [
                "Pyrexia",
                "High temperature"
            ],
            "negated": false,
            "utterance_id": "utterance_2",
            "utterance_text": "I have a fever, a sore stomach, and a shortness of breath. But I don't have diarrhea."
        },
        {
            "category": "General",
            "keyword": "sore stomach",
            "synonyms": [],
            "negated": false,
            "utterance_id": "utterance_2",
            "utterance_text": "I have a fever, a sore stomach, and a shortness of breath. But I don't have diarrhea."
        },
        {
            "category": "Respiratory",
            "keyword": "shortness of breath",
            "synonyms": [
                "Dyspnea",
                "Breathlessness",
                "Labored breathing",
                "Difficulty breathing"
            ],
           

In [None]:
ner_transcript = process_transcript_json(dialogue_3)
print(json.dumps(ner_transcript, indent=4, ensure_ascii=False))

{
    "Symptoms": [
        {
            "category": "General",
            "keyword": "weak",
            "negated": false,
            "utterance_id": "utterance_2",
            "utterance_text": "Doctor, I've been feeling very fatigued and weak for the last three days."
        },
        {
            "category": "General",
            "keyword": "fever",
            "negated": false,
            "utterance_id": "utterance_4",
            "utterance_text": "I’ve had low-grade fever, around 100°F last night, but no chills."
        },
        {
            "category": "General",
            "keyword": "no chills",
            "negated": true,
            "utterance_id": "utterance_4",
            "utterance_text": "I’ve had low-grade fever, around 100°F last night, but no chills."
        },
        {
            "category": "Respiratory",
            "keyword": "cough",
            "negated": false,
            "utterance_id": "utterance_6",
            "utterance_text": "I cough 