In [1]:
# Mount Google Drive (Colab only) so that paths under /content/drive can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Imports and JSON helpers

In [2]:
import json

In [3]:
def save_to_json(data, filename):
    """
    Serialise `data` as JSON into `filename`, indented by four spaces.

    The file is opened in write mode, so existing content at that path
    is replaced.
    """
    with open(filename, mode='w') as out_file:
        json.dump(data, fp=out_file, indent=4)

def load_from_json(filename):
    """
    Load and return the parsed contents of a JSON file.

    The file is always decoded as UTF-8 instead of the platform's locale
    default, so a transcript containing non-ASCII text is read the same
    way on every machine.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value (for the transcripts used in this notebook,
        a list of segment dictionaries).

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

# Rule-based speaker role classification

### Keywords

In [4]:
# Doctor indicators: lower-case phrases that count towards a speaker's doctor score.
# role_classification tests each phrase with a plain substring check against the
# speaker's lower-cased text, so every entry must stay lower-case, and short entries
# such as "scan" or "avoid" also match inside longer words.
# NOTE(review): "i believe" is listed here and in patient_keywords as well — confirm
# that the overlap is intended.
doctor_keywords = [
    # Questions and inquiries
    "what brings you", "how are you feeling", "how long have you", "when did this start",
    "can you describe", "tell me about", "any other symptoms", "have you experienced",
    "do you have any", "are you taking", "have you tried", "how would you rate",
    "on a scale of", "does it hurt when", "can you point to", "how often do you",

    # Medical examination language
    "let me examine", "i'm going to", "let me check", "i need to", "let me listen",
    "take a deep breath", "say ah", "follow my finger", "look up", "look down",
    "turn your head", "can you lift", "does this hurt", "feel any pressure",

    # Medical recommendations and instructions
    "i recommend", "you should", "i suggest", "my advice", "you need to",
    "take this", "apply this", "rest for", "avoid", "come back in",
    "follow up", "schedule", "return if", "call if", "monitor",

    # Medical terminology and diagnosis
    "diagnosis", "condition", "infection", "inflammation", "prescription",
    "medication", "treatment", "therapy", "procedure", "test results",
    "blood work", "x-ray", "scan", "allergy", "dosage", "side effects",
    "medical history", "family history", "chronic", "acute", "symptoms indicate",

    # Professional phrases
    "in my opinion", "based on", "it appears", "it looks like", "i believe",
    "we need to rule out", "differential diagnosis", "likely cause", "i suspect"
]

# Patient indicators: lower-case phrases that count towards a speaker's patient score.
# They are matched as plain substrings of the speaker's lower-cased text, so every
# entry must stay lower-case, and short entries such as "pain" or "since" also match
# inside longer words.
# NOTE(review): "i believe" is listed here and in doctor_keywords as well — confirm
# that the overlap is intended.
patient_keywords = [
    # Personal symptoms and feelings
    "i have", "i feel", "i'm experiencing", "i've been having", "i get",
    "i notice", "i can't", "i'm unable to", "it hurts", "it's painful",
    "i'm worried", "i'm concerned", "i think", "i believe", "i'm afraid",

    # Pain and discomfort descriptions
    "hurts", "pain", "painful", "ache", "aching", "sore", "tender",
    "burning", "stinging", "throbbing", "sharp", "dull", "cramping",
    "tight", "pressure", "uncomfortable", "bothering me", "killing me",

    # Symptom descriptions
    "sick", "nauseous", "dizzy", "tired", "weak", "fever", "chills",
    "headache", "stomach ache", "runny nose", "cough", "congested",
    "swollen", "rash", "itchy", "blurry", "ringing", "numbness",

    # Personal references and possessives
    "my head", "my back", "my stomach", "my chest", "my throat", "my arm",
    "my leg", "my eye", "my ear", "my skin", "my heart", "my breathing",

    # Timeline and frequency from patient perspective
    "started yesterday", "been going on", "happens when", "gets worse",
    "feels better", "comes and goes", "all the time", "at night", "in the morning",
    "after eating", "before bed", "during", "since", "for days", "for weeks",

    # Lifestyle and personal context
    "i work", "i sleep", "i eat", "i drink", "i smoke", "i exercise",
    "i live", "i usually", "normally i", "my job", "my family", "my wife",
    "my husband", "my kids", "at home", "at work"
]

### Rule-based function

In [5]:
def role_classification(dialogue, doctor_keywords, patient_keywords):
    """
    Assign "Doctor" / "Patient" roles to the two speakers of a dialogue
    using keyword counts (rule-based; originally described as the fallback
    for when an API-based classifier fails).

    For each speaker, all of their segment texts are joined and lower-cased.
    Every keyword that occurs as a substring of that text adds one point to
    the corresponding score; a keyword counts once no matter how often it
    occurs.

    The speaker with the higher doctor score becomes the Doctor. When both
    doctor scores are equal, the speaker with the lower patient score is
    chosen; if that is equal too, the speaker who appears first in the
    dialogue is chosen. The result is therefore the same on every run.

    Args:
        dialogue: Iterable of segment dicts, each with a "speaker" key and
            a "text" key.
        doctor_keywords: Lower-case phrases indicating a doctor.
        patient_keywords: Lower-case phrases indicating a patient.

    Returns:
        Dict mapping each speaker ID to "Doctor" or "Patient". If the
        dialogue does not contain exactly two distinct speakers, every
        speaker is mapped to "Unknown" (an empty dialogue gives {}).
    """
    # dict.fromkeys de-duplicates while keeping first-appearance order; a set
    # would make the order (and thus tie-breaking) vary between runs.
    speakers = list(dict.fromkeys(segment["speaker"] for segment in dialogue))

    if len(speakers) != 2:
        # If not exactly 2 speakers, return default mapping
        return {speaker: "Unknown" for speaker in speakers}

    # Collect each speaker's utterances in a single pass over the dialogue.
    texts_by_speaker = {speaker: [] for speaker in speakers}
    for segment in dialogue:
        texts_by_speaker[segment["speaker"]].append(segment["text"])

    # Rule-based scoring: one point per keyword found in the speaker's text.
    speaker_analysis = {}
    for speaker in speakers:
        combined_text = " ".join(texts_by_speaker[speaker]).lower()
        speaker_analysis[speaker] = {
            "doctor_score": sum(keyword in combined_text for keyword in doctor_keywords),
            "patient_score": sum(keyword in combined_text for keyword in patient_keywords),
        }

    def doctor_rank(speaker):
        # Higher doctor score sorts first; on a tie, lower patient score
        # sorts first. sorted() is stable, so a full tie keeps
        # first-appearance order.
        scores = speaker_analysis[speaker]
        return (-scores["doctor_score"], scores["patient_score"])

    doctor, patient = sorted(speakers, key=doctor_rank)
    return {doctor: "Doctor", patient: "Patient"}

In [6]:
def replace_speaker_labels(transcript, speaker_labels):
    """
    Build a new transcript whose "speaker" fields carry the classified roles.

    Args:
        transcript (List[Dict[str, Any]]): Dialogue segments; each one has a
            "speaker" key (plus fields such as start, end and text).
        speaker_labels (Dict[str, str]): Maps original speaker IDs to roles,
            e.g. {"SPEAKER_00": "Doctor", "SPEAKER_01": "Patient"}.

    Returns:
        List[Dict[str, Any]]: Shallow copies of the segments, in the same
        order, with the speaker replaced by its role. A segment whose
        speaker has no entry in speaker_labels keeps its original label and
        a warning is printed for it. The input segments are not modified.
    """
    def relabel(entry):
        # Work on a shallow copy so the caller's segment stays untouched.
        relabelled = entry.copy()
        speaker_id = entry["speaker"]

        if speaker_id not in speaker_labels:
            # Unknown speaker: leave the label as it was and report it.
            print(f"Warning: Speaker '{speaker_id}' not found in speaker_labels. Keeping original label.")
            return relabelled

        relabelled["speaker"] = speaker_labels[speaker_id]
        return relabelled

    return [relabel(entry) for entry in transcript]

### Main: label each transcript

In [10]:
def classify_speakers(name, base_dir="/content/drive/MyDrive/ClinicalNotesGen/Data/transcripts/en"):
    """
    Label the speakers of one transcript and save the labelled copy.

    Reads `{base_dir}/{name}_transcript.json`, classifies its speakers with
    role_classification (using the module-level doctor_keywords and
    patient_keywords), replaces the speaker IDs with the resulting roles and
    writes the result to
    `{base_dir}/labelled_transcripts/labelled_{name}.json`.

    Args:
        name: Transcript name without the "_transcript.json" suffix,
            e.g. "encounter_fever".
        base_dir: Directory holding the input transcripts. Defaults to the
            project's Google Drive folder, so existing calls behave as
            before; pass another directory to run outside Colab.

    Returns:
        None. The output path is printed once the file has been written.

    Raises:
        OSError: If the input file cannot be opened or the output file
            cannot be created (the `labelled_transcripts` directory is not
            created here and must already exist).
    """
    input_path = f"{base_dir}/{name}_transcript.json"
    output_path = f"{base_dir}/labelled_transcripts/labelled_{name}.json"

    transcript = load_from_json(input_path)
    speaker_roles = role_classification(transcript, doctor_keywords, patient_keywords)
    labelled_transcript = replace_speaker_labels(transcript, speaker_roles)

    save_to_json(labelled_transcript, output_path)
    print(f"Saved transcript_complete to {output_path}")

In [8]:
# Label the speakers of the "encounter_fever" transcript and save the result.
name = "encounter_fever"
classify_speakers(name=name)

Saved transcript_complete to /content/drive/MyDrive/ClinicalNotesGen/Data/transcripts/en/labelled_transcripts/labelled_encounter_fever.json


In [11]:
# Label the speakers of the "encounter_chest_pain" transcript and save the result.
name = "encounter_chest_pain"
classify_speakers(name=name)

Saved transcript_complete to /content/drive/MyDrive/ClinicalNotesGen/Data/transcripts/en/labelled_transcripts/labelled_encounter_chest_pain.json


In [12]:
# Label the speakers of the "abdominal_pain_history" transcript and save the result.
name = "abdominal_pain_history"
classify_speakers(name=name)

Saved transcript_complete to /content/drive/MyDrive/ClinicalNotesGen/Data/transcripts/en/labelled_transcripts/labelled_abdominal_pain_history.json


In [13]:
# Label the speakers of the "sexual_health_history" transcript and save the result.
name = "sexual_health_history"
classify_speakers(name=name)

Saved transcript_complete to /content/drive/MyDrive/ClinicalNotesGen/Data/transcripts/en/labelled_transcripts/labelled_sexual_health_history.json


In [14]:
# Label the speakers of the "type_2_diabetes" transcript and save the result.
name = "type_2_diabetes"
classify_speakers(name=name)

Saved transcript_complete to /content/drive/MyDrive/ClinicalNotesGen/Data/transcripts/en/labelled_transcripts/labelled_type_2_diabetes.json
