In [None]:
# Mount Google Drive so the transcript folders under MyDrive are reachable
# from the Colab runtime at /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
!pip install transformers torch underthesea

Collecting underthesea
  Downloading underthesea-6.8.4-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Colle

# Vietnamese

In [None]:
import json
from underthesea import sent_tokenize
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

def save_as_json(data, filename):
    """Serialize ``data`` to ``filename`` as indented, human-readable JSON.

    The file is written as UTF-8 with ``ensure_ascii=False`` so non-ASCII text
    (for example Vietnamese diacritics) is stored verbatim instead of as
    escape sequences.

    Args:
        data: Any JSON-serializable object.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    print(f"Data saved to {filename}")

# Folder on the mounted Drive holding the Vietnamese transcripts; the
# generated summaries are written into the same folder.
dialogue_dir = "/content/drive/MyDrive/ClinicalNotesGen/Speech2Text/transcripts/vietnamese"

# Vietnamese contextual cues for Diagnosis and Treatment
DIAGNOSIS_CUES = [
    "chẩn đoán", "có thể", "tình trạng", "bệnh", "hội chứng", "rối loạn",
    "dường như", "phù hợp với", "gợi ý", "nghi ngờ", "xác nhận"
]
TREATMENT_CUES = [
    "kê đơn", "thuốc", "nghỉ ngơi", "phẫu thuật", "liệu pháp", "ống hít",
    "uống nước", "theo dõi", "tái khám", "sử dụng", "uống", "tiếp tục"
]

# Load mT5 model and tokenizer.
# The checkpoint's config declares model type "mt5"; loading it through
# T5ForConditionalGeneration triggers the "model of type mt5 to instantiate a
# model of type t5" warning, so use the matching MT5 class instead.
from transformers import MT5ForConditionalGeneration

# NOTE(review): per the Hugging Face model docs, mT5 checkpoints are
# pre-trained only and need fine-tuning before downstream use; consider a
# summarization-fine-tuned checkpoint here.
model_name = "google/mt5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def load_json_transcript(file_path):
    """Load a speaker-labeled transcript from a JSON file.

    Args:
        file_path: Path to a UTF-8 encoded JSON file.

    Returns:
        The parsed JSON content, returned exactly as ``json.load`` produces it.
    """
    # Decode explicitly as UTF-8: the locale default is platform dependent and
    # could garble the Vietnamese diacritics on other systems.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def identify_speakers(data):
    """Pick the doctor and patient speaker labels from a transcript.

    No keyword analysis is performed: the distinct ``speaker`` labels are
    sorted and the first is taken as the doctor, the second as the patient
    (e.g. ``SPEAKER_00`` -> doctor, ``SPEAKER_01`` -> patient). Labels beyond
    the first two are ignored.

    Args:
        data: Iterable of turn dicts, each having a ``"speaker"`` key.

    Returns:
        A ``(doctor_id, patient_id)`` tuple. When only one distinct speaker
        exists, that label is returned for both roles.

    Raises:
        ValueError: If ``data`` contains no turns.
    """
    speakers = sorted({entry['speaker'] for entry in data})  # sorted => deterministic order
    if not speakers:
        raise ValueError("Transcript contains no speaker turns; cannot identify doctor and patient.")
    if len(speakers) == 1:
        return speakers[0], speakers[0]
    return speakers[0], speakers[1]  # Assume SPEAKER_00 is doctor, SPEAKER_01 is patient

def summarize_text(text, max_length=50, min_length=10):
    """Summarize ``text`` with the module-level mT5 ``model`` and ``tokenizer``.

    Args:
        text: Text to summarize.
        max_length: Upper bound on generated tokens, passed to ``generate``.
        min_length: Lower bound on generated tokens, passed to ``generate``.

    Returns:
        The decoded summary with ``<extra_id_N>`` sentinel tokens removed, or
        ``"Không xác định."`` when ``text`` is blank or nothing remains after
        the sentinels are stripped.
    """
    import re  # local import keeps this function self-contained within the cell

    fallback = "Không xác định."
    if not text.strip():
        return fallback

    # Prepare input for mT5 (prepend "summarize: " as per T5 convention)
    input_text = f"summarize: {text}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate summary with beam search
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )

    # Decode summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # skip_special_tokens does not drop the span-corruption sentinels with this
    # tokenizer (they showed up verbatim in earlier notes), so remove them here.
    summary = re.sub(r"<extra_id_\d+>", " ", summary)
    summary = " ".join(summary.split())
    return summary or fallback

# Basic symptom terms looked for in patient sentences.
SYMPTOM_CUES = [
    "sốt", "ho", "đau", "mệt mỏi", "chóng mặt", "buồn nôn", "khó thở"
]


def _contains_whole_cue(sentence, cues):
    """Return True if any cue occurs in ``sentence`` as a whole word or phrase."""
    import re
    import unicodedata

    # NFC-normalize both sides so composed and decomposed diacritics compare equal.
    haystack = unicodedata.normalize("NFC", sentence).lower()
    for cue in cues:
        needle = re.escape(unicodedata.normalize("NFC", cue).lower())
        # Require non-word characters (or string edges) around the cue so that
        # e.g. "ho" does not fire on "cho" or "hoặc".
        if re.search(rf"(?<!\w){needle}(?!\w)", haystack):
            return True
    return False


def categorize_and_summarize(transcript, doctor_id, patient_id):
    """Sort transcript sentences into note sections and summarize each one.

    Patient sentences containing a ``SYMPTOM_CUES`` term become symptom text.
    Doctor sentences containing a ``DIAGNOSIS_CUES`` and/or ``TREATMENT_CUES``
    term become diagnosis and/or treatment text (a sentence may land in both).
    Cues are matched as whole words/phrases, case-insensitively.

    Args:
        transcript: Iterable of turn dicts with ``"speaker"`` and ``"text"`` keys.
        doctor_id: Speaker label treated as the doctor.
        patient_id: Speaker label treated as the patient. It is checked first,
            so if both ids are equal every turn is treated as a patient turn.

    Returns:
        A tuple ``([symptoms], [diagnosis], [treatment])`` of one-element
        lists; a section with no matching sentence holds ``"Không xác định."``.
    """
    symptom_texts = []
    diagnosis_texts = []
    treatment_texts = []

    # Group sentences by category
    for turn in transcript:
        speaker = turn['speaker']

        # Split into sentences using underthesea
        sentences = sent_tokenize(turn['text'])

        if speaker == patient_id:
            # Patient sentences are candidates for Symptoms
            for sent in sentences:
                if _contains_whole_cue(sent, SYMPTOM_CUES):
                    symptom_texts.append(sent)

        elif speaker == doctor_id:
            # Doctor sentences are candidates for Diagnosis or Treatment
            for sent in sentences:
                if _contains_whole_cue(sent, DIAGNOSIS_CUES):
                    diagnosis_texts.append(sent)
                if _contains_whole_cue(sent, TREATMENT_CUES):
                    treatment_texts.append(sent)

    # Summarize each category
    not_found = "Không xác định."
    symptom_summary = summarize_text(" ".join(symptom_texts)) if symptom_texts else not_found
    diagnosis_summary = summarize_text(" ".join(diagnosis_texts)) if diagnosis_texts else not_found
    treatment_summary = summarize_text(" ".join(treatment_texts)) if treatment_texts else not_found

    return [symptom_summary], [diagnosis_summary], [treatment_summary]

def generate_clinical_note(symptoms, diagnosis, treatment):
    """Bundle the three note sections into a single dict.

    Returns:
        A dict with the keys ``"Symptoms"``, ``"Diagnosis"`` and
        ``"Treatment"`` (in that order) mapped to the given arguments.
    """
    section_names = ("Symptoms", "Diagnosis", "Treatment")
    section_values = (symptoms, diagnosis, treatment)
    return dict(zip(section_names, section_values))

def process_transcript_json(json_path, file_name):
    """Run the whole pipeline for one transcript file.

    Loads the transcript at ``json_path``, determines the speaker roles,
    builds the summarized note, saves it to
    ``{dialogue_dir}/{file_name}_summary.json`` and prints it.
    """
    turns = load_json_transcript(json_path)
    doctor, patient = identify_speakers(turns)
    symptom_part, diagnosis_part, treatment_part = categorize_and_summarize(turns, doctor, patient)

    clinical_note = generate_clinical_note(symptom_part, diagnosis_part, treatment_part)

    # Structured JSON output next to the transcripts
    output_path = f"{dialogue_dir}/{file_name}_summary.json"
    save_as_json(clinical_note, output_path)

    # Echo the note; ensure_ascii=False keeps the Vietnamese text readable
    rendered = json.dumps(clinical_note, indent=4, ensure_ascii=False)
    print(rendered)

# Run the pipeline on the sample Vietnamese transcript.
process_transcript_json(
    json_path=f"{dialogue_dir}/example_a.json",
    file_name="example_pretrained",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Data saved to /content/drive/MyDrive/ClinicalNotesGen/Speech2Text/transcripts/vietnamese/example_pretrained_summary.json
{
    "Symptoms": [
        "<extra_id_0> nặng, và tôi hơi buồn."
    ],
    "Diagnosis": [
        "<extra_id_0> sốt hoặc sụt cân"
    ],
    "Treatment": [
        "<extra_id_0> histamine để giảm đau."
    ]
}


# English

In [None]:
!pip install transformers torch spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m87.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import json
import spacy
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

def save_as_json(data, filename):
    """Serialize ``data`` to ``filename`` as indented, human-readable JSON.

    The file is written as UTF-8 with ``ensure_ascii=False`` so non-ASCII
    characters (for example typographic apostrophes in the transcripts) are
    stored verbatim instead of as escape sequences.

    Args:
        data: Any JSON-serializable object.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    print(f"Data saved to {filename}")

# Folder on the mounted Drive holding the English transcripts; the generated
# summaries are written into the same folder.
dialogue_dir = "/content/drive/MyDrive/ClinicalNotesGen/Speech2Text/transcripts/english"

# English contextual cues for Diagnosis and Treatment
DIAGNOSIS_CUES = [
    "diagnosed", "likely", "condition", "disease", "syndrome", "disorder",
    "appears to be", "consistent with", "indicative of", "suspected", "confirmed"
]
TREATMENT_CUES = [
    "prescribe", "medication", "rest", "surgery", "therapy", "inhaler",
    "hydrate", "monitor", "follow-up", "apply", "take", "continue"
]

# Load SpaCy model for English
nlp = spacy.load("en_core_web_sm")

# Load mT5 model and tokenizer.
# The checkpoint's config declares model type "mt5"; loading it through
# T5ForConditionalGeneration triggers the "model of type mt5 to instantiate a
# model of type t5" warning, so use the matching MT5 class instead.
from transformers import MT5ForConditionalGeneration

# NOTE(review): per the Hugging Face model docs, mT5 checkpoints are
# pre-trained only and need fine-tuning before downstream use; consider a
# summarization-fine-tuned checkpoint here.
model_name = "google/mt5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def load_json_transcript(file_path):
    """Load a speaker-labeled transcript from a JSON file.

    Args:
        file_path: Path to a UTF-8 encoded JSON file.

    Returns:
        The parsed JSON content, returned exactly as ``json.load`` produces it.
    """
    # Decode explicitly as UTF-8 rather than relying on the locale default,
    # which is platform dependent.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def identify_speakers(data):
    """Pick the doctor and patient speaker labels from a transcript.

    No keyword analysis is performed: the distinct ``speaker`` labels are
    sorted and the first is taken as the doctor, the second as the patient
    (e.g. ``SPEAKER_00`` -> doctor, ``SPEAKER_01`` -> patient). Labels beyond
    the first two are ignored.

    Args:
        data: Iterable of turn dicts, each having a ``"speaker"`` key.

    Returns:
        A ``(doctor_id, patient_id)`` tuple. When only one distinct speaker
        exists, that label is returned for both roles.

    Raises:
        ValueError: If ``data`` contains no turns.
    """
    speakers = sorted({entry['speaker'] for entry in data})  # sorted => deterministic order
    if not speakers:
        raise ValueError("Transcript contains no speaker turns; cannot identify doctor and patient.")
    if len(speakers) == 1:
        return speakers[0], speakers[0]
    return speakers[0], speakers[1]  # Assume SPEAKER_00 is doctor, SPEAKER_01 is patient

def summarize_text(text, max_length=50, min_length=10):
    """Summarize ``text`` with the module-level mT5 ``model`` and ``tokenizer``.

    Args:
        text: Text to summarize.
        max_length: Upper bound on generated tokens, passed to ``generate``.
        min_length: Lower bound on generated tokens, passed to ``generate``.

    Returns:
        The decoded summary with ``<extra_id_N>`` sentinel tokens removed, or
        ``"None identified."`` when ``text`` is blank or nothing remains after
        the sentinels are stripped.
    """
    import re  # local import keeps this function self-contained within the cell

    fallback = "None identified."
    if not text.strip():
        return fallback

    # Prepare input for mT5 (prepend "summarize: " as per T5 convention)
    input_text = f"summarize: {text}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate summary with beam search
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )

    # Decode summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # skip_special_tokens does not drop the span-corruption sentinels with this
    # tokenizer (they showed up verbatim in earlier notes), so remove them here.
    summary = re.sub(r"<extra_id_\d+>", " ", summary)
    summary = " ".join(summary.split())
    return summary or fallback

# Basic symptom terms looked for in patient sentences.
SYMPTOM_CUES = [
    "fever", "cough", "pain", "tired", "headache", "dizzy", "nausea", "shortness of breath"
]


def _contains_cue_prefix(sentence, cues):
    """Return True if any cue occurs in ``sentence`` starting at a word boundary."""
    import re

    haystack = sentence.lower()
    for cue in cues:
        # Anchor only the start of the cue: "rest" no longer fires on
        # "interest" (nor "take" on "mistake", "likely" on "unlikely"), while
        # inflected forms such as "prescribed" or "coughing" still match.
        if re.search(rf"(?<!\w){re.escape(cue.lower())}", haystack):
            return True
    return False


def categorize_and_summarize(transcript, doctor_id, patient_id):
    """Sort transcript sentences into note sections and summarize each one.

    Patient sentences containing a ``SYMPTOM_CUES`` term become symptom text.
    Doctor sentences containing a ``DIAGNOSIS_CUES`` and/or ``TREATMENT_CUES``
    term become diagnosis and/or treatment text (a sentence may land in both).
    Cues are matched case-insensitively at the start of a word.

    Args:
        transcript: Iterable of turn dicts with ``"speaker"`` and ``"text"`` keys.
        doctor_id: Speaker label treated as the doctor.
        patient_id: Speaker label treated as the patient. It is checked first,
            so if both ids are equal every turn is treated as a patient turn.

    Returns:
        A tuple ``([symptoms], [diagnosis], [treatment])`` of one-element
        lists; a section with no matching sentence holds ``"None identified."``.
    """
    symptom_texts = []
    diagnosis_texts = []
    treatment_texts = []

    # Group sentences by category
    for turn in transcript:
        speaker = turn['speaker']

        # Split into sentences using SpaCy
        doc = nlp(turn['text'])
        sentences = [sent.text.strip() for sent in doc.sents]

        if speaker == patient_id:
            # Patient sentences are candidates for Symptoms
            for sent in sentences:
                if _contains_cue_prefix(sent, SYMPTOM_CUES):
                    symptom_texts.append(sent)

        elif speaker == doctor_id:
            # Doctor sentences are candidates for Diagnosis or Treatment
            for sent in sentences:
                if _contains_cue_prefix(sent, DIAGNOSIS_CUES):
                    diagnosis_texts.append(sent)
                if _contains_cue_prefix(sent, TREATMENT_CUES):
                    treatment_texts.append(sent)

    # Summarize each category
    not_found = "None identified."
    symptom_summary = summarize_text(" ".join(symptom_texts)) if symptom_texts else not_found
    diagnosis_summary = summarize_text(" ".join(diagnosis_texts)) if diagnosis_texts else not_found
    treatment_summary = summarize_text(" ".join(treatment_texts)) if treatment_texts else not_found

    return [symptom_summary], [diagnosis_summary], [treatment_summary]

def generate_clinical_note(symptoms, diagnosis, treatment):
    """Assemble the clinical note dict from its three sections.

    Returns:
        A dict with the keys ``"Symptoms"``, ``"Diagnosis"`` and
        ``"Treatment"`` (in that order) mapped to the given arguments.
    """
    note = dict(Symptoms=symptoms)
    note["Diagnosis"] = diagnosis
    note["Treatment"] = treatment
    return note

def process_transcript_json(json_path, file_name):
    """Run the whole pipeline for one transcript file.

    Loads the transcript at ``json_path``, determines the speaker roles,
    builds the summarized note, saves it to
    ``{dialogue_dir}/{file_name}_summary.json`` and prints it.

    Args:
        json_path: Path of the transcript JSON file to process.
        file_name: Stem used for the output file name.
    """
    transcript = load_json_transcript(json_path)
    doctor_id, patient_id = identify_speakers(transcript)
    symptoms, diagnosis, treatment = categorize_and_summarize(transcript, doctor_id, patient_id)

    note_dict = generate_clinical_note(symptoms, diagnosis, treatment)
    save_as_json(note_dict, f"{dialogue_dir}/{file_name}_summary.json")  # Structured JSON output

    # ensure_ascii=False so non-ASCII characters carried over from the
    # transcript (e.g. typographic apostrophes) print readably, matching the
    # Vietnamese pipeline's console output.
    print(json.dumps(note_dict, indent=4, ensure_ascii=False))  # Show result in console

# Run the pipeline on the sample English transcript.
process_transcript_json(
    json_path=f"{dialogue_dir}/example_a.json",
    file_name="example_pretrained",
)

You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


Data saved to /content/drive/MyDrive/ClinicalNotesGen/Speech2Text/transcripts/english/example_pretrained_summary.json
{
    "Symptoms": [
        "<extra_id_0>, and I have some nausea."
    ],
    "Diagnosis": [
        "<extra_id_0> of an infection, possibly arthritis."
    ],
    "Treatment": [
        "<extra_id_0> for symptom relief and follow up. <extra_id_10>"
    ]
}


# Gemini API

In [None]:
from google.colab import userdata

In [None]:
!pip install -q -U google-genai

In [None]:
from google import genai

# Build the Gemini client; the API key is read from the Colab secret
# named GOOGLE_API_KEY rather than being hardcoded in the notebook.
client = genai.Client(
    api_key=userdata.get("GOOGLE_API_KEY"),
)

In [None]:
import json

In [None]:
def load_json_transcript(file_path):
    """Load a speaker-labeled transcript from a JSON file.

    Args:
        file_path: Path to a UTF-8 encoded JSON file.

    Returns:
        The parsed JSON content, returned exactly as ``json.load`` produces it.
    """
    # Decode explicitly as UTF-8: the locale default is platform dependent and
    # could garble the Vietnamese transcript on other systems.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [None]:
# Drive folder containing the English transcripts.
dialogue_dir = (
    "/content/drive/MyDrive/ClinicalNotesGen/Speech2Text/transcripts/english"
)

In [None]:
# Drive folder containing the Vietnamese transcripts.
dialogue_dir_2 = (
    "/content/drive/MyDrive/ClinicalNotesGen/Speech2Text/transcripts/vietnamese"
)

In [None]:
# Load the sample English dialogue.
dia1 = load_json_transcript(
    file_path=f"{dialogue_dir}/example_a.json",
)

In [None]:
# Display the loaded English transcript to check its speaker/text turns.
dia1

[{'speaker': 'SPEAKER_01',
  'text': 'I’ve been feeling really tired and have a high fever for three days. My cough is getting worse, and I feel chest tightness.'},
 {'speaker': 'SPEAKER_00',
  'text': 'Those symptoms sound concerning. Have you had any shortness of breath or wheezing?'},
 {'speaker': 'SPEAKER_01',
  'text': 'Yes, I’ve been wheezing a lot, and I feel dizzy when I stand up. Also, my throat is sore, and I have some nausea.'},
 {'speaker': 'SPEAKER_00',
  'text': 'Based on your fever, cough, wheezing, and chest tightness, it’s likely you have a respiratory infection, possibly pneumonia.'},
 {'speaker': 'SPEAKER_00',
  'text': 'I’m going to prescribe antibiotics to treat the infection and recommend plenty of rest and hydration.'},
 {'speaker': 'SPEAKER_01',
  'text': 'I also have some joint pain and swelling in my knees, and I’ve been losing weight without trying.'},
 {'speaker': 'SPEAKER_00',
  'text': 'The joint pain and weight loss could indicate an inflammatory conditio

In [None]:
# Ask Gemini for a summary and SOAP-format clinical note of the English
# dialogue; the transcript is embedded in the prompt via its Python repr.
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=f"Summarize this dialogue and generate a clinical note in SOAP format: {dia1}",
)

In [None]:
# Show the text of Gemini's reply for the English dialogue.
print(response.text)

Okay, here's a summary of the dialogue and a clinical note in SOAP format:

**Summary:**

A patient presents with a 3-day history of fatigue, high fever, worsening cough, and chest tightness. They also report shortness of breath, wheezing, dizziness upon standing, sore throat, and nausea. Further questioning reveals joint pain and swelling (specifically in the knees), unintentional weight loss, headaches, and blurry vision (especially at night). The physician suspects a respiratory infection, possibly pneumonia, and prescribes antibiotics. The joint pain and weight loss raise suspicion for an inflammatory condition, potentially arthritis, and an X-ray is ordered. Headaches and vision issues prompt consideration of migraines or another neurological issue, and an imaging scan is planned.  Analgesics for joint pain and antihistamines for general symptom relief are prescribed, and a follow-up appointment is scheduled in one week. A referral to a specialist will be made for the inflammatory

In [None]:
# Load the sample Vietnamese dialogue.
dia2 = load_json_transcript(
    file_path=f"{dialogue_dir_2}/example_a.json",
)

In [None]:
# Display the loaded Vietnamese transcript to check its speaker/text turns.
dia2

[{'speaker': 'SPEAKER_01',
  'text': 'Tôi cảm thấy rất mệt mỏi và bị sốt cao trong ba ngày. Cơn ho của tôi ngày càng nặng, và tôi cảm thấy tức ngực.'},
 {'speaker': 'SPEAKER_00',
  'text': 'Những triệu chứng đó nghe đáng lo ngại. Bạn có bị khó thở hoặc thở khò khè không?'},
 {'speaker': 'SPEAKER_01',
  'text': 'Vâng, tôi bị thở khò khè nhiều, và tôi cảm thấy chóng mặt khi đứng dậy. Ngoài ra, họng tôi đau, và tôi hơi buồn nôn.'},
 {'speaker': 'SPEAKER_00',
  'text': 'Dựa trên sốt, ho, thở khò khè và tức ngực, có thể bạn bị nhiễm trùng đường hô hấp, có thể là viêm phổi.'},
 {'speaker': 'SPEAKER_00',
  'text': 'Tôi sẽ kê đơn kháng sinh để điều trị nhiễm trùng và khuyên bạn nghỉ ngơi nhiều và uống đủ nước.'},
 {'speaker': 'SPEAKER_01',
  'text': 'Tôi cũng bị đau khớp và sưng ở đầu gối, và tôi bị sụt cân mà không cố ý.'},
 {'speaker': 'SPEAKER_00',
  'text': 'Đau khớp và sụt cân có thể chỉ ra một tình trạng viêm, có thể là viêm khớp. Chúng ta cần chụp X-quang để xác nhận.'},
 {'speaker': 'S

In [None]:
# Ask Gemini for a summary and SOAP-format clinical note of the Vietnamese
# dialogue, with the answer requested in Vietnamese.
response_2 = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=f"Summarize this dialogue and generate a clinical note in SOAP format, answer in Vietnamese: {dia2}",
)

In [None]:
# Show the text of Gemini's reply for the Vietnamese dialogue.
print(response_2.text)

Dưới đây là tóm tắt cuộc đối thoại và ghi chú lâm sàng theo định dạng SOAP bằng tiếng Việt:

**Tóm tắt:**

Bệnh nhân đến khám với các triệu chứng: mệt mỏi, sốt cao (3 ngày), ho nặng, tức ngực, khó thở, thở khò khè, chóng mặt, đau họng, buồn nôn, đau khớp (đặc biệt đầu gối sưng), sụt cân không chủ ý, đau đầu, và vấn đề về thị lực (mờ mắt, đặc biệt vào ban đêm).  Bác sĩ nghi ngờ nhiễm trùng đường hô hấp (có thể là viêm phổi), viêm khớp và các vấn đề thần kinh (có thể là đau nửa đầu).  Bác sĩ kê đơn kháng sinh, thuốc giảm đau, thuốc kháng histamine và yêu cầu chụp X-quang khớp gối và hình ảnh não. Bệnh nhân được giới thiệu đến bác sĩ chuyên khoa để đánh giá thêm và tái khám sau một tuần.

**Ghi chú lâm sàng (SOAP):**

**S (Subjective - Chủ quan):**

*   Bệnh nhân than phiền về tình trạng mệt mỏi, sốt cao trong 3 ngày, ho nặng, tức ngực, khó thở và thở khò khè.
*   Bệnh nhân cho biết chóng mặt khi đứng dậy, đau họng và buồn nôn.
*   Bệnh nhân cũng báo cáo đau khớp (sưng đầu gối), sụt cân k