In [1]:
import os
import re
import json
from pathlib import Path
import subprocess
from typing import List, Tuple

In [7]:
# Project root. Defaults to the original local checkout, but can be redirected
# by setting the CLINICAL_SOAPGEN_ROOT environment variable so the notebook can
# run on another machine without editing this cell.
root_dir = Path(
    os.environ.get('CLINICAL_SOAPGEN_ROOT')
    or '/Users/khoinguyen/Documents/clinical-soapgen'
)
textGrid_dir = root_dir / 'scripts' / 'textgrid'
# Passed as --transcript_path to both helper scripts below.
transcript_dir = root_dir / 'transcripts'
audio_dir = root_dir / 'audio'
notes_dir = root_dir / 'notes'
output_dir = root_dir / 'output'
# Read by pair_transcripts_and_notes() to find the joined transcript files.
joined_transcripts_dir = output_dir / 'joined_transcripts'

## Create the transcript-to-SOAP pipeline

In [8]:
# 1: Convert TextGrid files to transcript files
def convert_textgrid_to_transcripts():
    """Run ``scripts/textgrid_to_transcript.py`` as a child process.

    The script receives ``transcript_dir`` as ``--transcript_path`` and
    ``joined_transcripts_dir`` as ``--output_path``. The script path is
    relative, so it is resolved against the current working directory.

    Raises:
        subprocess.CalledProcessError: if the script exits with a non-zero
            status. Without this check a failed conversion went unnoticed and
            later steps ran on whatever files were already on disk.
    """
    print("Converting TextGrid files to transcripts...")
    subprocess.run(
        [
            "python", "scripts/textgrid_to_transcript.py",
            f"--transcript_path={transcript_dir}",
            f"--output_path={joined_transcripts_dir}",
        ],
        check=True,
    )



# 2: Extract utterances and align with audio
def extract_utterances():
    """Run ``scripts/extract_utterances.py`` as a child process.

    The script receives ``audio_dir``, ``transcript_dir`` and ``output_dir``
    as ``--audio_path``, ``--transcript_path`` and ``--output_path``. The
    script path is relative, so it is resolved against the current working
    directory.

    Raises:
        subprocess.CalledProcessError: if the script exits with a non-zero
            status, instead of silently continuing after a failed step.
    """
    print("Extracting utterances from transcripts...")
    subprocess.run(
        [
            "python", "scripts/extract_utterances.py",
            f"--audio_path={audio_dir}",
            f"--transcript_path={transcript_dir}",
            f"--output_path={output_dir}",
        ],
        check=True,
    )
    
    
    
# 3: Parse SOAP-formatted note into sections
def split_soap_sections(note_text: str) -> dict:
    """Split a SOAP-formatted note into its four sections.

    A line is treated as a section header when, after stripping, it begins
    with the whole word "Subjective", "Objective", "Assessment" or "Plan"
    (case-insensitive). Lines such as "Plans to rest ..." are therefore
    ordinary content, not headers. If a header line contains a colon, the
    text after the first colon is kept as the first content of that section
    (e.g. ``"Subjective: cough for 3 days"``); without a colon the whole line
    is treated as the header.

    Args:
        note_text: Full text of the note.

    Returns:
        A dict with the keys "Subjective", "Objective", "Assessment" and
        "Plan". Each value is that section's non-empty lines joined by single
        spaces, or "" when the section is absent. Text before the first
        header is ignored.
    """
    header_re = re.compile(r"(subjective|objective|assessment|plan)\b", re.IGNORECASE)
    collected = {name: [] for name in ("Subjective", "Objective", "Assessment", "Plan")}
    current_section = None

    for raw_line in note_text.splitlines():
        line = raw_line.strip()
        header = header_re.match(line)
        if header:
            current_section = header.group(1).capitalize()
            # Keep inline content that follows "Header:" on the same line.
            _, colon, inline_text = line.partition(":")
            line = inline_text.strip() if colon else ""
        if current_section and line:
            collected[current_section].append(line)

    # Join once at the end: no trailing separator, no quadratic string growth.
    return {name: " ".join(parts) for name, parts in collected.items()}


# 4: Pair transcripts with SOAP notes
def pair_transcripts_and_notes() -> List[Tuple[str, dict]]:
    """Match joined transcripts to SOAP notes that share the same file stem.

    ``*.txt`` files in ``joined_transcripts_dir`` and ``notes_dir`` are matched
    on their lower-cased stem. Each match is read as UTF-8 and the note is
    parsed with ``split_soap_sections``. A line is printed for every
    transcript, reporting whether a note was found.

    Returns:
        One ``(transcript_text, soap_sections)`` tuple per matched transcript,
        in sorted transcript-path order.
    """
    print("Pairing transcripts with SOAP notes...")

    transcript_paths = sorted(joined_transcripts_dir.glob("*.txt"))

    # Lower-cased stem (e.g. "day1_consultation01") -> note path.
    notes_by_stem = {}
    for candidate in sorted(notes_dir.glob("*.txt")):
        notes_by_stem[candidate.stem.lower()] = candidate
    print(f"Found {len(transcript_paths)} transcripts and {len(notes_by_stem)} notes.")

    paired = []
    for transcript_path in transcript_paths:
        note_path = notes_by_stem.get(transcript_path.stem.lower())
        if note_path is None:
            print(f"‚ùå No match for: {transcript_path.name}")
            continue

        print(f"‚úÖ Match: {transcript_path.name} ‚Üî {note_path.name}")
        transcript_text = transcript_path.read_text(encoding="utf-8")
        note_text = note_path.read_text(encoding="utf-8")
        paired.append((transcript_text, split_soap_sections(note_text)))

    print(f"Paired {len(paired)} transcript-note files.")
    return paired

In [9]:
# 🏁 Run full pipeline
if __name__ == "__main__":
    # Run the three pipeline stages in order.
    convert_textgrid_to_transcripts()
    extract_utterances()
    paired_data = pair_transcripts_and_notes()

    # Preview the first transcript/SOAP pair, if pairing produced anything.
    if len(paired_data) > 0:
        transcript, soap = paired_data[0]
        print(f"\n=== First Transcript Sample ===\n {transcript[:500]}")
        print("\n=== First SOAP Sections ===")
        for section in soap:
            content = soap[section]
            print("\n--- " + section + " ---\n" + content[:300])

Converting TextGrid files to transcripts...
/Users/khoinguyen/Documents/clinical-soapgen/transcripts/day5_consultation04_doctor.TextGrid
/Users/khoinguyen/Documents/clinical-soapgen/transcripts/day3_consultation04_doctor.TextGrid
/Users/khoinguyen/Documents/clinical-soapgen/transcripts/day5_consultation12_doctor.TextGrid
/Users/khoinguyen/Documents/clinical-soapgen/transcripts/day4_consultation02_doctor.TextGrid
/Users/khoinguyen/Documents/clinical-soapgen/transcripts/day2_consultation02_doctor.TextGrid
/Users/khoinguyen/Documents/clinical-soapgen/transcripts/day1_consultation11_doctor.TextGrid
/Users/khoinguyen/Documents/clinical-soapgen/transcripts/day2_consultation05_doctor.TextGrid
/Users/khoinguyen/Documents/clinical-soapgen/transcripts/day4_consultation05_doctor.TextGrid
/Users/khoinguyen/Documents/clinical-soapgen/transcripts/day1_consultation08_doctor.TextGrid
/Users/khoinguyen/Documents/clinical-soapgen/transcripts/day3_consultation03_doctor.TextGrid
/Users/khoinguyen/Document

 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 7035/7109 [00:13<00:00, 615.87it/s]

Writing reference transcript file...
Done!
Pairing transcripts with SOAP notes...
Found 57 transcripts and 57 notes.
‚úÖ Match: day1_consultation01.txt ‚Üî day1_consultation01.txt
‚úÖ Match: day1_consultation02.txt ‚Üî day1_consultation02.txt
‚úÖ Match: day1_consultation03.txt ‚Üî day1_consultation03.txt
‚úÖ Match: day1_consultation04.txt ‚Üî day1_consultation04.txt
‚úÖ Match: day1_consultation05.txt ‚Üî day1_consultation05.txt
‚úÖ Match: day1_consultation06.txt ‚Üî day1_consultation06.txt
‚úÖ Match: day1_consultation07.txt ‚Üî day1_consultation07.txt
‚úÖ Match: day1_consultation08.txt ‚Üî day1_consultation08.txt
‚úÖ Match: day1_consultation09.txt ‚Üî day1_consultation09.txt
‚úÖ Match: day1_consultation10.txt ‚Üî day1_consultation10.txt
‚úÖ Match: day1_consultation11.txt ‚Üî day1_consultation11.txt
‚úÖ Match: day1_consultation12.txt ‚Üî day1_consultation12.txt
‚úÖ Match: day1_consultation13.txt ‚Üî day1_consultation13.txt
‚úÖ Match: day1_consultation14.txt ‚Üî day1_consultation14.txt
‚

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7109/7109 [00:13<00:00, 523.11it/s]


## Training data generation

In [15]:
# Reuse the directories from the path-configuration cell instead of repeating
# the absolute paths; the values are the same, but they can no longer drift.
# NOTE(review): this rebinds `transcript_dir`, which the path-configuration cell
# set to <root>/transcripts. convert_textgrid_to_transcripts() and
# extract_utterances() read that global, so calling them after this cell passes
# the joined-transcripts directory instead.
transcript_dir = joined_transcripts_dir
note_dir = notes_dir
# Relative path: the JSONL file is written to the current working directory.
output_path = Path("mistral_prompt_data.jsonl")

In [16]:
# Write one JSONL record per transcript that has a same-named SOAP note.
with output_path.open("w", encoding="utf-8") as outfile:
    for transcript_file in sorted(transcript_dir.glob("*.txt")):
        base_name = transcript_file.stem
        note_file = note_dir / (base_name + ".txt")

        if note_file.exists():
            transcript = transcript_file.read_text(encoding="utf-8").strip()
            note = note_file.read_text(encoding="utf-8").strip()

            # Instruction prompt: fixed preamble, the transcript, then the task.
            prompt = (
                "Below is a conversation between a doctor and a patient.\n"
                "\n"
                "Transcript:\n"
                f"{transcript}\n"
                "\n"
                "Generate a SOAP note from this conversation."
            )

            entry = dict(prompt=prompt, response=note)
            outfile.write(f"{json.dumps(entry)}\n")
        else:
            # No reference note for this transcript: report it and move on.
            print(f"‚ùå Missing SOAP note for {base_name}")

print(f"‚úÖ Saved formatted data to {output_path.resolve()}")

‚úÖ Saved formatted data to /Users/khoinguyen/Documents/GitHub/clinical-soapgen/mistral_prompt_data.jsonl


In [17]:
# Smoke test: send the first prompt of the generated dataset to Mistral.
with open("mistral_prompt_data.jsonl", encoding="utf-8") as f:
    first_line = next(f, None)

if first_line is None:
    # An empty file previously surfaced as a bare StopIteration.
    raise ValueError("mistral_prompt_data.jsonl is empty; generate the dataset first")
example = json.loads(first_line)

response = subprocess.run(
    ["ollama", "run", "mistral"],
    input=example["prompt"],
    capture_output=True,
    text=True
)

if response.returncode != 0:
    # Show why the model call failed instead of printing an empty output.
    print(f"ollama exited with status {response.returncode}:\n{response.stderr}")

print("Model Output:\n", response.stdout)

Model Output:
  Subjective:
- The patient presented with symptoms of diarrhea for the past three days, including loose and watery stool, frequent bowel movements (6-7 times a day), lower abdominal pain on the left side that comes and goes, weakness, and shaking. He mentioned feeling shaky but not feverish and did experience vomiting at the start of the symptoms which has since stopped. The patient also reported a loss of appetite but is able to drink fluids.

Objective:
- Abdominal tenderness on left lower quadrant palpation, no rebound or guarding. Normal vital signs except for mild fever three days ago. No blood in stool or vomit.

Assessment:
- The patient appears to be suffering from gastroenteritis, possibly caused by a viral or bacterial infection, due to the presentation of diarrhea, abdominal pain, and vomiting, with no other significant past medical history. The patient reported eating at a Chinese restaurant four days ago, which may have been a trigger for the symptoms.

Plan

## Evaluation pipeline

In [18]:
# Batch generation reads prompts from `input_file` and writes results to `output_file`.
input_file, output_file = (
    "mistral_prompt_data.jsonl",
    "mistral_prompt_data_processed.jsonl",
)

In [19]:
# Loop over all 57 prompts and save Mistral's outputs
# For each prompt record, run Mistral through the ollama CLI and write a record
# holding the prompt, the reference note and the generated note.
with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
    for i, line in enumerate(infile, 1):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline); `i` still tracks
            # the input line number, so ids of other records are unchanged.
            continue

        item = json.loads(line)
        prompt = item["prompt"]
        reference = item["response"]

        print(f"üß† Generating SOAP for example {i}...")

        result = subprocess.run(
            ["ollama", "run", "mistral"],
            input=prompt,
            capture_output=True,
            text=True
        )

        if result.returncode != 0:
            # A failed call used to be recorded as an empty generation with no
            # indication; keep the record but make the failure visible.
            print(
                f"WARNING: ollama exited with status {result.returncode} "
                f"for example {i}: {result.stderr.strip()}"
            )

        generated = result.stdout.strip()

        output = {
            "id": i,
            "prompt": prompt,
            "reference": reference,
            "generated": generated
        }

        outfile.write(json.dumps(output) + "\n")
        # Flush per example so finished generations are on disk if the run is
        # interrupted part-way through.
        outfile.flush()

print(f"‚úÖ Saved all generated notes to {output_file}")

üß† Generating SOAP for example 1...
üß† Generating SOAP for example 2...
üß† Generating SOAP for example 3...
üß† Generating SOAP for example 4...
üß† Generating SOAP for example 5...
üß† Generating SOAP for example 6...
üß† Generating SOAP for example 7...
üß† Generating SOAP for example 8...
üß† Generating SOAP for example 9...
üß† Generating SOAP for example 10...
üß† Generating SOAP for example 11...
üß† Generating SOAP for example 12...
üß† Generating SOAP for example 13...
üß† Generating SOAP for example 14...
üß† Generating SOAP for example 15...
üß† Generating SOAP for example 16...
üß† Generating SOAP for example 17...
üß† Generating SOAP for example 18...
üß† Generating SOAP for example 19...
üß† Generating SOAP for example 20...
üß† Generating SOAP for example 21...
üß† Generating SOAP for example 22...
üß† Generating SOAP for example 23...
üß† Generating SOAP for example 24...
üß† Generating SOAP for example 25...
üß† Generating SOAP for example 2

In [20]:
# Evaluate with ROUGE
from rouge_score import rouge_scorer

In [21]:
# Score every generated note against its reference with ROUGE-1/2/L F-measure,
# then report the mean of each metric.
result_file = "mistral_prompt_data_processed.jsonl"
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

with open(result_file, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Skip blank lines rather than failing in json.loads.
            continue
        item = json.loads(line)
        generated = item["generated"]
        reference = item["reference"]

        # Argument order is (target, prediction).
        score = scorer.score(reference, generated)
        for key in scores:
            scores[key].append(score[key].fmeasure)

# An empty result file would otherwise fail below with ZeroDivisionError.
if not scores['rouge1']:
    raise ValueError(f"No results found in {result_file}; nothing to score")

# Calculate average scores
avg_scores = {key: sum(values) / len(values) for key, values in scores.items()}

print("Average ROUGE Scores:\n")
for key, value in avg_scores.items():
    print(f"{key}: {value:.4f}")

Average ROUGE Scores:

rouge1: 0.5898
rouge2: 0.2584
rougeL: 0.3462


## Filter out results with missing SOAP sections and build a Gradio web app

In [None]:
# Section names a generated note must mention to count as complete.
required_sections = ["Subjective", "Objective", "Assessment", "Plan"]

def has_all_sections(text):
    """Return True when every name in ``required_sections`` occurs in ``text``.

    Each name is matched case-insensitively as a whole word anywhere in the
    text; it does not have to appear as a header.
    """
    for section in required_sections:
        if re.search(rf"(?i)\b{section}\b", text) is None:
            return False
    return True

# Route each generated record to the "valid" or "incomplete" file depending on
# whether its generated note mentions all required sections.
with open("mistral_prompt_data_processed.jsonl", "r") as infile, \
     open("mistral_valid.jsonl", "w") as valid_out, \
     open("mistral_incomplete.jsonl", "w") as invalid_out:

    for line in infile:
        item = json.loads(line)
        generated = item["generated"]

        destination = valid_out if has_all_sections(generated) else invalid_out
        destination.write(f"{json.dumps(item)}\n")

for message in (
    "‚úÖ Split results into:",
    "- mistral_valid.jsonl (complete SOAP notes)",
    "- mistral_incomplete.jsonl (missing sections)",
):
    print(message)

‚úÖ Split results into:
- mistral_valid.jsonl (complete SOAP notes)
- mistral_incomplete.jsonl (missing sections)


In [22]:
import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
def generate_soap(prompt):
    """Generate a SOAP note for a pasted transcript with Mistral via ollama.

    The text is wrapped in the same instruction prompt used to build
    ``mistral_prompt_data.jsonl``. Previously the raw transcript was sent with
    no instruction, so the model was never asked to write a SOAP note.

    Args:
        prompt: Transcript of the doctor-patient conversation, as entered in
            the web form.

    Returns:
        The model's standard output with surrounding whitespace removed, or
        "" when the input is empty or whitespace-only (the model is not
        called in that case).

    Raises:
        RuntimeError: if ``ollama`` exits with a non-zero status, so the
            failure is reported instead of returning an empty note.
    """
    transcript = prompt.strip() if prompt else ""
    if not transcript:
        return ""

    full_prompt = (
        "Below is a conversation between a doctor and a patient.\n"
        "\n"
        "Transcript:\n"
        f"{transcript}\n"
        "\n"
        "Generate a SOAP note from this conversation."
    )

    result = subprocess.run(
        ["ollama", "run", "mistral"],
        input=full_prompt,
        capture_output=True,
        text=True
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"ollama exited with status {result.returncode}: {result.stderr.strip()}"
        )
    return result.stdout.strip()

# Gradio interface
# Build the web UI first, then start it: one text box in, one text box out,
# with generate_soap as the handler.
demo = gr.Interface(
    fn=generate_soap,
    inputs=gr.Textbox(lines=15, label="Paste transcript here"),
    outputs=gr.Textbox(lines=20, label="Generated SOAP note"),
    title="SOAP Note Generator",
    description="Paste a transcript of a doctor-patient conversation to generate a SOAP note using Mistral.",
)
demo.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


