In [5]:
import os
import re
import json
from pathlib import Path
import subprocess
from typing import List, Tuple

In [9]:
# Dataset root. Set the PRIMOCK57_ROOT environment variable to point the
# notebook at a different checkout; when it is unset the original hard-coded
# location is used, so existing runs behave exactly as before.
root_dir = Path(os.environ.get("PRIMOCK57_ROOT", "/Users/khoinguyen/Documents/primock57"))

# Directories derived from the root.
textGrid_dir = root_dir / 'scripts' / 'textgrid'  # not referenced by the pipeline functions in this notebook
transcript_dir = root_dir / 'transcripts'         # passed to the scripts as --transcript_path
audio_dir = root_dir / 'audio'                    # passed to extract_utterances.py as --audio_path
notes_dir = root_dir / 'notes'                    # searched for *.txt notes when pairing
output_dir = root_dir / 'output'
joined_transcripts_dir = output_dir / 'joined_transcripts'  # searched for *.txt transcripts when pairing

## Create the transcript-to-SOAP pipeline

In [None]:
# 1: Convert TextGrid files to transcript files
def convert_textgrid_to_transcripts():
    """Run ``scripts/textgrid_to_transcript.py`` in a child process.

    The script path is relative, so it is resolved against the current
    working directory. ``transcript_dir`` is passed as ``--transcript_path``
    and ``joined_transcripts_dir`` as ``--output_path``; both are read from
    the notebook globals at call time.

    Raises:
        subprocess.CalledProcessError: if the script exits with a non-zero
            status, so a failed conversion stops the pipeline instead of
            being silently ignored.
    """
    # Local import keeps this cell self-contained; sys.executable is the
    # interpreter running the kernel, which avoids picking up a different
    # "python" from PATH that may lack the script's dependencies.
    import sys

    print("Converting TextGrid files to transcripts...")
    subprocess.run(
        [
            sys.executable, "scripts/textgrid_to_transcript.py",
            f"--transcript_path={transcript_dir}",
            # Same directory that pair_transcripts_and_notes() reads from.
            f"--output_path={joined_transcripts_dir}",
        ],
        check=True,
    )



# 2: Extract utterances and align with audio
def extract_utterances():
    """Run ``scripts/extract_utterances.py`` in a child process.

    The script path is relative, so it is resolved against the current
    working directory. ``audio_dir``, ``transcript_dir`` and ``output_dir``
    are read from the notebook globals at call time and passed as
    ``--audio_path``, ``--transcript_path`` and ``--output_path``.

    Raises:
        subprocess.CalledProcessError: if the script exits with a non-zero
            status, so a failed extraction stops the pipeline instead of
            being silently ignored.
    """
    # Local import keeps this cell self-contained; sys.executable is the
    # interpreter running the kernel, which avoids picking up a different
    # "python" from PATH that may lack the script's dependencies.
    import sys

    print("Extracting utterances from transcripts...")
    subprocess.run(
        [
            sys.executable, "scripts/extract_utterances.py",
            f"--audio_path={audio_dir}",
            f"--transcript_path={transcript_dir}",
            f"--output_path={output_dir}",
        ],
        check=True,
    )
    
    
    
# 3: Parse SOAP-formatted note into sections
def split_soap_sections(note_text: str) -> dict:
    """Split a SOAP-formatted note into its four sections.

    A stripped line opens a section when it begins with the whole word
    ``Subjective``, ``Objective``, ``Assessment`` or ``Plan``
    (case-insensitive). Text that follows the header on the same line, after
    optional whitespace / ``:`` / dash separators, is kept as the first
    content of that section, so ``"Plan: rest and fluids"`` is not lost.
    Requiring a whole word means content lines such as ``"Planning a review"``
    or ``"Objectively well"`` stay in the current section instead of being
    mistaken for headers and dropped.

    Args:
        note_text: Full text of the note.

    Returns:
        A dict with exactly the keys ``"Subjective"``, ``"Objective"``,
        ``"Assessment"`` and ``"Plan"``. Each value is the concatenation of
        that section's stripped lines, each followed by a single space
        (blank lines therefore contribute one space). Lines that appear
        before the first header are discarded; sections that never appear
        map to ``""``.
    """
    header_re = re.compile(
        r"^(subjective|objective|assessment|plan)\b[\s:\-\u2013\u2014]*(.*)$",
        re.IGNORECASE,
    )

    soap = {"Subjective": "", "Objective": "", "Assessment": "", "Plan": ""}
    current_section = None

    for raw_line in note_text.splitlines():
        line = raw_line.strip()
        header = header_re.match(line)
        if header:
            # "PLAN" / "plan" / "Plan" all map onto the canonical dict key.
            current_section = header.group(1).capitalize()
            inline_text = header.group(2)
            if inline_text:
                soap[current_section] += inline_text + " "
        elif current_section:
            soap[current_section] += line + " "

    return soap


# 4: Pair transcripts with SOAP notes
def pair_transcripts_and_notes() -> List[Tuple[str, dict]]:
    """Match each joined transcript with the note that shares its file stem.

    ``*.txt`` files are collected from ``joined_transcripts_dir`` and
    ``notes_dir``. Stems are compared in lower case. Every transcript with a
    matching note is read as UTF-8 together with that note, and the note is
    split with ``split_soap_sections``. Progress and per-file match results
    are printed.

    Returns:
        A list of ``(transcript_text, soap_sections)`` tuples in sorted
        transcript-path order.
    """
    print("Pairing transcripts with SOAP notes...")

    transcript_paths = sorted(joined_transcripts_dir.glob("*.txt"))

    # Lower-cased stem (e.g. "day1_consultation01") -> note path.
    notes_by_stem = {}
    for note_path in sorted(notes_dir.glob("*.txt")):
        notes_by_stem[note_path.stem.lower()] = note_path
    print(f"Found {len(transcript_paths)} transcripts and {len(notes_by_stem)} notes.")

    paired = []
    for transcript_path in transcript_paths:
        matched_note = notes_by_stem.get(transcript_path.stem.lower())
        if matched_note is None:
            print(f"❌ No match for: {transcript_path.name}")
            continue

        print(f"✅ Match: {transcript_path.name} ↔ {matched_note.name}")
        transcript_text = transcript_path.read_text(encoding="utf-8")
        note_text = matched_note.read_text(encoding="utf-8")
        paired.append((transcript_text, split_soap_sections(note_text)))

    print(f"Paired {len(paired)} transcript-note files.")
    return paired

In [18]:
# 🏁 Run full pipeline
if __name__ == "__main__":
    # The first two stages shell out to the project scripts; the third
    # builds the (transcript, SOAP sections) pairs in memory.
    convert_textgrid_to_transcripts()
    extract_utterances()
    paired_data = pair_transcripts_and_notes()

    # Preview only the first pair, truncated so the cell output stays short.
    if len(paired_data) > 0:
        transcript, soap = paired_data[0]
        print("\n=== First Transcript Sample ===\n", transcript[:500])
        print("\n=== First SOAP Sections ===")
        for section in soap:
            content = soap[section]
            print(f"\n--- {section} ---\n{content[:300]}")

Converting TextGrid files to transcripts...
/Users/khoinguyen/Documents/primock57/transcripts/day5_consultation04_doctor.TextGrid
/Users/khoinguyen/Documents/primock57/transcripts/day3_consultation04_doctor.TextGrid
/Users/khoinguyen/Documents/primock57/transcripts/day5_consultation12_doctor.TextGrid
/Users/khoinguyen/Documents/primock57/transcripts/day4_consultation02_doctor.TextGrid
/Users/khoinguyen/Documents/primock57/transcripts/day2_consultation02_doctor.TextGrid
/Users/khoinguyen/Documents/primock57/transcripts/day1_consultation11_doctor.TextGrid
/Users/khoinguyen/Documents/primock57/transcripts/day2_consultation05_doctor.TextGrid
/Users/khoinguyen/Documents/primock57/transcripts/day4_consultation05_doctor.TextGrid
/Users/khoinguyen/Documents/primock57/transcripts/day1_consultation08_doctor.TextGrid
/Users/khoinguyen/Documents/primock57/transcripts/day3_consultation03_doctor.TextGrid
/Users/khoinguyen/Documents/primock57/transcripts/day1_consultation07_doctor.TextGrid
/Users/kho

 98%|█████████▊| 6999/7109 [00:12<00:00, 714.20it/s]

Writing reference transcript file...
Done!
Pairing transcripts with SOAP notes...
Found 57 transcripts and 57 notes.
✅ Match: day1_consultation01.txt ↔ day1_consultation01.txt
✅ Match: day1_consultation02.txt ↔ day1_consultation02.txt
✅ Match: day1_consultation03.txt ↔ day1_consultation03.txt
✅ Match: day1_consultation04.txt ↔ day1_consultation04.txt
✅ Match: day1_consultation05.txt ↔ day1_consultation05.txt
✅ Match: day1_consultation06.txt ↔ day1_consultation06.txt
✅ Match: day1_consultation07.txt ↔ day1_consultation07.txt
✅ Match: day1_consultation08.txt ↔ day1_consultation08.txt
✅ Match: day1_consultation09.txt ↔ day1_consultation09.txt
✅ Match: day1_consultation10.txt ↔ day1_consultation10.txt
✅ Match: day1_consultation11.txt ↔ day1_consultation11.txt
✅ Match: day1_consultation12.txt ↔ day1_consultation12.txt
✅ Match: day1_consultation13.txt ↔ day1_consultation13.txt
✅ Match: day1_consultation14.txt ↔ day1_consultation14.txt
✅ Match: day1_consultation15.txt ↔ day1_consultation15.tx

100%|██████████| 7109/7109 [00:12<00:00, 555.91it/s]


## Training data generation

In [21]:
# Reuse the directories defined in the configuration cell instead of repeating
# the absolute paths, so the dataset location is defined in one place only.
#
# NOTE: this rebinds the global `transcript_dir` (previously the raw
# transcripts directory). convert_textgrid_to_transcripts() and
# extract_utterances() read that global when they are called, so re-running
# them after this cell would pass the joined-transcripts directory as
# --transcript_path. Re-run the configuration cell first in that case.
transcript_dir = joined_transcripts_dir
note_dir = notes_dir
# Relative path: the JSONL file is created in the current working directory.
output_path = Path("mistral_prompt_data.jsonl")

In [25]:
# Write one JSON line per transcript that has a same-named note:
# {"prompt": <instruction wrapping the transcript>, "response": <note text>}.
with output_path.open("w", encoding="utf-8") as outfile:
    for transcript_file in sorted(transcript_dir.glob("*.txt")):
        base_name = transcript_file.stem
        note_file = note_dir / f"{base_name}.txt"

        if note_file.exists():
            transcript = transcript_file.read_text(encoding="utf-8").strip()
            note = note_file.read_text(encoding="utf-8").strip()

            # Same text as a single multi-line template, spelled out line by line.
            prompt = (
                "Below is a conversation between a doctor and a patient.\n"
                "\n"
                "Transcript:\n"
                f"{transcript}\n"
                "\n"
                "Generate a SOAP note from this conversation."
            )

            entry = {"prompt": prompt, "response": note}
            outfile.write(f"{json.dumps(entry)}\n")
        else:
            # Transcripts without a note are reported and left out of the file.
            print(f"❌ Missing SOAP note for {base_name}")

print(f"✅ Saved formatted data to {output_path.resolve()}")

✅ Saved formatted data to /Users/khoinguyen/Documents/primock57/mistral_prompt_data.jsonl


In [None]:
# Smoke test: send the first prompt of the JSONL file to the local Mistral
# model through the Ollama CLI and print what it returns.
with open("mistral_prompt_data.jsonl", encoding="utf-8") as f:
    first_line = next(f, "")

if not first_line.strip():
    # Clear message instead of a bare StopIteration / JSONDecodeError.
    raise ValueError("mistral_prompt_data.jsonl contains no examples")

example = json.loads(first_line)

response = subprocess.run(
    ["ollama", "run", "mistral"],
    input=example["prompt"],
    capture_output=True,
    text=True,
    # Explicit UTF-8 so the pipes do not depend on the platform locale.
    encoding="utf-8",
)

# stderr is captured, so without this check a failed CLI call (for example a
# missing model or a stopped server) would just print an empty result.
if response.returncode != 0:
    raise RuntimeError(
        f"ollama exited with status {response.returncode}: {response.stderr.strip()}"
    )

print("Model Output:\n", response.stdout)

Model Output:
  Subjective: The patient presented with symptoms of diarrhea (loose and watery stool), abdominal pain (cramp-like sensation in the lower left side), weakness, and shakiness for the past three days. He also reported a loss of appetite but has been able to maintain fluid intake. Vomiting was initially present but has since stopped. The patient did not report any blood in either his stools or vomit. The symptoms started around 3-4 days ago, and he remembers having takeaway from a Chinese restaurant during that time. He is an accountant and has been going to work despite the difficulties. He uses an inhaler for asthma, which is well-controlled.

Objective: On physical examination, no abnormalities were noted. The patient's temperature was not measured, but he reported feeling hot around the onset of symptoms without a formal measurement.

Assessment: The patient is likely suffering from gastroenteritis, possibly triggered by the Chinese takeaway he consumed. This condition t

## Evaluation pipeline

In [28]:
# Prompt/response JSONL read line by line by the generation loop below.
input_file = "mistral_prompt_data.jsonl"
# JSONL written by the generation loop: one record per prompt holding the
# prompt, the reference note and the generated note.
output_file = "mistral_prompt_data_processed.jsonl"

In [None]:
#  Looping over all 57 prompts and saving Mistral’s outputs
# Ids of examples whose generation failed or came back empty; reported after
# the loop so that silent failures do not skew the later evaluation.
failed_ids = []

with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
    for i, line in enumerate(infile, 1):
        item = json.loads(line)
        prompt = item["prompt"]
        reference = item["response"]

        print(f"🧠 Generating SOAP for example {i}...")

        result = subprocess.run(
            ["ollama", "run", "mistral"],
            input=prompt,
            capture_output=True,
            text=True,
            # Explicit UTF-8 so the pipes do not depend on the platform locale.
            encoding="utf-8",
        )

        generated = result.stdout.strip()

        # Keep going (one bad example should not discard the whole run), but
        # make the failure visible: stderr is captured and would otherwise be lost.
        if result.returncode != 0 or not generated:
            failed_ids.append(i)
            print(
                f"⚠️ Example {i}: ollama exit status {result.returncode}, "
                f"{len(generated)} characters generated. stderr: {result.stderr.strip()[:300]}"
            )

        output = {
            "id": i,
            "prompt": prompt,
            "reference": reference,
            "generated": generated
        }

        outfile.write(json.dumps(output) + "\n")
        # Flush per record so finished examples survive an interrupted run.
        outfile.flush()

print(f"✅ Saved all generated notes to {output_file}")
if failed_ids:
    print(f"⚠️ {len(failed_ids)} example(s) failed or produced no text: {failed_ids}")

🧠 Generating SOAP for example 1...
🧠 Generating SOAP for example 2...
🧠 Generating SOAP for example 3...
🧠 Generating SOAP for example 4...
🧠 Generating SOAP for example 5...
🧠 Generating SOAP for example 6...
🧠 Generating SOAP for example 7...
🧠 Generating SOAP for example 8...
🧠 Generating SOAP for example 9...
🧠 Generating SOAP for example 10...
🧠 Generating SOAP for example 11...
🧠 Generating SOAP for example 12...
🧠 Generating SOAP for example 13...
🧠 Generating SOAP for example 14...
🧠 Generating SOAP for example 15...
🧠 Generating SOAP for example 16...
🧠 Generating SOAP for example 17...
🧠 Generating SOAP for example 18...
🧠 Generating SOAP for example 19...
🧠 Generating SOAP for example 20...
🧠 Generating SOAP for example 21...
🧠 Generating SOAP for example 22...
🧠 Generating SOAP for example 23...
🧠 Generating SOAP for example 24...
🧠 Generating SOAP for example 25...
🧠 Generating SOAP for example 26...
🧠 Generating SOAP for example 27...
🧠 Generating SOAP for example 28...
🧠

In [3]:
# Evaluate with ROUGE
from rouge_score import rouge_scorer

In [7]:
# Score every generated note against its reference and report the mean
# F-measure for ROUGE-1, ROUGE-2 and ROUGE-L.
result_file = "mistral_prompt_data_processed.jsonl"
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

with open(result_file, "r", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)
        generated = item["generated"]
        reference = item["reference"]

        # Argument order is (target, prediction).
        score = scorer.score(reference, generated)
        for key in scores:
            scores[key].append(score[key].fmeasure)

# Calculate average scores. An empty results file (for example after an
# interrupted generation run) yields NaN instead of a ZeroDivisionError.
avg_scores = {
    key: (sum(values) / len(values) if values else float("nan"))
    for key, values in scores.items()
}

if not scores['rouge1']:
    print(f"⚠️ No records found in {result_file}; averages are undefined.")

print("Average ROUGE Scores:\n")
for key, value in avg_scores.items():
    print(f"{key}: {value:.4f}")

Average ROUGE Scores:

rouge1: 0.5956
rouge2: 0.2622
rougeL: 0.3494


## Filter results that are missing SOAP sections and use Gradio to create a web app

In [9]:
required_sections = ["Subjective", "Objective", "Assessment", "Plan"]

def has_all_sections(text):
    """Return True when ``text`` has a header for every required SOAP section.

    A section counts as present only when its name appears as a header,
    matched case-insensitively as a whole word, in one of two forms:

    * at the start of a line, optionally preceded by non-letter markup such
      as ``**``, ``#``, ``-`` or ``1.``;
    * anywhere in the text when immediately followed by a colon, which covers
      notes written on a single line (``"... Assessment: x. Plan: y"``).

    The section word merely occurring in prose (``"treatment plan"``,
    ``"risk assessment was done"``) no longer counts, so notes that lack a
    real section are not classified as complete.

    Args:
        text: Generated note text.

    Returns:
        bool: True if every name in ``required_sections`` has a header.
    """
    for section in required_sections:
        name = re.escape(section)
        header_pattern = rf"(?im)(?:^[^A-Za-z\n]*{name}\b|\b{name}\s*:)"
        if re.search(header_pattern, text) is None:
            return False
    return True

# Route every generated record to one of two files depending on whether its
# generated note passes has_all_sections().
with open("mistral_prompt_data_processed.jsonl", "r") as infile:
    with open("mistral_valid.jsonl", "w") as valid_out:
        with open("mistral_incomplete.jsonl", "w") as invalid_out:
            for line in infile:
                item = json.loads(line)
                generated = item["generated"]

                # Choose the destination first, then write the record once.
                destination = valid_out if has_all_sections(generated) else invalid_out
                destination.write(f"{json.dumps(item)}\n")

print("✅ Split results into:")
print("- mistral_valid.jsonl (complete SOAP notes)")
print("- mistral_incomplete.jsonl (missing sections)")

✅ Split results into:
- mistral_valid.jsonl (complete SOAP notes)
- mistral_incomplete.jsonl (missing sections)
