In [None]:
# Mount Google Drive at /content/drive; the data paths used later in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Libraries

In [None]:
import os
from pathlib import Path
import shutil
import subprocess
from IPython.display import Javascript
from IPython.display import Audio
import re
from google.colab import output
from base64 import b64decode
import json

In [None]:
!pip install -q --upgrade pip
!pip install -q --upgrade transformers datasets[audio] accelerate
# !pip install -q --upgrade transformers accelerate
!pip install -q torch torchvision torchaudio
!pip install -q pyannote.audio
!pip install -q -U openai-whisper

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m76.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m67.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m167.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m182.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m79.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m161.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

- Speech2Text

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import whisper
from whisper import load_model

- Speaker Diarization

In [None]:
from pyannote.audio import Pipeline

# FUNCTIONS

### Helpers

In [None]:
def convert_to_wav(input_path: str, output_path: "str | None" = None) -> str:
    """Convert an audio file to a mono, 16 kHz WAV file using ffmpeg.

    Args:
        input_path: Path of the audio file to convert.
        output_path: Destination path. Whatever suffix it carries is replaced
            with ``.wav``. When omitted, the WAV file is written next to
            ``input_path`` under the same base name.

    Returns:
        The path of the WAV file, as a string.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Ensure the output path has .wav extension
    destination = Path(input_path if output_path is None else output_path).with_suffix(".wav")

    # ffmpeg does not create missing directories, so make sure the target exists.
    destination.parent.mkdir(parents=True, exist_ok=True)
    output_wav_path = str(destination)

    # -y: overwrite an existing output, -ac 1: mono, -ar 16000: 16 kHz sample rate.
    subprocess.run([
        "ffmpeg", "-y", "-i", input_path,
        "-ac", "1", "-ar", "16000", output_wav_path
    ], check=True)

    return output_wav_path

In [None]:
def convert_all_mp3_to_wav(input_dir, output_dir):
    """
    Convert all .mp3 files in input_dir to .wav and save them in output_dir.

    output_dir is created if it does not exist. Conversion is best-effort:
    a file that fails to convert is reported and the loop continues with
    the next one.

    Parameters:
        input_dir (str or Path): Directory searched (non-recursively) for *.mp3 files.
        output_dir (str or Path): Directory that receives the .wav files.
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for mp3_file in sorted(input_dir.glob("*.mp3")):
        try:
            # convert_to_wav needs the destination path; write straight into
            # output_dir instead of converting elsewhere and moving the file.
            target_path = (output_dir / mp3_file.name).with_suffix(".wav")
            final_wav_path = Path(convert_to_wav(str(mp3_file), str(target_path)))

            print(f"✅ Converted: {mp3_file.name} → {final_wav_path.name}")
        except Exception as e:
            # Keep going so one bad file does not abort the whole batch.
            print(f"❌ Failed to convert {mp3_file.name}: {e}")

In [None]:
def save_to_json(data, filename):
    """
    Serialize data as JSON (4-space indentation) into filename,
    replacing any existing file content.
    """
    with open(filename, mode='w') as out_file:
        json.dump(data, out_file, indent=4)

In [None]:
def load_from_json(filename):
    """
    Read filename and return the parsed JSON content
    (in this notebook: a saved conversation).
    """
    with open(filename, mode='r') as in_file:
        parsed = json.load(in_file)
    return parsed

### Transcribe

In [None]:
def transcribe_with_whisper(audio_path, model, language="en"):
    """
    Transcribe audio_path with model and flatten the result into word tokens.

    Parameters:
        audio_path: Audio file handed to model.transcribe.
        model: Object exposing transcribe(path, word_timestamps=..., language=...).
        language (str): Language code forwarded to the model, default 'en'.

    Returns:
        list of dict: One {'word', 'start', 'end'} entry per recognised word,
        in segment order, with surrounding whitespace stripped from the word
        and both times rounded to two decimals.
    """
    # word_timestamps=True asks for per-word timing inside every segment.
    transcription = model.transcribe(audio_path, word_timestamps=True, language=language)

    return [
        {
            "word": entry["word"].strip(),
            "start": round(entry["start"], 2),
            "end": round(entry["end"], 2),
        }
        for segment in transcription["segments"]
        for entry in segment["words"]
    ]

### Diarize

In [None]:
def diarize_with_pyannote(audio_path, pipeline, device="cuda"):
    """
    Run a speaker-diarization pipeline on one audio file.

    Parameters:
        audio_path: Path passed to the pipeline under the 'audio' key.
        pipeline: Diarization pipeline; it is moved to `device` and then called.
        device (str): Torch device string, default 'cuda'.

    Returns:
        list of dict: One {'start', 'end', 'speaker'} entry per track yielded by
        the result's itertracks(yield_label=True), times rounded to two decimals.
    """
    pipeline.to(torch.device(device))
    annotation = pipeline({"audio": audio_path})

    segments = []
    # Each item is (turn, track, label); the track identifier is not needed here.
    for turn, _track, label in annotation.itertracks(yield_label=True):
        segments.append({
            "start": round(turn.start, 2),
            "end": round(turn.end, 2),
            "speaker": label,
        })
    return segments

In [None]:
def assign_speakers(tokens, segments):
    """
    For each token, find the diarization segment it falls into.
    If no segment covers its start time, assign 'UNK'.

    Parameters:
        tokens (list of dict): Word tokens with at least a 'start' key. The
            single forward pass below assumes they are ordered by start time.
        segments (list of dict): Diarization segments with 'start', 'end' and
            'speaker' keys, in any order. May be empty.

    Returns:
        list of dict: A copy of every token with a 'speaker' key added.
        The input tokens are not modified.
    """
    # sort segments by start time
    segments = sorted(segments or [], key=lambda x: x["start"])

    # Without any segment there is nothing to index into: every token is unknown.
    if not segments:
        return [{**token, "speaker": "UNK"} for token in tokens]

    diarized_tokens = []
    idx = 0
    for token in tokens:
        # advance idx until segment might cover token
        while idx + 1 < len(segments) and segments[idx]["end"] < token["start"]:
            idx += 1
        seg = segments[idx]
        speaker = seg["speaker"] if seg["start"] <= token["start"] <= seg["end"] else "UNK"
        diarized_tokens.append({**token, "speaker": speaker})
    return diarized_tokens

In [None]:
def build_diarized_transcript(diarized_tokens):
    """
    Collapse runs of consecutive same-speaker tokens into utterances.

    Each utterance is a new dict {speaker, start, end, text}: it starts at the
    first token of the run, ends at the last one, and its text is the run's
    words joined by single spaces. An empty input yields an empty list.
    """
    if not diarized_tokens:
        return []

    utterances = []
    for token in diarized_tokens:
        same_speaker = bool(utterances) and utterances[-1]["speaker"] == token["speaker"]
        if same_speaker:
            # Extend the utterance currently being built.
            open_utterance = utterances[-1]
            open_utterance["end"] = token["end"]
            open_utterance["text"] += " " + token["word"]
            continue
        # Speaker changed (or very first token): open a new utterance.
        utterances.append({
            "speaker": token["speaker"],
            "start":   token["start"],
            "end":     token["end"],
            "text":    token["word"],
        })
    return utterances

### Clean conversation

In [None]:
def merge_unk_into_next(utterances):
    """
    Given a list of {'speaker','start','end','text'} utterances,
    merge any UNK utterance into the utterance that follows it.

    The UNK text is prepended to the following utterance, which also takes
    over the UNK start time. An UNK utterance at the very end has nothing to
    merge into and is kept as is.

    The input list and its dicts are left untouched (the result holds copies),
    so calling this again on the same input gives the same result instead of
    prepending the UNK text a second time.

    Returns:
        list of dict: New utterance dicts with the UNK entries merged away.
    """
    merged = []
    pending = None  # UNK utterance waiting to be folded into the next one
    last_index = len(utterances) - 1
    for i, original in enumerate(utterances):
        utt = dict(original)  # work on a copy so the caller's data is not modified
        if pending is not None:
            # prepend the UNK text and adjust the start time
            utt["text"] = pending["text"] + " " + utt["text"]
            utt["start"] = pending["start"]
            pending = None
        if utt["speaker"] == "UNK" and i < last_index:
            # hold it back; it is merged into the following utterance
            pending = utt
        else:
            merged.append(utt)
    return merged

In [None]:
def add_utterance_ids(utterances, prefix="U"):
    """
    Tag every utterance with a sequential identifier, numbering from 1.

    The dicts are updated in place: each gains an 'utterance_id' key holding
    the prefix followed by its 1-based position (e.g. 'U1', 'U2', ...).

    Parameters:
        utterances (list of dict): Utterance dictionaries to tag.
        prefix (str): Text placed before the number, default 'U'.

    Returns:
        list of dict: The same `utterances` object that was passed in.
    """
    position = 0
    for utterance in utterances:
        position += 1
        utterance["utterance_id"] = "{}{}".format(prefix, position)
    return utterances

In [None]:
def format_time(t):
    """Render a duration in seconds as zero-padded 'MM:SS', dropping any fractional part."""
    whole_seconds = int(t)
    minutes = whole_seconds // 60
    seconds = whole_seconds % 60
    return "{:02d}:{:02d}".format(minutes, seconds)

def clean_text(text):
    """
    Tidy the spacing of a transcript string.

    Applied in order: whitespace around hyphens is removed, whitespace before
    , . ! ? ; : is removed, runs of whitespace are collapsed to one space,
    and finally leading/trailing whitespace is stripped.
    """
    substitutions = (
        (r'\s*-\s*', '-'),              # "co - worker" -> "co-worker"
        (r'\s+([,\.!?;:])', r'\1'),     # "well , yes" -> "well, yes"
        (r'\s{2,}', ' '),               # collapse repeated whitespace
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def format_conversation(utterances):
    """
    Print a transcript as a chat log, one line per utterance.

    Each dict needs the keys 'speaker', 'start', 'end' and 'text'. Times are
    shown through format_time and the text is passed through clean_text.
    Nothing is returned.
    """
    for utterance in utterances:
        begins = format_time(utterance["start"])
        finishes = format_time(utterance["end"])
        who = utterance["speaker"]
        said = clean_text(utterance["text"].strip())
        print(f"[{begins}s–{finishes}s] {who}: {said}")

In [None]:
def format_conversation_2(utterances):
    """
    Print a transcript grouped by speaker, without timestamps.

    A blank line plus a "<speaker>:" header is printed whenever the speaker
    differs from the previous utterance's; the cleaned text of every
    utterance follows on its own line. Empty input prints nothing.
    """
    previous_speaker = None
    for utterance in utterances or ():
        who = utterance["speaker"]
        said = clean_text(utterance["text"].strip())

        if who != previous_speaker:
            # Speaker changed: start a new group.
            print(f"\n{who}:")
            previous_speaker = who
        print(said)

# MAIN

In [None]:
from google.colab import userdata

## Initialization

In [None]:
# Pick the first CUDA GPU when torch reports one, otherwise fall back to the CPU.
# The bare `device` on the last line displays the choice as the cell output.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [None]:
# Load the Whisper checkpoint named by `model_name` onto the device selected above.
# The recorded output shows a 2.88 GB download for this checkpoint.
model_name="large-v3"
model_whisper = load_model(model_name, device=device)

100%|█████████████████████████████████████| 2.88G/2.88G [00:33<00:00, 92.1MiB/s]


In [None]:
# The Hugging Face token is read through google.colab's `userdata` rather than
# being written into the notebook source.
HF_TOKEN = userdata.get('HF_TOKEN_2')
# NOTE(review): the recorded output warns that this pipeline's models were trained
# with pyannote.audio 0.0.1 / torch 1.10 while 3.3.2 / 2.6.0 are installed.
# Consider a pipeline released for pyannote.audio 3.x — TODO confirm.
pipeline_name = "pyannote/speaker-diarization"
pipeline_diarization = Pipeline.from_pretrained(pipeline_name, use_auth_token=HF_TOKEN)

config.yaml:   0%|          | 0.00/500 [00:00<?, ?B/s]

DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover


pytorch_model.bin:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/318 [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.6.0+cu124. Bad things might happen unless you revert torch to 1.x.


hyperparams.yaml:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/hyperparams.yaml' -> '/root/.cache/torch/pyannote/speechbrain/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _load
DEBUG:speechbrain.utils.checkpoints:Registered parameter transfer hook for _load
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load_if_possible
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in /root/.cache/torch/pyann

embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/embedding_model.ckpt' -> '/root/.cache/torch/pyannote/speechbrain/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /root/.cache/torch/pyannote/speechbrain/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


mean_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/mean_var_norm_emb.ckpt' -> '/root/.cache/torch/pyannote/speechbrain/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /root/.cache/torch/pyannote/speechbrain/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/classifier.ckpt' -> '/root/.cache/torch/pyannote/speechbrain/classifier.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["classifier"] = /root/.cache/torch/pyannote/speechbrain/classifier.ckpt
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/label_encoder.txt' -> '/root/.cache/torch/pyannote/speechbrain/label_encoder.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["label_encoder"] = /root/.cache/torch/pyannote/speechbrain/label_encoder.ckpt
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, classifier, label_encoder
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): embedding_model -> /root/.cache/torch/pyannote/speechbrain/embedding_model.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): mean_var_norm_emb -> /root/.cache/torch/pyannote/speechbrain/mean_var_norm_emb.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): classifier -> /root/.cac

## Single file

In [None]:
# convert mp3 to wav
audio_name = 'encounter_chest_pain'
input_path = f"/content/drive/MyDrive/ClinicalNotesGen/Data/audios/en/mp3/{audio_name}.mp3"
output_path = f"/content/drive/MyDrive/ClinicalNotesGen/Data/audios/en/wav/{audio_name}.wav"
wav_file_path = convert_to_wav(input_path, output_path)
print(f"WAV saved at: {wav_file_path}")

WAV saved at: /content/drive/MyDrive/ClinicalNotesGen/Data/audios/en/wav/encounter_chest_pain.wav


### Transcribe and diarize

In [None]:
def process_audio(audio_path, data_root="/content/drive/MyDrive/ClinicalNotesGen/Data", device=None):
    """
    Transcribe and diarize one recording, then save the merged transcript.

    Three JSON files are written under `data_root`, all named after the base
    name of `audio_path` (not after any notebook-level variable, so the saved
    files always match the audio that was actually processed):

        word_recognition/tokens_<name>.json        word-level Whisper tokens
        speaker_recognition/segments_<name>.json   speaker segments
        transcripts/en/<name>_transcript.json      merged utterances with ids

    Uses the notebook-level `model_whisper` and `pipeline_diarization` objects.

    Parameters:
        audio_path (str): Path of the audio file to process.
        data_root (str or Path): Folder that holds the output sub-folders.
        device (str or None): Torch device for diarization. When None, 'cuda'
            is used if available, otherwise 'cpu'.

    Returns:
        str: Path of the saved transcript JSON file.
    """
    recording_name = Path(audio_path).stem
    data_root = Path(data_root)
    if device is None:
        # Do not assume a GPU: fall back to the CPU when CUDA is unavailable.
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Whisper
    tokens = transcribe_with_whisper(audio_path, model_whisper, "en")
    whisper_tokens_path = str(data_root / "word_recognition" / f"tokens_{recording_name}.json")
    save_to_json(tokens, whisper_tokens_path)

    # Pyannote
    segments = diarize_with_pyannote(audio_path, pipeline_diarization, device=device)
    pyannote_segments_path = str(data_root / "speaker_recognition" / f"segments_{recording_name}.json")
    save_to_json(segments, pyannote_segments_path)

    # Process
    diarized_tokens = assign_speakers(tokens, segments)
    raw_utterances = build_diarized_transcript(diarized_tokens)
    clean_utterances = merge_unk_into_next(raw_utterances)
    clean_utterances = add_utterance_ids(clean_utterances, "U")

    combine_transcript_path = str(data_root / "transcripts" / "en" / f"{recording_name}_transcript.json")
    save_to_json(clean_utterances, combine_transcript_path)
    return combine_transcript_path

In [None]:
# Process one recording end to end, then reload the saved transcript for display.
# `audio_name` selects which .wav file under the Drive audio folder is used.
audio_name = 'abdominal_pain_history'
audio_path = f"/content/drive/MyDrive/ClinicalNotesGen/Data/audios/en/wav/{audio_name}.wav"
combine_transcript_path = process_audio(audio_path)
diaglogue = load_from_json(combine_transcript_path)
diaglogue

INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/root/.cache/torch/pyannote/speechbrain/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in /root/.cache/torch/pyannote/speechbrain.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/root/.cache/torch/pyannote/speechbrain/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /root/.cache/torch/pyannote/speechbrain/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Using symlink found at '/root/.cache/torch/pyannote/speechbrain/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /root/.cache/torch/pyannote/speechbrain/mean_var_norm_

[{'speaker': 'SPEAKER_00',
  'start': 0.88,
  'end': 7.72,
  'text': "Hi, my name's Grace. I'm one of the medical students in the A &E department. Do you mind just confirming your name and date of birth for me? Yeah, it's",
  'utterance_id': 'U1'},
 {'speaker': 'SPEAKER_01',
  'start': 7.72,
  'end': 12.08,
  'text': "Caroline Martin. It's the 24th of April, 85. Nice",
  'utterance_id': 'U2'},
 {'speaker': 'SPEAKER_00',
  'start': 12.08,
  'end': 15.2,
  'text': 'to meet you, Caroline. How can I help you today? Yeah,',
  'utterance_id': 'U3'},
 {'speaker': 'SPEAKER_01',
  'start': 15.36,
  'end': 24.86,
  'text': "I just had this sudden pain that started this morning, just after the school run. It was so severe, I've just come straight here. Oh,",
  'utterance_id': 'U4'},
 {'speaker': 'SPEAKER_00',
  'start': 24.88,
  'end': 28.18,
  'text': "I'm sorry to hear it. Have you had some painkillers? Luckily,",
  'utterance_id': 'U5'},
 {'speaker': 'SPEAKER_01',
  'start': 28.46,
  'end': 33

In [None]:
# Same steps for the next recording; this overwrites `audio_name`, `audio_path`,
# `combine_transcript_path` and `diaglogue` from the previous cell.
audio_name = 'sexual_health_history'
audio_path = f"/content/drive/MyDrive/ClinicalNotesGen/Data/audios/en/wav/{audio_name}.wav"
combine_transcript_path = process_audio(audio_path)
diaglogue = load_from_json(combine_transcript_path)
diaglogue

INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/root/.cache/torch/pyannote/speechbrain/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in /root/.cache/torch/pyannote/speechbrain.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/root/.cache/torch/pyannote/speechbrain/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /root/.cache/torch/pyannote/speechbrain/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Using symlink found at '/root/.cache/torch/pyannote/speechbrain/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /root/.cache/torch/pyannote/speechbrain/mean_var_norm_

[{'speaker': 'SPEAKER_00',
  'start': 0.78,
  'end': 7.84,
  'text': "Hi, my name's Grace, I'm one of the medical students at the Sexual Health Clinic. Hello. Hi, could I just confirm your name and date of birth, please? Yeah,",
  'utterance_id': 'U1'},
 {'speaker': 'SPEAKER_01',
  'start': 7.9,
  'end': 11.9,
  'text': "it's Holly Burton, it's the 19th of February, 2003. Lovely,",
  'utterance_id': 'U2'},
 {'speaker': 'SPEAKER_00',
  'start': 12.0,
  'end': 49.0,
  'text': "nice to meet you, Holly. Thank you. Before we start, I'm just going to chat to you a bit about what we do here at the Sexual Health Clinic. So, I want to let you know that everything we discuss today is confidential within the healthcare team. Unless you say something that makes me think that there's a risk to you or to other people, then I might have to break confidentiality, but I would always talk to you about that first. Does that make sense? Yeah. And the other thing is, because you're in a Sexual Health Clini

In [None]:
# Same steps for a third recording; `diaglogue` now holds this transcript, and it
# is the value the formatting cells further down read.
audio_name = 'type_2_diabetes'
audio_path = f"/content/drive/MyDrive/ClinicalNotesGen/Data/audios/en/wav/{audio_name}.wav"
combine_transcript_path = process_audio(audio_path)
diaglogue = load_from_json(combine_transcript_path)
diaglogue

INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/root/.cache/torch/pyannote/speechbrain/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in /root/.cache/torch/pyannote/speechbrain.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/root/.cache/torch/pyannote/speechbrain/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /root/.cache/torch/pyannote/speechbrain/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Using symlink found at '/root/.cache/torch/pyannote/speechbrain/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /root/.cache/torch/pyannote/speechbrain/mean_var_norm_

[{'speaker': 'SPEAKER_01',
  'start': 1.58,
  'end': 8.62,
  'text': "Hi there, my name's Leah, I'm one of the junior doctors working in the GP surgery. Is it okay if I just check your name and date of birth please? Yeah,",
  'utterance_id': 'U1'},
 {'speaker': 'SPEAKER_00',
  'start': 8.62,
  'end': 16.98,
  'text': "so it's Camilla, Camilla Weldon, and it's the 3rd of May 1977. Nice to meet you. Is it okay if I call you Camilla today? Yeah, of course. Fabulous.",
  'utterance_id': 'U2'},
 {'speaker': 'SPEAKER_01',
  'start': 17.46,
  'end': 19.12,
  'text': 'So how can I help you today Camilla?',
  'utterance_id': 'U3'},
 {'speaker': 'SPEAKER_00',
  'start': 19.58,
  'end': 37.56,
  'text': 'Yeah, so the doctor I saw last week, he rang me yesterday to say that I had some blood tests and he just said that the blood tests said I had diabetes. So it was just to come and have a chat to you about, really about that. Okay,',
  'utterance_id': 'U4'},
 {'speaker': 'SPEAKER_01',
  'start': 37

### Format conversation

In [None]:
# Print the transcript currently held in `diaglogue`, one timestamped line per utterance.
# NOTE(review): the output recorded below shows a different conversation than the
# transcript loaded in the preceding cell — it looks stale; re-run to confirm.
format_conversation(diaglogue)

[00:11s–00:16s] SPEAKER_00: So, what's new, Mark? How is your new job going?
[00:17s–00:24s] SPEAKER_01: To be honest, I can't complain. I really love the company that I am working for.
[00:25s–00:32s] SPEAKER_01: My co-workers are all really friendly and helpful. They really help me feel welcome.
[00:33s–00:36s] SPEAKER_01: It's a really energetic and fun atmosphere.
[00:38s–00:39s] SPEAKER_01: My boss is hilarious.
[00:40s–00:42s] SPEAKER_01: And he's really flexible.
[00:43s–00:46s] SPEAKER_01: Really? How so?
[00:47s–00:52s] SPEAKER_01: He allows me to come in when I want and make my own hours.
[00:53s–01:08s] SPEAKER_01: I can also leave early if I start early. There is no real dress code either. I can wear jeans and a t-shirt if I want. I can even wear shorts in the summer. Wow.
[01:09s–01:11s] SPEAKER_00: It sounds really cool.
[01:12s–01:15s] SPEAKER_00: I can't stand wearing a suit every day.
[01:16s–01:17s] SPEAKER_00: Which do you prefer?
[01:18s–01:20s] SPEAKER_00: Working 

In [None]:
# Print the same transcript grouped by speaker, without timestamps.
# NOTE(review): the output recorded below shows a different conversation than the
# transcript loaded earlier in this section — it looks stale; re-run to confirm.
format_conversation_2(diaglogue)


SPEAKER_00:
So, what's new, Mark? How is your new job going?

SPEAKER_01:
To be honest, I can't complain. I really love the company that I am working for.
My co-workers are all really friendly and helpful. They really help me feel welcome.
It's a really energetic and fun atmosphere.
My boss is hilarious.
And he's really flexible.
Really? How so?
He allows me to come in when I want and make my own hours.
I can also leave early if I start early. There is no real dress code either. I can wear jeans and a t-shirt if I want. I can even wear shorts in the summer. Wow.

SPEAKER_00:
It sounds really cool.
I can't stand wearing a suit every day.
Which do you prefer?
Working late or finishing early?

SPEAKER_01:
I prefer finishing early. I really enjoy the morning. I love getting up early and going for a run.
There's nothing like watching the sunrise while drinking my morning coffee.

SPEAKER_00:
Really? I am opposite.
I love sleeping in.
I am most alert in the evenings.
I'm a real night owl.

