# Benchmark between `MSDD + Titanet` and `MSDD + WavLMBasePlus`

#### *The benchmark was run on 10 randomly selected recordings from the AMI Corpus (Array1-01 channel), using the first 60 seconds of each recording (10 minutes of audio in total).*

## Download


In [None]:
# Download the two OpenSLR (resource 16) archives used by this notebook into .data/:
# the AMI manual annotation archive and the Array1-01 audio archive.
# Both are unpacked later by the "Extract TAR" cell.
!mkdir -p .data
!wget -O .data/ami_manual_1.6.1.tar.gz https://us.openslr.org/resources/16/ami_manual_1.6.1.tar.gz
!wget -O .data/Array1-01.tar.gz https://www.openslr.org/resources/16/Array1-01.tar.gz

## Install Libraries & Packages

In [None]:
%%capture
!pip install pydub
!pip install wavlmmsdd
!pip install nemo_toolkit['asr']
!sudo apt update && sudo apt install ffmpeg -y

## Imports

In [12]:
import os
import json
import glob
import torch
import random
import tarfile
from pydub import AudioSegment
from omegaconf import OmegaConf
import xml.etree.ElementTree as ET
from wavlmmsdd.audio.feature.embedding import WavLMSV
from wavlmmsdd.audio.diarization.diarize import Diarizer
from nemo.collections.asr.models.msdd_models import NeuralDiarizer

## Paths


In [13]:
# Every data artefact of this notebook lives under `out_dir`; the remaining
# paths are derived from it so the directory layout is defined in one place.
# NOTE(review): `root_dir` is not referenced by any other cell visible in this notebook.
root_dir = "."
out_dir = ".data"

# Diarization inference config, addressed relative to the notebook's directory.
config_path = "../src/wavlmmsdd/audio/config/diar_infer_telephonic.yaml"
manifest_path = f"{out_dir}/manifest.json"

# Extraction targets and the downloaded archives they are unpacked from.
ami_manual_extract_to = f"{out_dir}/ami_manual_1.6.1"
ami_array1_extract_to = f"{out_dir}/Array1-01"
tar_path_ami_manual = f"{ami_manual_extract_to}.tar.gz"
tar_path_ami_array1 = f"{ami_array1_extract_to}.tar.gz"

# Inputs of the XML -> RTTM conversion: segment XML files and the WAV tree.
segments_dir = f"{ami_manual_extract_to}/segments"
wav_root_dir = ami_array1_extract_to

## Configuration


In [14]:
# Load the diarization config from `config_path`, then point it at the manifest
# written by this notebook and at the shared output directory.
cfg = OmegaConf.load(config_path)
cfg.diarizer.manifest_filepath = manifest_path
cfg.diarizer.out_dir = out_dir
# Prefer CUDA when torch reports it as available, otherwise fall back to CPU.
# NOTE(review): `device` is not passed to anything in the cells visible here - confirm it is needed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Methods

In [15]:
def extract(archive_path, extract_to):
    """Unpack a gzip-compressed tar archive into a directory.

    The archives handled by this notebook are downloaded from the network, so
    members are never allowed to land outside ``extract_to``.

    Args:
        archive_path: Path of the ``.tar.gz`` archive to unpack.
        extract_to: Directory that receives the archive contents.

    Raises:
        tarfile.TarError: If the archive cannot be read, or if a member would
            be written outside ``extract_to``.
        OSError: If the archive cannot be opened or files cannot be written.
    """
    with tarfile.open(archive_path, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # Interpreters with extraction filters: the "data" filter rejects
            # absolute paths, "../" traversal, escaping links and device files.
            tar.extractall(path=extract_to, filter="data")
        else:
            # Older interpreters: validate every member path before extracting.
            destination = os.path.realpath(extract_to)
            for member in tar.getmembers():
                target = os.path.realpath(os.path.join(destination, member.name))
                if os.path.commonpath([destination, target]) != destination:
                    raise tarfile.TarError(
                        "Refusing to extract '{}' outside '{}'.".format(member.name, extract_to)
                    )
            tar.extractall(path=extract_to)
    print("The archive '{}' has been successfully extracted to '{}'.".format(archive_path, extract_to))

def convert_segments_xml_to_rttm(segments_directory, wav_root_directory):
    """Turn per-speaker ``*.segments.xml`` files into one RTTM file per meeting.

    File names are split on "."; the first part is used as the meeting id and
    the second as the speaker label. For every ``segment`` element directly
    under the XML root, ``transcriber_start`` / ``transcriber_end`` are read
    as seconds; segments with unparsable times or a non-positive duration are
    dropped. Each meeting's RTTM is written next to the first WAV file found
    under ``wav_root_directory`` whose name starts with the meeting id.

    Args:
        segments_directory: Directory containing the ``*.segments.xml`` files.
        wav_root_directory: Root directory searched recursively for WAV files.
    """
    xml_suffix = ".segments.xml"
    per_meeting = {}

    # Pass 1: gather (start, duration, speaker) tuples for each meeting.
    for xml_path in glob.glob(os.path.join(segments_directory, "*" + xml_suffix)):
        name_parts = os.path.basename(xml_path).replace(xml_suffix, "").split(".")
        if len(name_parts) < 2:
            continue
        meeting, speaker = name_parts[:2]

        for node in ET.parse(xml_path).getroot().findall("segment"):
            try:
                begin = float(node.get("transcriber_start", "0.0"))
                finish = float(node.get("transcriber_end", "0.0"))
            except ValueError:
                continue
            length = finish - begin
            if length <= 0:
                continue
            per_meeting.setdefault(meeting, []).append((begin, length, speaker))

    # Pass 2: write one time-ordered RTTM file per meeting beside its audio.
    for meeting, entries in per_meeting.items():
        matches = glob.glob(
            os.path.join(wav_root_directory, '**', f'{meeting}*.wav'),
            recursive=True,
        )
        if not matches:
            print(f"Warning: No WAV file found for {meeting}. RTTM file could not be created.")
            continue

        rttm_path = os.path.join(os.path.dirname(matches[0]), f"{meeting}.rttm")
        ordered = sorted(entries, key=lambda item: item[0])

        with open(rttm_path, "w", encoding="utf-8") as handle:
            handle.writelines(
                f"SPEAKER {meeting} 1 {begin:.3f} {length:.3f} <NA> <NA> {speaker} <NA> <NA>\n"
                for begin, length, speaker in ordered
            )
        print(f"Written: {rttm_path} ({len(ordered)} segments)")

def convert_wavs_to_mono(base_dir):
    """Down-mix every multi-channel WAV file below ``base_dir`` to mono, in place.

    Files are matched by a case-insensitive ``.wav`` suffix. Files that already
    have a single channel are only reported. Any failure while reading or
    writing one file is printed and the walk continues with the next file.

    Args:
        base_dir: Root directory that is walked recursively.
    """
    for folder, _subdirs, names in os.walk(base_dir):
        for name in names:
            if not name.lower().endswith(".wav"):
                continue
            wav_path = os.path.join(folder, name)
            try:
                clip = AudioSegment.from_wav(wav_path)
                if clip.channels <= 1:
                    print(f"Already mono: {wav_path}")
                else:
                    print(f"Converting to mono: {wav_path}")
                    # The converted audio overwrites the source file.
                    clip.set_channels(1).export(wav_path, format="wav")
            except Exception as err:
                # Best effort: report the failing file and keep going.
                print(f"Error occurred: {wav_path} - {err}")

def create_manifest_with_random_n_sample(n_samples=10, duration=60.0, seed=None,
                                         wav_root_directory=None, manifest_file=None):
    """Write a JSON-lines manifest for a random selection of WAV files.

    Each manifest line describes one WAV file with ``offset`` 0.0, the given
    ``duration`` and the RTTM path ``<meeting_id>.rttm`` in the WAV file's
    directory, where ``meeting_id`` is the file name up to the first ".".

    Args:
        n_samples: Number of WAV files to select (default 10).
        duration: Value written to each entry's ``duration`` field (default 60.0).
        seed: Optional seed for a private random generator, so the selection
            can be repeated. With ``None`` the module-level ``random`` state
            is used.
        wav_root_directory: Directory searched recursively for ``*.wav``
            files. Defaults to the notebook-level ``wav_root_dir``.
        manifest_file: Output path of the manifest. Defaults to the
            notebook-level ``manifest_path``.

    Raises:
        ValueError: If fewer than ``n_samples`` WAV files are found.
    """
    if wav_root_directory is None:
        wav_root_directory = wav_root_dir
    if manifest_file is None:
        manifest_file = manifest_path

    # glob returns files in filesystem order; sort so that a fixed seed
    # selects the same files on every machine.
    all_wav_files = sorted(
        glob.glob(os.path.join(wav_root_directory, "**", "*.wav"), recursive=True)
    )

    if len(all_wav_files) < n_samples:
        raise ValueError(f"Found only {len(all_wav_files)} .wav files, need at least {n_samples}.")

    sampler = random.Random(seed) if seed is not None else random
    selected_wavs = sampler.sample(all_wav_files, n_samples)

    with open(manifest_file, "w", encoding="utf-8") as f:
        for wav_file in selected_wavs:
            meeting_id = os.path.basename(wav_file).split(".")[0]
            rttm_file = os.path.join(os.path.dirname(wav_file), f"{meeting_id}.rttm")
            if not os.path.isfile(rttm_file):
                # The entry is still written; the missing reference is only reported.
                print(f"Warning: reference RTTM not found for {wav_file}: {rttm_file}")

            entry = {
                "audio_filepath": wav_file,
                "offset": 0.0,
                "duration": duration,
                "rttm_filepath": rttm_file
            }
            f.write(json.dumps(entry) + "\n")

    print(f"Manifest created with {n_samples} entries: {manifest_file}")

## Extract TAR

In [None]:
# Unpack both downloaded archives: annotations first, then the audio.
for archive_file, target_dir in (
    (tar_path_ami_manual, ami_manual_extract_to),
    (tar_path_ami_array1, ami_array1_extract_to),
):
    extract(archive_file, target_dir)

## Convert XML to RTTM


In [16]:
# Build one reference RTTM per meeting from the segment XML files; each RTTM is
# written into the directory of the meeting's WAV file under `wav_root_dir`.
convert_segments_xml_to_rttm(segments_dir, wav_root_dir)

Written: .data/Array1-01/TS3011b/audio/TS3011b.rttm (544 segments)
Written: .data/Array1-01/IS1008a/audio/IS1008a.rttm (176 segments)
Written: .data/Array1-01/IS1007b/audio/IS1007b.rttm (372 segments)
Written: .data/Array1-01/IS1007c/audio/IS1007c.rttm (465 segments)
Written: .data/Array1-01/IS1005c/audio/IS1005c.rttm (406 segments)
Written: .data/Array1-01/TS3007a/audio/TS3007a.rttm (472 segments)
Written: .data/Array1-01/ES2016b/audio/ES2016b.rttm (392 segments)
Written: .data/Array1-01/TS3007c/audio/TS3007c.rttm (699 segments)
Written: .data/Array1-01/ES2009d/audio/ES2009d.rttm (694 segments)
Written: .data/Array1-01/IN1001/audio/IN1001.rttm (775 segments)
Written: .data/Array1-01/EN2009c/audio/EN2009c.rttm (618 segments)
Written: .data/Array1-01/ES2008a/audio/ES2008a.rttm (194 segments)
Written: .data/Array1-01/IS1002c/audio/IS1002c.rttm (502 segments)
Written: .data/Array1-01/TS3006d/audio/TS3006d.rttm (1263 segments)
Written: .data/Array1-01/IS1009d/audio/IS1009d.rttm (537 segmen

## Convert to Mono

In [18]:
# Down-mix any multi-channel WAV under `wav_root_dir` to mono; files are overwritten in place.
convert_wavs_to_mono(wav_root_dir)

Already mono: .data/Array1-01/ES2002b/audio/ES2002b.Array1-01.wav
Already mono: .data/Array1-01/ES2005d/audio/ES2005d.Array1-01.wav
Already mono: .data/Array1-01/IS1000a/audio/IS1000a.Array1-01.wav
Already mono: .data/Array1-01/TS3010c/audio/TS3010c.Array1-01.wav
Already mono: .data/Array1-01/ES2011a/audio/ES2011a.Array1-01.wav
Already mono: .data/Array1-01/ES2006d/audio/ES2006d.Array1-01.wav
Already mono: .data/Array1-01/IS1001b/audio/IS1001b.Array1-01.wav
Already mono: .data/Array1-01/ES2011d/audio/ES2011d.Array1-01.wav
Already mono: .data/Array1-01/ES2010c/audio/ES2010c.Array1-01.wav
Already mono: .data/Array1-01/TS3003c/audio/TS3003c.Array1-01.wav
Already mono: .data/Array1-01/IB4005/audio/IB4005.Array1-01.wav
Already mono: .data/Array1-01/ES2010d/audio/ES2010d.Array1-01.wav
Already mono: .data/Array1-01/TS3005b/audio/TS3005b.Array1-01.wav
Already mono: .data/Array1-01/ES2015d/audio/ES2015d.Array1-01.wav
Already mono: .data/Array1-01/TS3004d/audio/TS3004d.Array1-01.wav
Already mono

## Select 10 Random Samples (60 Seconds Each)

In [19]:
# Seed Python's global random generator before sampling so that re-running the
# notebook draws the same selection from the same file listing.
# NOTE(review): the selection can still vary if the file listing is returned in a different order.
SEED = 42
random.seed(SEED)
create_manifest_with_random_n_sample()

Manifest created with 10 entries: .data/manifest.json


## Diarization `MSDD + Titanet`

In [20]:
# Baseline run: NeMo's NeuralDiarizer driven by the configuration loaded into `cfg`.
diar_model = NeuralDiarizer(cfg=cfg)
# `diarize()` is the last expression of the cell, so its return value is
# rendered as the cell output below the logs.
diar_model.diarize()

[NeMo I 2025-02-14 19:36:33 nemo_logging:393] Loading pretrained diar_msdd_telephonic model from NGC
[NeMo I 2025-02-14 19:36:33 nemo_logging:393] Found existing object /home/bunyamin/.cache/torch/NeMo/NeMo_2.1.0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2025-02-14 19:36:33 nemo_logging:393] Re-using file from: /home/bunyamin/.cache/torch/NeMo/NeMo_2.1.0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo
[NeMo I 2025-02-14 19:36:33 nemo_logging:393] Instantiating model from pre-trained checkpoint


[NeMo W 2025-02-14 19:36:34 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: true
    
[NeMo W 2025-02-14 19:36:34 nemo_logging:405] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: false
    
[NeMo W 2025-02-14 19:36:34 nemo_logging:405] Please call the ModelPT.setup_test_data() or ModelPT

[NeMo I 2025-02-14 19:36:34 nemo_logging:393] PADDING: 16
[NeMo I 2025-02-14 19:36:34 nemo_logging:393] PADDING: 16
[NeMo I 2025-02-14 19:36:34 nemo_logging:393] Model EncDecDiarLabelModel was successfully restored from /home/bunyamin/.cache/torch/NeMo/NeMo_2.1.0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2025-02-14 19:36:34 nemo_logging:393] PADDING: 16
[NeMo I 2025-02-14 19:36:35 nemo_logging:393] Loading pretrained vad_multilingual_marblenet model from NGC
[NeMo I 2025-02-14 19:36:35 nemo_logging:393] Found existing object /home/bunyamin/.cache/torch/NeMo/NeMo_2.1.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2025-02-14 19:36:35 nemo_logging:393] Re-using file from: /home/bunyamin/.cache/torch/NeMo/NeMo_2.1.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo
[NeMo I 2025-02-14 19:36:35 nemo_logging:393] Instantiating model from pre-trained 

[NeMo W 2025-02-14 19:36:35 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_sample.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 256
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy

[NeMo I 2025-02-14 19:36:35 nemo_logging:393] PADDING: 16
[NeMo I 2025-02-14 19:36:35 nemo_logging:393] Model EncDecClassificationModel was successfully restored from /home/bunyamin/.cache/torch/NeMo/NeMo_2.1.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2025-02-14 19:36:35 nemo_logging:393] Multiscale Weights: [1, 1, 1, 1, 1]
[NeMo I 2025-02-14 19:36:35 nemo_logging:393] Clustering Parameters: {
        "oracle_num_speakers": false,
        "max_num_speakers": 8,
        "enhanced_count_thres": 80,
        "max_rp_threshold": 0.25,
        "sparse_search_volume": 30,
        "maj_vote_spk_count": false,
        "chunk_cluster_count": 50,
        "embeddings_per_chunk": 10000
    }


[NeMo W 2025-02-14 19:36:35 nemo_logging:405] Deleting previous clustering diarizer outputs.


[NeMo I 2025-02-14 19:36:35 nemo_logging:393] Number of files to diarize: 10
[NeMo I 2025-02-14 19:36:35 nemo_logging:393] Split long audio file to avoid CUDA memory issue


splitting manifest: 100%|██████████| 10/10 [00:00<00:00, 261.81it/s]

[NeMo I 2025-02-14 19:36:35 nemo_logging:393] The prepared manifest file exists. Overwriting!
[NeMo I 2025-02-14 19:36:35 nemo_logging:393] Perform streaming frame-level VAD
[NeMo I 2025-02-14 19:36:35 nemo_logging:393] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-02-14 19:36:35 nemo_logging:393] Dataset successfully loaded with 20 items and total duration provided from manifest is  0.17 hours.
[NeMo I 2025-02-14 19:36:35 nemo_logging:393] # 20 files loaded accounting to # 1 labels



vad: 100%|██████████| 20/20 [00:01<00:00, 17.76it/s]

[NeMo I 2025-02-14 19:36:36 nemo_logging:393] Generating predictions with overlapping input segments



                                                                 

[NeMo I 2025-02-14 19:36:38 nemo_logging:393] Converting frame level prediction to speech/no-speech segment in start and end times format.


creating speech segments: 100%|██████████| 10/10 [00:00<00:00, 50.80it/s]


[NeMo I 2025-02-14 19:36:38 nemo_logging:393] Subsegmentation for embedding extraction: scale0, .data/speaker_outputs/subsegments_scale0.json
[NeMo I 2025-02-14 19:36:38 nemo_logging:393] Extracting embeddings for Diarization
[NeMo I 2025-02-14 19:36:38 nemo_logging:393] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-02-14 19:36:38 nemo_logging:393] Dataset successfully loaded with 286 items and total duration provided from manifest is  0.07 hours.
[NeMo I 2025-02-14 19:36:38 nemo_logging:393] # 286 files loaded accounting to # 1 labels


[1/5] extract embeddings: 100%|██████████| 5/5 [02:10<00:00, 26.00s/it]

[NeMo I 2025-02-14 19:38:48 nemo_logging:393] Saved embedding files to .data/speaker_outputs/embeddings
[NeMo I 2025-02-14 19:38:48 nemo_logging:393] Subsegmentation for embedding extraction: scale1, .data/speaker_outputs/subsegments_scale1.json
[NeMo I 2025-02-14 19:38:48 nemo_logging:393] Extracting embeddings for Diarization
[NeMo I 2025-02-14 19:38:48 nemo_logging:393] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-02-14 19:38:48 nemo_logging:393] Dataset successfully loaded with 318 items and total duration provided from manifest is  0.07 hours.
[NeMo I 2025-02-14 19:38:48 nemo_logging:393] # 318 files loaded accounting to # 1 labels



[2/5] extract embeddings: 100%|██████████| 5/5 [02:23<00:00, 28.76s/it]

[NeMo I 2025-02-14 19:41:12 nemo_logging:393] Saved embedding files to .data/speaker_outputs/embeddings
[NeMo I 2025-02-14 19:41:12 nemo_logging:393] Subsegmentation for embedding extraction: scale2, .data/speaker_outputs/subsegments_scale2.json
[NeMo I 2025-02-14 19:41:12 nemo_logging:393] Extracting embeddings for Diarization
[NeMo I 2025-02-14 19:41:12 nemo_logging:393] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-02-14 19:41:12 nemo_logging:393] Dataset successfully loaded with 377 items and total duration provided from manifest is  0.08 hours.
[NeMo I 2025-02-14 19:41:12 nemo_logging:393] # 377 files loaded accounting to # 1 labels



[3/5] extract embeddings: 100%|██████████| 6/6 [01:55<00:00, 19.20s/it]

[NeMo I 2025-02-14 19:43:07 nemo_logging:393] Saved embedding files to .data/speaker_outputs/embeddings
[NeMo I 2025-02-14 19:43:07 nemo_logging:393] Subsegmentation for embedding extraction: scale3, .data/speaker_outputs/subsegments_scale3.json
[NeMo I 2025-02-14 19:43:07 nemo_logging:393] Extracting embeddings for Diarization
[NeMo I 2025-02-14 19:43:07 nemo_logging:393] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-02-14 19:43:07 nemo_logging:393] Dataset successfully loaded with 479 items and total duration provided from manifest is  0.08 hours.
[NeMo I 2025-02-14 19:43:07 nemo_logging:393] # 479 files loaded accounting to # 1 labels



[4/5] extract embeddings: 100%|██████████| 8/8 [01:47<00:00, 13.39s/it]

[NeMo I 2025-02-14 19:44:54 nemo_logging:393] Saved embedding files to .data/speaker_outputs/embeddings
[NeMo I 2025-02-14 19:44:54 nemo_logging:393] Subsegmentation for embedding extraction: scale4, .data/speaker_outputs/subsegments_scale4.json
[NeMo I 2025-02-14 19:44:54 nemo_logging:393] Extracting embeddings for Diarization
[NeMo I 2025-02-14 19:44:54 nemo_logging:393] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-02-14 19:44:54 nemo_logging:393] Dataset successfully loaded with 717 items and total duration provided from manifest is  0.09 hours.
[NeMo I 2025-02-14 19:44:54 nemo_logging:393] # 717 files loaded accounting to # 1 labels



[5/5] extract embeddings: 100%|██████████| 12/12 [02:14<00:00, 11.21s/it]

[NeMo I 2025-02-14 19:47:09 nemo_logging:393] Saved embedding files to .data/speaker_outputs/embeddings



[NeMo W 2025-02-14 19:47:09 nemo_logging:405] cuda=False, using CPU for eigen decomposition. This might slow down the clustering process.
clustering: 100%|██████████| 10/10 [00:02<00:00,  3.96it/s]

[NeMo I 2025-02-14 19:47:12 nemo_logging:393] Outputs are saved in /home/bunyamin/Desktop/BE_PC/Work/2.Project/7.WavLMMSDD/Dev/WavLMMSDD/notebook/.data directory
[NeMo I 2025-02-14 19:47:12 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1



    


[NeMo I 2025-02-14 19:47:12 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:12 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:12 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:12 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:12 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:12 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 5, Hyp: 1
[NeMo I 2025-02-14 19:47:13 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:13 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:13 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:13 nemo_logging:393] 

100%|██████████| 1/1 [00:00<00:00,  6.41it/s]

[NeMo I 2025-02-14 19:47:14 nemo_logging:393]      [Threshold: 0.7000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2025-02-14 19:47:14 nemo_logging:393] Number of files to diarize: 10
[NeMo I 2025-02-14 19:47:14 nemo_logging:393] Number of files to diarize: 10





[NeMo I 2025-02-14 19:47:14 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1


    


[NeMo I 2025-02-14 19:47:14 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:14 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:14 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:14 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:15 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:15 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 5, Hyp: 1
[NeMo I 2025-02-14 19:47:15 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:15 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:16 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:16 nemo_logging:393] 

    


[NeMo I 2025-02-14 19:47:16 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:17 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:17 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:17 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:17 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:17 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 5, Hyp: 1
[NeMo I 2025-02-14 19:47:18 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:18 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:18 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:18 nemo_logging:393] 

    


[NeMo I 2025-02-14 19:47:19 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:19 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:19 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:19 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:19 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:19 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 5, Hyp: 1
[NeMo I 2025-02-14 19:47:19 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:20 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:20 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:20 nemo_logging:393] 

[[(<pyannote.metrics.diarization.DiarizationErrorRate at 0x7e2671505d90>,
   {'TS3009c.Array1-01': {'speaker_0': 'A'},
    'IS1000b.Array1-01': {},
    'IS1004a.Array1-01': {'speaker_0': 'A'},
    'IS1008a.Array1-01': {'speaker_0': 'A'},
    'ES2005c.Array1-01': {'speaker_0': 'C'},
    'TS3010d.Array1-01': {'speaker_0': 'A'},
    'EN2001e.Array1-01': {'speaker_0': 'B'},
    'TS3004d.Array1-01': {'speaker_0': 'A'},
    'ES2009c.Array1-01': {'speaker_0': 'A'},
    'IS1002c.Array1-01': {'speaker_0': 'D'}},
   (0.9928440937670906,
    0.0012406631840040318,
    0.0023107155245803844,
    0.9892927150585061)),
  (<pyannote.metrics.diarization.DiarizationErrorRate at 0x7e27d3ef62a0>,
   {'TS3009c.Array1-01': {'speaker_0': 'A'},
    'IS1000b.Array1-01': {},
    'IS1004a.Array1-01': {'speaker_0': 'A'},
    'IS1008a.Array1-01': {'speaker_0': 'A'},
    'ES2005c.Array1-01': {'speaker_0': 'C'},
    'TS3010d.Array1-01': {'speaker_0': 'A'},
    'EN2001e.Array1-01': {'speaker_0': 'B'},
    'TS3004d.A

## Diarization `MSDD + WavLMBasePlus`


In [21]:
# Comparison run: the wavlmmsdd Diarizer is given a WavLMSV embedding object
# and the same manifest file that the baseline configuration points to.
embedder = WavLMSV()

diarizer = Diarizer(embedding=embedder, manifest_path=manifest_path)
diarizer.run()

[INFO] Loading XVector model: microsoft/wavlm-base-plus-sv on device: cuda
[INFO] XVector dimension: 512
[NeMo I 2025-02-14 19:47:27 nemo_logging:393] Loading pretrained vad_multilingual_marblenet model from NGC
[NeMo I 2025-02-14 19:47:27 nemo_logging:393] Found existing object /home/bunyamin/.cache/torch/NeMo/NeMo_2.1.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2025-02-14 19:47:27 nemo_logging:393] Re-using file from: /home/bunyamin/.cache/torch/NeMo/NeMo_2.1.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo
[NeMo I 2025-02-14 19:47:27 nemo_logging:393] Instantiating model from pre-trained checkpoint


[NeMo W 2025-02-14 19:47:27 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_sample.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 256
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy

[NeMo I 2025-02-14 19:47:27 nemo_logging:393] PADDING: 16
[NeMo I 2025-02-14 19:47:27 nemo_logging:393] Model EncDecClassificationModel was successfully restored from /home/bunyamin/.cache/torch/NeMo/NeMo_2.1.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.


[NeMo W 2025-02-14 19:47:27 nemo_logging:405] requested None model name not available in pretrained models, instead


[NeMo I 2025-02-14 19:47:27 nemo_logging:393] Loading pretrained ecapa_tdnn model from NGC
[NeMo I 2025-02-14 19:47:27 nemo_logging:393] Found existing object /home/bunyamin/.cache/torch/NeMo/NeMo_2.1.0/ecapa_tdnn/3e0c5c4731b176aeb70c29a74d800c81/ecapa_tdnn.nemo.
[NeMo I 2025-02-14 19:47:27 nemo_logging:393] Re-using file from: /home/bunyamin/.cache/torch/NeMo/NeMo_2.1.0/ecapa_tdnn/3e0c5c4731b176aeb70c29a74d800c81/ecapa_tdnn.nemo
[NeMo I 2025-02-14 19:47:27 nemo_logging:393] Instantiating model from pre-trained checkpoint


[NeMo W 2025-02-14 19:47:28 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json
    sample_rate: 16000
    labels: null
    batch_size: 64
    shuffle: true
    time_length: 3
    augmentor:
      noise:
        manifest_path: /manifests/noise/rir_noise_manifest.json
        prob: 0.5
        min_snr_db: 0
        max_snr_db: 15
      speed:
        prob: 0.5
        sr: 16000
        resample_type: kaiser_fast
        min_speed_rate: 0.95
        max_speed_rate: 1.05
    num_workers: 15
    pin_memory: true
    
[NeMo W 2025-02-14 19:47:28 nemo_logging:405] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validati

[NeMo I 2025-02-14 19:47:28 nemo_logging:393] PADDING: 16
[NeMo I 2025-02-14 19:47:28 nemo_logging:393] Model EncDecSpeakerLabelModel was successfully restored from /home/bunyamin/.cache/torch/NeMo/NeMo_2.1.0/ecapa_tdnn/3e0c5c4731b176aeb70c29a74d800c81/ecapa_tdnn.nemo.
[NeMo I 2025-02-14 19:47:28 nemo_logging:393] Number of files to diarize: 10
[NeMo I 2025-02-14 19:47:28 nemo_logging:393] Split long audio file to avoid CUDA memory issue


splitting manifest: 100%|██████████| 10/10 [00:00<00:00, 407.63it/s]

[NeMo I 2025-02-14 19:47:28 nemo_logging:393] Perform streaming frame-level VAD
[NeMo I 2025-02-14 19:47:28 nemo_logging:393] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-02-14 19:47:28 nemo_logging:393] Dataset successfully loaded with 20 items and total duration provided from manifest is  0.17 hours.
[NeMo I 2025-02-14 19:47:28 nemo_logging:393] # 20 files loaded accounting to # 1 labels



vad: 100%|██████████| 20/20 [00:01<00:00, 16.60it/s]

[NeMo I 2025-02-14 19:47:29 nemo_logging:393] Generating predictions with overlapping input segments



                                                                 

[NeMo I 2025-02-14 19:47:32 nemo_logging:393] Converting frame level prediction to speech/no-speech segment in start and end times format.


creating speech segments: 100%|██████████| 10/10 [00:00<00:00, 46.85it/s]

[NeMo I 2025-02-14 19:47:32 nemo_logging:393] Subsegmentation for embedding extraction: scale0, .temp/speaker_outputs/subsegments_scale0.json
[NeMo I 2025-02-14 19:47:32 nemo_logging:393] Extracting embeddings for Diarization
[NeMo I 2025-02-14 19:47:32 nemo_logging:393] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-02-14 19:47:32 nemo_logging:393] Dataset successfully loaded with 286 items and total duration provided from manifest is  0.07 hours.
[NeMo I 2025-02-14 19:47:32 nemo_logging:393] # 286 files loaded accounting to # 1 labels



[1/5] extract embeddings: 100%|██████████| 5/5 [00:00<00:00,  9.17it/s]

[NeMo I 2025-02-14 19:47:33 nemo_logging:393] Saved embedding files to .temp/speaker_outputs/embeddings
[NeMo I 2025-02-14 19:47:33 nemo_logging:393] Subsegmentation for embedding extraction: scale1, .temp/speaker_outputs/subsegments_scale1.json
[NeMo I 2025-02-14 19:47:33 nemo_logging:393] Extracting embeddings for Diarization
[NeMo I 2025-02-14 19:47:33 nemo_logging:393] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-02-14 19:47:33 nemo_logging:393] Dataset successfully loaded with 318 items and total duration provided from manifest is  0.07 hours.
[NeMo I 2025-02-14 19:47:33 nemo_logging:393] # 318 files loaded accounting to # 1 labels



[2/5] extract embeddings: 100%|██████████| 5/5 [00:00<00:00, 11.96it/s]

[NeMo I 2025-02-14 19:47:33 nemo_logging:393] Saved embedding files to .temp/speaker_outputs/embeddings
[NeMo I 2025-02-14 19:47:33 nemo_logging:393] Subsegmentation for embedding extraction: scale2, .temp/speaker_outputs/subsegments_scale2.json
[NeMo I 2025-02-14 19:47:33 nemo_logging:393] Extracting embeddings for Diarization
[NeMo I 2025-02-14 19:47:33 nemo_logging:393] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-02-14 19:47:33 nemo_logging:393] Dataset successfully loaded with 377 items and total duration provided from manifest is  0.08 hours.
[NeMo I 2025-02-14 19:47:33 nemo_logging:393] # 377 files loaded accounting to # 1 labels



[3/5] extract embeddings: 100%|██████████| 6/6 [00:00<00:00, 12.68it/s]

[NeMo I 2025-02-14 19:47:34 nemo_logging:393] Saved embedding files to .temp/speaker_outputs/embeddings
[NeMo I 2025-02-14 19:47:34 nemo_logging:393] Subsegmentation for embedding extraction: scale3, .temp/speaker_outputs/subsegments_scale3.json
[NeMo I 2025-02-14 19:47:34 nemo_logging:393] Extracting embeddings for Diarization
[NeMo I 2025-02-14 19:47:34 nemo_logging:393] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-02-14 19:47:34 nemo_logging:393] Dataset successfully loaded with 479 items and total duration provided from manifest is  0.08 hours.
[NeMo I 2025-02-14 19:47:34 nemo_logging:393] # 479 files loaded accounting to # 1 labels



[4/5] extract embeddings: 100%|██████████| 8/8 [00:00<00:00, 16.47it/s]

[NeMo I 2025-02-14 19:47:34 nemo_logging:393] Saved embedding files to .temp/speaker_outputs/embeddings
[NeMo I 2025-02-14 19:47:34 nemo_logging:393] Subsegmentation for embedding extraction: scale4, .temp/speaker_outputs/subsegments_scale4.json
[NeMo I 2025-02-14 19:47:34 nemo_logging:393] Extracting embeddings for Diarization
[NeMo I 2025-02-14 19:47:34 nemo_logging:393] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-02-14 19:47:34 nemo_logging:393] Dataset successfully loaded with 717 items and total duration provided from manifest is  0.09 hours.
[NeMo I 2025-02-14 19:47:34 nemo_logging:393] # 717 files loaded accounting to # 1 labels



[5/5] extract embeddings: 100%|██████████| 12/12 [00:00<00:00, 21.43it/s]


[NeMo I 2025-02-14 19:47:35 nemo_logging:393] Saved embedding files to .temp/speaker_outputs/embeddings


clustering: 100%|██████████| 10/10 [00:01<00:00,  6.69it/s]

[NeMo I 2025-02-14 19:47:36 nemo_logging:393] Outputs are saved in /home/bunyamin/Desktop/BE_PC/Work/2.Project/7.WavLMMSDD/Dev/WavLMMSDD/notebook/.temp directory
[NeMo I 2025-02-14 19:47:36 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1



    


[NeMo I 2025-02-14 19:47:37 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:37 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:37 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:37 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:37 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 2
[NeMo I 2025-02-14 19:47:37 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 5, Hyp: 1
[NeMo I 2025-02-14 19:47:38 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:38 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 1
[NeMo I 2025-02-14 19:47:38 nemo_logging:393] Wrong Spk. Count with uniq_id:....Array1-01, Ref: 4, Hyp: 3
[NeMo I 2025-02-14 19:47:38 nemo_logging:393] 