In [3]:
!pip install pyannote.audio

Collecting pyannote.audio
  Downloading pyannote_audio-4.0.4-py3-none-any.whl.metadata (13 kB)
Collecting asteroid-filterbanks>=0.4.0 (from pyannote.audio)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lightning>=2.4 (from pyannote.audio)
  Downloading lightning-2.6.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting opentelemetry-exporter-otlp>=1.34.0 (from pyannote.audio)
  Downloading opentelemetry_exporter_otlp-1.39.1-py3-none-any.whl.metadata (2.4 kB)
Collecting pyannote-core>=6.0.1 (from pyannote.audio)
  Downloading pyannote_core-6.0.1-py3-none-any.whl.metadata (1.9 kB)
Collecting pyannote-database>=6.1.1 (from pyannote.audio)
  Downloading pyannote_database-6.1.1-py3-none-any.whl.metadata (30 kB)
Collecting pyannote-metrics>=4.0.0 (from pyannote.audio)
  Downloading pyannote_metrics-4.0.0-py3-none-any.whl.metadata (2.2 kB)
C

In [8]:
!apt-get update && apt-get install -y libsndfile1 ffmpeg
!pip install Cython packaging
!pip install 'git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[asr]'


0% [Working]            Get:1 https://cli.github.com/packages stable InRelease [3,917 B]
0% [Connecting to archive.ubuntu.com (91.189.91.83)] [Connecting to security.ub                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
0% [Connecting to archive.ubuntu.com (91.189.91.83)] [Connecting to security.ub0% [Connecting to archive.ubuntu.com (91.189.91.83)] [Connecting to security.ub                                                                               Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [85.0 kB]
Get:5 https://cli.github.com/packages stable/main amd64 Packages [357 B]
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,378 kB]
Get:7 http://security.ubuntu.com/ubuntu jammy-security InRel

In [7]:


import os
import csv
import time
import torch
import logging
from pyannote.core import Annotation, Segment
from pyannote.metrics.diarization import DiarizationErrorRate

# --- 1. IMPORTATION PROPRE DE NEMO ---
logging.getLogger("nemo_logger").setLevel(logging.ERROR)
from nemo.collections.asr.models import SortformerEncLabelModel

# --- 2. CONFIGURATION ---
AUDIO_FOLDER = "/content/drive/MyDrive/GroundTruth_clean_no_overlap/audioMono"
GT_RTTM_FOLDER = "/content/drive/MyDrive/GroundTruth_clean_no_overlap/rttm"

OUTPUT_DIR = "results_nemo_sortformer"
CSV_OUTPUT_PATH = os.path.join(OUTPUT_DIR, "resultats_nemo_sortformer.csv")

os.makedirs(OUTPUT_DIR, exist_ok=True)

COLLAR = 0.25
SKIP_OVERLAP = False

# --- 3. CHARGEMENT DU MODÈLE ---
print("⏳ Chargement du modèle NVIDIA Sortformer...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🖥️ Appareil de calcul détecté : {device}")

try:
    model = SortformerEncLabelModel.from_pretrained(model_name="nvidia/diar_streaming_sortformer_4spk-v2")
    model.to(device)
    model.eval()
    print("✅ Modèle chargé avec succès.\n")
except Exception as e:
    print(f"❌ Erreur lors du chargement : {e}")
    exit()

# --- 4. FONCTIONS UTILITAIRES ---
def load_rttm_file(file_path):
    """Charge un fichier RTTM et le convertit en objet Pyannote."""
    annotation = Annotation()
    if not os.path.exists(file_path):
        return annotation
    with open(file_path, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) >= 8 and parts[0] == 'SPEAKER':
                start = float(parts[3])
                duration = float(parts[4])
                label = parts[7]
                annotation[Segment(start, start + duration)] = label
    return annotation

def parse_sortformer_output(nemo_output):
    """Convertit la sortie texte (RTTM) de Sortformer v2 en objet Pyannote."""
    annotation = Annotation()

    # NeMo renvoie souvent un tuple ou une liste pour le batch. On extrait le texte.
    if isinstance(nemo_output, tuple):
        nemo_output = nemo_output[0]
    if isinstance(nemo_output, list) and len(nemo_output) > 0:
        nemo_output = nemo_output[0]

    # Si la sortie est bien le texte brut format RTTM
    if isinstance(nemo_output, str):
        lines = nemo_output.strip().split('\n')
        for line in lines:
            if not line.strip():
                continue
            parts = line.strip().split()
            # On vérifie qu'il s'agit bien d'une ligne RTTM valide
            if len(parts) >= 8 and parts[0] == 'SPEAKER':
                start = float(parts[3])
                duration = float(parts[4])
                speaker = parts[7]
                annotation[Segment(start, start + duration)] = speaker

    return annotation

# --- 5. INFÉRENCE ET ÉVALUATION ---
audio_files = sorted([f for f in os.listdir(AUDIO_FOLDER) if f.endswith('.wav')])

if not audio_files:
    print("❌ Aucun fichier audio (.wav) trouvé !")
    exit()

print(f"🚀 Lancement de la diarisation sur {len(audio_files)} fichiers...")
print("-" * 90)
print(f"{'FICHIER':<30} | {'DER (%)':<10} | {'VRAI':<5} | {'PRED':<5} | {'DIFF':<5} | {'TEMPS (s)'}")
print("-" * 90)

der_metric = DiarizationErrorRate(collar=COLLAR, skip_overlap=SKIP_OVERLAP)
results_data = []

for filename in audio_files:
    audio_path = os.path.join(AUDIO_FOLDER, filename)
    file_id = os.path.splitext(filename)[0]
    gt_path = os.path.join(GT_RTTM_FOLDER, file_id + ".rttm")

    if not os.path.exists(gt_path):
        continue

    # 1. Charger la vérité terrain
    reference = load_rttm_file(gt_path)

    # 2. Inférence Sortformer (Avec mesure du temps)
    start_time = time.time()
    try:
        raw_prediction = model.diarize(audio=[audio_path], batch_size=1)

        # --- BLOC DE DÉBOGAGE (À SUPPRIMER PLUS TARD) ---
        if filename == audio_files[0]: # On ne l'affiche que pour le premier fichier pour ne pas polluer
            print("\n" + "="*50)
            print(f"🕵️ DEBUG TYPE : {type(raw_prediction)}")
            print(f"🕵️ DEBUG CONTENU : {raw_prediction}")
            print("="*50 + "\n")
        # ------------------------------------------------

        hypothesis = parse_sortformer_output(raw_prediction)
    except Exception as e:
        print(f"❌ Erreur inférence sur {filename}: {e}")
        continue

    # 3. Métriques
    ref_end = reference.get_timeline().extent().end if reference else 0
    hyp_end = hypothesis.get_timeline().extent().end if hypothesis else 0
    uem_end = max(ref_end, hyp_end)
    uem_end = uem_end if uem_end > 0 else 1.0

    try:
        der_val = der_metric(reference, hypothesis, uem=Segment(0, uem_end)) * 100
    except:
        der_val = 100.0

    true_s = len(reference.labels())
    pred_s = len(hypothesis.labels())
    diff = pred_s - true_s

    results_data.append({
        'Filename': filename,
        'DER': der_val,
        'True_Speakers': true_s,
        'Pred_Speakers': pred_s,
        'Diff': diff,
        'Time_sec': infer_time
    })

    print(f"{filename:<30} | {der_val:>7.2f} % | {true_s:<5} | {pred_s:<5} | {diff:>4d} | {infer_time:>6.2f}s")

# --- 6. EXPORTATION DES RÉSULTATS ---
if results_data:
    print("\n" + "="*50)
    global_der = abs(der_metric) * 100
    print(f"🏆 DER MOYEN GLOBAL : {global_der:.2f} %")

    with open(CSV_OUTPUT_PATH, mode='w', newline='') as csv_file:
        fieldnames = ['Filename', 'DER', 'True_Speakers', 'Pred_Speakers', 'Diff', 'Time_sec']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for data in results_data:
            writer.writerow({
                'Filename': data['Filename'],
                'DER': f"{data['DER']:.4f}",
                'True_Speakers': data['True_Speakers'],
                'Pred_Speakers': data['Pred_Speakers'],
                'Diff': data['Diff'],
                'Time_sec': f"{data['Time_sec']:.2f}"
            })
    print(f"📁 Fichier sauvegardé : {CSV_OUTPUT_PATH}")

⏳ Chargement du modèle NVIDIA Sortformer...
🖥️ Appareil de calcul détecté : cuda


[NeMo W 2026-02-24 13:21:00 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


✅ Modèle chargé avec succès.

🚀 Lancement de la diarisation sur 60 fichiers...
------------------------------------------------------------------------------------------
FICHIER                        | DER (%)    | VRAI  | PRED  | DIFF  | TEMPS (s)
------------------------------------------------------------------------------------------


Diarizing: 0it [00:00, ?it/s]
Streaming Steps:   0%|          | 0/2 [00:00<?, ?it/s][A
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 17.20it/s]
Diarizing: 1it [00:00,  4.57it/s]
[NeMo W 2026-02-24 13:21:00 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec



🕵️ DEBUG TYPE : <class 'list'>
🕵️ DEBUG CONTENU : [['0.320 3.600 speaker_0', '9.600 12.080 speaker_0', '15.040 17.360 speaker_0', '18.000 18.640 speaker_0', '21.360 24.160 speaker_0', '24.800 25.030 speaker_0', '6.080 8.400 speaker_1', '12.720 15.040 speaker_1', '18.880 21.520 speaker_1']]

clean_audio_000.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps:   0%|          | 0/2 [00:00<?, ?it/s][A
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 19.15it/s]
Diarizing: 1it [00:00,  5.29it/s]
[NeMo W 2026-02-24 13:21:00 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


clean_audio_001.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps:   0%|          | 0/2 [00:00<?, ?it/s][A
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 18.91it/s]
Diarizing: 1it [00:00,  5.37it/s]
[NeMo W 2026-02-24 13:21:01 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


clean_audio_002.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps:   0%|          | 0/2 [00:00<?, ?it/s][A
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 16.65it/s]
Diarizing: 1it [00:00,  5.02it/s]
[NeMo W 2026-02-24 13:21:01 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


clean_audio_003.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 21.27it/s]
Diarizing: 1it [00:00,  5.78it/s]


clean_audio_004.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:01 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 21.90it/s]
Diarizing: 1it [00:00,  5.62it/s]
[NeMo W 2026-02-24 13:21:01 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


clean_audio_005.wav            |  100.00 % | 1     | 0     |   -1 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 21.77it/s]
Diarizing: 1it [00:00,  5.66it/s]
[NeMo W 2026-02-24 13:21:02 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


clean_audio_006.wav            |  100.00 % | 1     | 0     |   -1 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps:   0%|          | 0/2 [00:00<?, ?it/s][A
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 18.18it/s]
Diarizing: 1it [00:00,  5.29it/s]
[NeMo W 2026-02-24 13:21:02 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


clean_audio_007.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.46it/s]
Diarizing: 1it [00:00,  6.07it/s]


clean_audio_008.wav            |  100.00 % | 1     | 0     |   -1 |   0.21s


[NeMo W 2026-02-24 13:21:02 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.92it/s]
Diarizing: 1it [00:00,  6.14it/s]


clean_audio_009.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:02 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps:   0%|          | 0/2 [00:00<?, ?it/s][A
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 13.59it/s]
Diarizing: 1it [00:00,  3.86it/s]
[NeMo W 2026-02-24 13:21:02 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


clean_audio_010.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps:   0%|          | 0/2 [00:00<?, ?it/s][A
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00,  7.58it/s]
Diarizing: 1it [00:00,  2.34it/s]
[NeMo W 2026-02-24 13:21:03 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


clean_audio_011.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps:   0%|          | 0/2 [00:00<?, ?it/s][A
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 17.31it/s]
Diarizing: 1it [00:00,  4.55it/s]
[NeMo W 2026-02-24 13:21:03 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


clean_audio_012.wav            |  100.00 % | 1     | 0     |   -1 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps:   0%|          | 0/2 [00:00<?, ?it/s][A
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 18.25it/s]
Diarizing: 1it [00:00,  4.68it/s]
[NeMo W 2026-02-24 13:21:03 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


clean_audio_013.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps:   0%|          | 0/2 [00:00<?, ?it/s][A
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 14.52it/s]
Diarizing: 1it [00:00,  3.97it/s]
[NeMo W 2026-02-24 13:21:04 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


clean_audio_014.wav            |  100.00 % | 1     | 0     |   -1 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps:   0%|          | 0/2 [00:00<?, ?it/s][A
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 13.75it/s]
Diarizing: 1it [00:00,  3.68it/s]
[NeMo W 2026-02-24 13:21:04 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


clean_audio_015.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps:   0%|          | 0/2 [00:00<?, ?it/s][A
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 17.46it/s]
Diarizing: 1it [00:00,  4.54it/s]
[NeMo W 2026-02-24 13:21:04 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


clean_audio_016.wav            |  100.00 % | 1     | 0     |   -1 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps:   0%|          | 0/2 [00:00<?, ?it/s][A
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 17.36it/s]
Diarizing: 1it [00:00,  4.46it/s]
[NeMo W 2026-02-24 13:21:05 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


clean_audio_017.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps:   0%|          | 0/2 [00:00<?, ?it/s][A
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 16.90it/s]
Diarizing: 1it [00:00,  4.23it/s]
[NeMo W 2026-02-24 13:21:05 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


clean_audio_018.wav            |  100.00 % | 1     | 0     |   -1 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps:   0%|          | 0/2 [00:00<?, ?it/s][A
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 16.99it/s]
Diarizing: 1it [00:00,  4.29it/s]
[NeMo W 2026-02-24 13:21:05 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


clean_audio_019.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps:   0%|          | 0/2 [00:00<?, ?it/s][A
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 15.70it/s]
Diarizing: 1it [00:00,  4.04it/s]
[NeMo W 2026-02-24 13:21:06 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


clean_audio_020.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps:   0%|          | 0/2 [00:00<?, ?it/s][A
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 12.37it/s]
Diarizing: 1it [00:00,  3.49it/s]
[NeMo W 2026-02-24 13:21:06 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


clean_audio_021.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps:   0%|          | 0/2 [00:00<?, ?it/s][A
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 17.22it/s]
Diarizing: 1it [00:00,  5.06it/s]
[NeMo W 2026-02-24 13:21:06 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


clean_audio_022.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.83it/s]
Diarizing: 1it [00:00,  6.07it/s]


clean_audio_023.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:06 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.01it/s]
Diarizing: 1it [00:00,  6.03it/s]


clean_audio_024.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:06 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 21.06it/s]
Diarizing: 1it [00:00,  5.80it/s]


clean_audio_025.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:07 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.69it/s]
Diarizing: 1it [00:00,  6.06it/s]


clean_audio_026.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:07 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps:   0%|          | 0/2 [00:00<?, ?it/s][A
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 18.88it/s]
Diarizing: 1it [00:00,  5.41it/s]
[NeMo W 2026-02-24 13:21:07 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


clean_audio_027.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.62it/s]
Diarizing: 1it [00:00,  5.96it/s]


clean_audio_028.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:07 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.82it/s]
Diarizing: 1it [00:00,  6.13it/s]


clean_audio_029.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:07 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 20.73it/s]
Diarizing: 1it [00:00,  5.73it/s]
[NeMo W 2026-02-24 13:21:08 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


gold_stereo_000.wav            |  100.00 % | 1     | 0     |   -1 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.74it/s]
Diarizing: 1it [00:00,  6.02it/s]


gold_stereo_001.wav            |  100.00 % | 1     | 0     |   -1 |   0.21s


[NeMo W 2026-02-24 13:21:08 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps:   0%|          | 0/2 [00:00<?, ?it/s][A
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 19.45it/s]
Diarizing: 1it [00:00,  5.54it/s]
[NeMo W 2026-02-24 13:21:08 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


gold_stereo_002.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.24it/s]
Diarizing: 1it [00:00,  6.02it/s]


gold_stereo_003.wav            |  100.00 % | 1     | 0     |   -1 |   0.21s


[NeMo W 2026-02-24 13:21:08 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 23.02it/s]
Diarizing: 1it [00:00,  6.14it/s]


gold_stereo_004.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:09 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.25it/s]
Diarizing: 1it [00:00,  6.12it/s]


gold_stereo_005.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:09 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.73it/s]
Diarizing: 1it [00:00,  6.14it/s]


gold_stereo_006.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:09 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 21.43it/s]
Diarizing: 1it [00:00,  5.52it/s]
[NeMo W 2026-02-24 13:21:09 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


gold_stereo_007.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.34it/s]
Diarizing: 1it [00:00,  5.97it/s]


gold_stereo_008.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:09 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.72it/s]
Diarizing: 1it [00:00,  6.06it/s]


gold_stereo_009.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:10 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.06it/s]
Diarizing: 1it [00:00,  5.93it/s]
[NeMo W 2026-02-24 13:21:10 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


gold_stereo_010.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 21.58it/s]
Diarizing: 1it [00:00,  5.72it/s]
[NeMo W 2026-02-24 13:21:10 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


gold_stereo_011.wav            |  100.00 % | 1     | 0     |   -1 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 21.88it/s]
Diarizing: 1it [00:00,  5.50it/s]
[NeMo W 2026-02-24 13:21:10 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


gold_stereo_012.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.31it/s]
Diarizing: 1it [00:00,  5.95it/s]


gold_stereo_013.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:10 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.32it/s]
Diarizing: 1it [00:00,  6.08it/s]


gold_stereo_014.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:11 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 21.28it/s]
Diarizing: 1it [00:00,  5.83it/s]


gold_stereo_015.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:11 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.05it/s]
Diarizing: 1it [00:00,  5.91it/s]

gold_stereo_016.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s



[NeMo W 2026-02-24 13:21:11 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 20.75it/s]
Diarizing: 1it [00:00,  5.30it/s]
[NeMo W 2026-02-24 13:21:11 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


gold_stereo_017.wav            |  100.00 % | 1     | 0     |   -1 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 20.14it/s]
Diarizing: 1it [00:00,  5.67it/s]
[NeMo W 2026-02-24 13:21:11 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


gold_stereo_018.wav            |  100.00 % | 1     | 0     |   -1 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 21.45it/s]
Diarizing: 1it [00:00,  5.86it/s]


gold_stereo_019.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:12 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 21.43it/s]
Diarizing: 1it [00:00,  5.78it/s]


gold_stereo_020.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:12 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 21.54it/s]
Diarizing: 1it [00:00,  5.78it/s]


gold_stereo_021.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:12 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.54it/s]
Diarizing: 1it [00:00,  5.42it/s]
[NeMo W 2026-02-24 13:21:12 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


gold_stereo_022.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 21.54it/s]
Diarizing: 1it [00:00,  5.81it/s]
[NeMo W 2026-02-24 13:21:12 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


gold_stereo_023.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 21.96it/s]
Diarizing: 1it [00:00,  5.38it/s]
[NeMo W 2026-02-24 13:21:13 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


gold_stereo_024.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.00it/s]
Diarizing: 1it [00:00,  5.85it/s]

gold_stereo_025.wav            |  100.00 % | 1     | 0     |   -1 |   0.21s



[NeMo W 2026-02-24 13:21:13 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.96it/s]
Diarizing: 1it [00:00,  6.06it/s]


gold_stereo_026.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:13 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.11it/s]
Diarizing: 1it [00:00,  5.55it/s]
[NeMo W 2026-02-24 13:21:13 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec


gold_stereo_027.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.39it/s]
Diarizing: 1it [00:00,  6.07it/s]


gold_stereo_028.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s


[NeMo W 2026-02-24 13:21:13 dataloader:826] The following configuration keys are ignored by Lhotse dataloader: soft_label_thres,num_spks,session_len_sec
Diarizing: 0it [00:00, ?it/s]
Streaming Steps: 100%|██████████| 2/2 [00:00<00:00, 22.51it/s]
Diarizing: 1it [00:00,  6.09it/s]


gold_stereo_029.wav            |  100.00 % | 2     | 0     |   -2 |   0.21s

🏆 DER MOYEN GLOBAL : 100.00 %
📁 Fichier sauvegardé : results_nemo_sortformer/resultats_nemo_sortformer.csv


In [5]:
from huggingface_hub import notebook_login
notebook_login()



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
from nemo.collections.asr.models import SortformerEncLabelModel

# load model from Hugging Face model card directly (You need a Hugging Face token)
diar_model = SortformerEncLabelModel.from_pretrained("nvidia/diar_streaming_sortformer_4spk-v2")


      m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
    
      m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
    
      elif re.match('(flt)p?( \(default\))?$', token):
    
      elif re.match('(dbl)p?( \(default\))?$', token):
    


diar_streaming_sortformer_4spk-v2.nemo:   0%|          | 0.00/471M [00:00<?, ?B/s]

[NeMo W 2026-02-24 13:04:14 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    sample_rate: 16000
    num_spks: 4
    session_len_sec: 90
    soft_label_thres: 0.5
    soft_targets: false
    labels: null
    batch_size: 4
    shuffle: true
    num_workers: 18
    validation_mode: false
    use_lhotse: false
    use_bucketing: false
    pin_memory: true
    window_stride: 0.01
    subsampling_factor: 8
    
[NeMo W 2026-02-24 13:04:14 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    is_tarred: false
    tarred_audio_filepaths: null
    sample_rate: 16000
    num_spks: 4
    sess

[NeMo I 2026-02-24 13:04:17 save_restore_connector:286] Model SortformerEncLabelModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--diar_streaming_sortformer_4spk-v2/snapshots/6dbf0d69730bfee097056692b86525a0a23b32f9/diar_streaming_sortformer_4spk-v2.nemo.
