# pyannote/speaker-diarization-3.1

In [None]:
import os
from pyannote.audio import Pipeline
import torch
from tqdm import tqdm

# 1. Build the speaker-diarization pipeline.
# The Hugging Face access token is read from the HF_TOKEN environment variable
# so the credential is never stored in the notebook; a KeyError here means the
# variable has not been set.
# Fall back to CPU when no GPU is available instead of crashing in `.to(...)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.environ["HF_TOKEN"],
).to(device)

# 2. Root folder that holds the dataset
root_folder = "/ASV/dataset/train_eval/data"

# 3. Collect every .wav file found in any sub-folder (case-insensitive
#    extension match), sorted so the processing order is reproducible
wav_files = sorted(
    os.path.join(dirpath, fname)
    for dirpath, _, filenames in os.walk(root_folder)
    for fname in filenames
    if fname.lower().endswith(".wav")
)

# 4. Process each file, showing progress with tqdm.
#    Each output line has the form "<wav path> <number of distinct speakers>".
output_path = "speaker-diarization-result.txt"
with open(output_path, "w", encoding="utf-8") as f:
    for wav_path in tqdm(wav_files, desc="Processing audio files"):
        try:
            # 4.1 Run diarization on the file
            diarization = pipeline(wav_path)
            # 4.2 Gather the distinct speaker labels across all speaker turns
            speakers = {
                speaker
                for _, _, speaker in diarization.itertracks(yield_label=True)
            }

            f.write(f"{wav_path} {len(speakers)}\n")
        except Exception as e:
            # Best effort: report the failing file and continue with the rest
            # so one bad file does not abort the whole batch.
            fname = os.path.basename(wav_path)
            print(f"Lỗi khi xử lý {fname}: {e}")

print(f"\nĐã lưu kết quả vào {output_path}")

It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.

  std = sequences.std(dim=-1, correction=1)
Processing audio files:   7%|▋         | 6033/87139 [06:40<2:20:12,  9.64it/s] 

# pyannote/segmentation-3.0

In [None]:
import os
from pyannote.audio import Model
from pyannote.audio.pipelines import OverlappedSpeechDetection

# 1. Load the segmentation model.
# The Hugging Face access token is read from the HF_TOKEN environment variable
# so the credential is never stored in the notebook; a KeyError here means the
# variable has not been set.
segmentation_model = Model.from_pretrained(
    "pyannote/segmentation-3.0",
    use_auth_token=os.environ["HF_TOKEN"],
)

# 2. Build the overlapped speech detection (OSD) pipeline
osd_pipeline = OverlappedSpeechDetection(segmentation=segmentation_model)
HYPER_PARAMETERS = {
    # Remove overlap regions that are too short (if needed)
    "min_duration_on": 0.1,
    # Fill short non-overlap gaps (if needed)
    "min_duration_off": 0.0,
}
osd_pipeline.instantiate(HYPER_PARAMETERS)

# 3. Folder holding the .wav files to check
spk_folder = "/ASV/dataset/train_eval/data/id00000"

# 4. Files in which overlapped speech (several speakers at once) was detected
multi_speaker_files = []

# 5. Walk the .wav files; os.listdir returns entries in arbitrary order, so
#    sort them to make the run (and the printed list) reproducible
for fname in sorted(os.listdir(spk_folder)):
    if not fname.lower().endswith(".wav"):
        continue
    wav_path = os.path.join(spk_folder, fname)
    print(f"Processing {fname}...")

    # 6. Run overlapped speech detection
    osd = osd_pipeline(wav_path)

    # 7. Any overlap segment in the result marks the file as multi-speaker
    if len(osd.get_timeline()) > 0:
        multi_speaker_files.append(wav_path)

# 8. Print the results
print("\nFiles chứa chồng lấn (nhiều người nói cùng lúc):")
for path in multi_speaker_files:
    print(path)

# pyannote/overlapped-speech-detection

In [None]:
import os
from pyannote.audio import Pipeline

# 1. Hugging Face access token, read from the HF_TOKEN environment variable so
#    the credential is never stored in the notebook; a KeyError here means the
#    variable has not been set.
HF_TOKEN = os.environ["HF_TOKEN"]

# 2. Build the overlapped speech detection pipeline
pipeline = Pipeline.from_pretrained(
    "pyannote/overlapped-speech-detection",
    use_auth_token=HF_TOKEN,
)

# 3. Folder holding the WAV files to check
spk_folder = "/ASV/dataset/train_eval/data/id00000"

# 4. Names of the files in which overlapped speech was detected
multi_speaker_files = []

# 5. Walk the WAV files; os.listdir returns entries in arbitrary order, so
#    sort them to make the run (and the printed list) reproducible
for fname in sorted(os.listdir(spk_folder)):
    if not fname.lower().endswith(".wav"):
        continue

    wav_path = os.path.join(spk_folder, fname)
    print(f"Processing {fname}...")

    # 6. Run the pipeline
    output = pipeline(wav_path)

    # 7. Any segment in the resulting timeline marks the file as containing
    #    overlapped speech; only the bare file name is recorded here
    if len(output.get_timeline()) > 0:
        multi_speaker_files.append(fname)

# 8. Print the results
print("\nFiles chứa overlapped speech (≥2 speakers):")
for name in multi_speaker_files:
    print(f" - {name}")


In [None]:
import os
from pyannote.audio import Model
from pyannote.audio.pipelines import OverlappedSpeechDetection

# 1. Load the segmentation model and move it to the GPU.
# The Hugging Face access token is read from the HF_TOKEN environment variable
# so the credential is never stored in the notebook; a KeyError here means the
# variable has not been set.
segmentation_model = Model.from_pretrained(
    "pyannote/segmentation-3.0",
    use_auth_token=os.environ["HF_TOKEN"],
).to("cuda")

# 2. Build the OSD pipeline with a minimum overlap duration of 0.2 s
osd_pipeline = OverlappedSpeechDetection(segmentation=segmentation_model)
HYPER_PARAMETERS = {
    "min_duration_on": 0.2,   # drop overlap regions shorter than 200 ms
    "min_duration_off": 0.0,
}
osd_pipeline.instantiate(HYPER_PARAMETERS)

# 3. Root folder containing every sub-folder
root_folder = "/ASV/dataset/train_eval/data"

# 4. Paths of the .wav files in which overlapped speech was detected
multi_speaker_files = []

# 5. Recursively walk the whole directory tree
for dirpath, dirnames, filenames in os.walk(root_folder):
    # Sorting dirnames in place makes the top-down walk visit sub-folders in a
    # reproducible order; file names are sorted for the same reason.
    dirnames.sort()
    for fname in sorted(filenames):
        if not fname.lower().endswith(".wav"):
            continue
        wav_path = os.path.join(dirpath, fname)
        print(f"Processing {wav_path}...")

        try:
            # 6. Run overlapped speech detection
            osd = osd_pipeline(wav_path)
        except Exception as e:
            # Best effort: report the failing file and keep going so a single
            # unreadable file does not throw away the whole run.
            print(f"Lỗi khi xử lý {wav_path}: {e}")
            continue

        # 7. Any overlap segment in the result marks the file as multi-speaker
        if len(osd.get_timeline()) > 0:
            multi_speaker_files.append(wav_path)

# 8. Print the results
print("\n=== Files chứa overlapped speech (≥2 speakers) ===")
for path in multi_speaker_files:
    print(f" - {path}")

In [None]:
# 9. Persist the flagged files to disk: the list is sorted in place and then
#    written one path per line
multi_speaker_files.sort()
output_path = "multi_speaker_files.txt"
with open(output_path, "w") as out_file:
    out_file.writelines(f"{wav_path}\n" for wav_path in multi_speaker_files)

print(f"\nĐã lưu danh sách vào {output_path}")

In [None]:
# Spot-check: run the OSD pipeline built in an earlier cell on a single file.
# The result is kept in `asdas` and inspected in the following cell.
asdas = osd_pipeline("/ASV/dataset/train_eval/data/id00002/00034.wav")

In [None]:
# Number of segments in the timeline of the spot-check result
len(asdas.get_timeline())