In [None]:
!pip install numpy==1.26.4 --quiet
!pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121 --quiet
!pip install pyannote-core==5.0.0 pyannote-audio==3.1.1 pyannote-metrics==3.2.1 --quiet
!pip install ffmpeg-python --quiet

In [None]:
import os
import torch
# Pick the compute device: the GPU when torch can see one, the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

# Input recording that the diarization pipeline will process.
CLEAN_AUDIO_PATH = "audios/code_switch_cleaned.wav"

# Make sure the output folders exist before any later cell writes to them.
for _output_dir in ("diarization_output", "segments"):
    os.makedirs(_output_dir, exist_ok=True)

print("Using cleaned audio:", CLEAN_AUDIO_PATH)

Using device: cuda
Using cleaned audio: audios/code_switch_cleaned.wav


In [None]:
from pyannote.audio import Pipeline
import torch

# Read the Hugging Face access token from the environment so the secret is
# never stored in the notebook. `os` is imported in the setup cell.
# An unset variable yields "" (the value that used to be hard-coded here).
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", "")

# pyannote/speaker-diarization (For English)
# pyannote/speaker-diarization-3.1 (For Arabic)
# NOTE(review): the recorded output of this cell warns that the loaded
# checkpoint was trained with much older pyannote.audio / torch versions than
# the ones pinned in this notebook — confirm this model choice is intended.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization",
    # Pass None (the parameter's default) rather than an empty string when no
    # token is configured.
    use_auth_token=HUGGINGFACE_TOKEN or None,
)

# from_pretrained reports a failed download (e.g. missing/invalid token or
# unaccepted model terms) by returning None instead of raising; fail here with
# a clear message rather than with an AttributeError on `pipeline.to` below.
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization'. Set the "
        "HUGGINGFACE_TOKEN environment variable to a valid token and make "
        "sure the model's user conditions have been accepted."
    )

# Run the pipeline on the GPU when available. This rebinds `device` to a
# torch.device object.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline.to(device)

# The recording is processed with a fixed speaker count of two.
diarization = pipeline(CLEAN_AUDIO_PATH, num_speakers=2)
print("Diarization complete.")

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.6.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/root/.cache/torch/pyannote/speechbrain/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.3.1+cu121. Bad things might happen unless you revert torch to 1.x.


DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in /root/.cache/torch/pyannote/speechbrain.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/root/.cache/torch/pyannote/speechbrain/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /root/.cache/torch/pyannote/speechbrain/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Using symlink found at '/root/.cache/torch/pyannote/speechbrain/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /root/.cache/torch/pyannote/speechbrain/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Using symlink found at '/root/.cache/torch/pyannote/speechbrain/classifier.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["classifier"] = /root/.cache/torch/pyannote/speech

Diarization complete.


In [None]:
# Flatten the diarization result into plain dictionaries, one per speaker
# turn, keeping the order in which itertracks yields them.
segments = [
    {
        "speaker": label,
        "start": float(span.start),
        "end": float(span.end),
    }
    for span, _track, label in diarization.itertracks(yield_label=True)
]

print("Number of raw diarized segments:", len(segments))
# Last expression of the cell: display every raw segment.
segments[:]

Number of raw diarized segments: 14


[{'speaker': 'SPEAKER_01',
  'start': 0.4863481228668942,
  'end': 3.4215017064846416},
 {'speaker': 'SPEAKER_00',
  'start': 3.7286689419795227,
  'end': 7.431740614334471},
 {'speaker': 'SPEAKER_00',
  'start': 8.694539249146757,
  'end': 14.786689419795222},
 {'speaker': 'SPEAKER_00',
  'start': 15.759385665529011,
  'end': 17.994880546075084},
 {'speaker': 'SPEAKER_00',
  'start': 18.592150170648466,
  'end': 23.063139931740615},
 {'speaker': 'SPEAKER_00',
  'start': 24.39419795221843,
  'end': 24.428327645051194},
 {'speaker': 'SPEAKER_01',
  'start': 24.428327645051194,
  'end': 29.974402730375427},
 {'speaker': 'SPEAKER_01',
  'start': 30.6740614334471,
  'end': 30.998293515358363},
 {'speaker': 'SPEAKER_00',
  'start': 30.691126279863482,
  'end': 36.578498293515366},
 {'speaker': 'SPEAKER_01',
  'start': 38.50682593856655,
  'end': 44.496587030716725},
 {'speaker': 'SPEAKER_00',
  'start': 44.496587030716725,
  'end': 44.56484641638225},
 {'speaker': 'SPEAKER_00',
  'start': 4

In [None]:
def clean_diarization(diarization, gap_threshold=0.5, min_duration=0.4):
    """Merge, filter and relabel the speaker turns of a diarization result.

    Processing steps, in order:

    1. Consecutive turns of the same speaker are merged when the gap between
       them is at most ``gap_threshold``.
    2. Merged turns shorter than ``min_duration`` are dropped.
    3. If the first remaining turn is not labelled ``SPEAKER_00``, that label
       and ``SPEAKER_00`` are swapped so the first speaker is always
       ``SPEAKER_00``. All other labels are left untouched.

    Args:
        diarization: Object exposing ``itertracks(yield_label=True)`` that
            yields ``(segment, track, label)`` tuples, where ``segment`` has
            ``start`` and ``end`` attributes.
        gap_threshold: Largest gap between two consecutive turns of the same
            speaker that is still bridged by merging. Expressed in the time
            unit of the turn boundaries (presumably seconds — confirm).
        min_duration: Shortest merged turn that is kept, in the same unit.

    Returns:
        A list of ``{'speaker', 'start', 'end'}`` dictionaries in the order
        the turns were yielded. Empty when nothing survives the filter.
    """
    merged_segments = []

    for turn, _track, speaker in diarization.itertracks(yield_label=True):
        start, end = turn.start, turn.end
        previous = merged_segments[-1] if merged_segments else None

        if (
            previous is not None
            and previous['speaker'] == speaker
            and start - previous['end'] <= gap_threshold
        ):
            # Use max() so a turn nested inside the previous one cannot pull
            # the merged end backwards.
            previous['end'] = max(previous['end'], end)
        else:
            merged_segments.append({'speaker': speaker, 'start': start, 'end': end})

    cleaned_segments = [
        seg for seg in merged_segments
        if seg['end'] - seg['start'] >= min_duration
    ]

    canonical_first = 'SPEAKER_00'
    if cleaned_segments:
        first_speaker = cleaned_segments[0]['speaker']
        if first_speaker != canonical_first:
            # Swap only the two labels involved. Mapping every other label to
            # SPEAKER_00 would collapse distinct speakers into one as soon as
            # more than two speakers are present.
            swap = {
                first_speaker: canonical_first,
                canonical_first: first_speaker,
            }
            for seg in cleaned_segments:
                seg['speaker'] = swap.get(seg['speaker'], seg['speaker'])

    return cleaned_segments

# Post-process the raw diarization with the default thresholds.
cleaned_segments = clean_diarization(diarization)

# Report how many turns remain, then list them one per line.
n_cleaned = len(cleaned_segments)
print("Number of cleaned segments:", n_cleaned)
for cleaned in cleaned_segments:
    print(cleaned)

Number of cleaned segments: 11
{'speaker': 'SPEAKER_00', 'start': 0.4863481228668942, 'end': 3.4215017064846416}
{'speaker': 'SPEAKER_01', 'start': 3.7286689419795227, 'end': 7.431740614334471}
{'speaker': 'SPEAKER_01', 'start': 8.694539249146757, 'end': 14.786689419795222}
{'speaker': 'SPEAKER_01', 'start': 15.759385665529011, 'end': 17.994880546075084}
{'speaker': 'SPEAKER_01', 'start': 18.592150170648466, 'end': 23.063139931740615}
{'speaker': 'SPEAKER_00', 'start': 24.428327645051194, 'end': 29.974402730375427}
{'speaker': 'SPEAKER_01', 'start': 30.691126279863482, 'end': 36.578498293515366}
{'speaker': 'SPEAKER_00', 'start': 38.50682593856655, 'end': 44.496587030716725}
{'speaker': 'SPEAKER_01', 'start': 45.21331058020478, 'end': 50.65699658703072}
{'speaker': 'SPEAKER_01', 'start': 51.970989761092156, 'end': 54.496587030716725}
{'speaker': 'SPEAKER_00', 'start': 54.496587030716725, 'end': 55.02559726962457}


In [None]:
import json

# Destination of the cleaned diarization (inside the folder created by the
# setup cell).
json_path = "diarization_output/code_switch.json"

# Serialise first, then write the text in one go; the file content is the
# same as with json.dump(..., indent=4).
serialized = json.dumps(cleaned_segments, indent=4)
with open(json_path, "w") as out_file:
    out_file.write(serialized)

print("Saved diarization JSON →", json_path)

Saved diarization JSON → diarization_output/model_2.1/code_switch.json


In [None]:
import ffmpeg

# Folder receiving one WAV file per diarized turn. The setup cell only creates
# "segments", so the sub-folder is created here; without it ffmpeg cannot
# open the output file on a fresh run. `os` is imported in the setup cell.
segments_dir = "segments/code_switch_segments"
os.makedirs(segments_dir, exist_ok=True)

# NOTE(review): this loop exports the raw `segments`, not `cleaned_segments`,
# so the speaker labels in the file names can differ from the relabelled ones
# saved in the JSON — confirm this is intended.
for i, seg in enumerate(segments):
    start = seg["start"]
    end = seg["end"]
    speaker = seg["speaker"]

    out_path = f"{segments_dir}/seg_{i}_{speaker}.wav"

    try:
        (
            ffmpeg
            .input(CLEAN_AUDIO_PATH, ss=start, to=end)
            # Mono, 16 kHz output.
            .output(out_path, ac=1, ar=16000)
            .overwrite_output()
            .run(quiet=True)
        )
    except ffmpeg.Error as err:
        # quiet=True captures ffmpeg's stderr instead of showing it; print it
        # so a failing segment can be diagnosed, then re-raise unchanged.
        details = err.stderr.decode("utf-8", errors="replace") if err.stderr else ""
        print(f"ffmpeg failed for segment {i} ({out_path}):\n{details}")
        raise

print("All segments saved to /segments folder.")

All segments saved to /segments folder.
