# 🔵 Cell 1 — Install Dependencies

In [None]:
!pip install numpy==1.26.4 --quiet
!pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121 --quiet
!pip install pyannote-core==5.0.0 pyannote-audio==3.1.1 pyannote-metrics==3.2.1 --quiet
!pip install ffmpeg-python --quiet

In [None]:
import numpy
import librosa
import torch
import torchvision
import torchaudio
import pyannote.audio

# Report the installed version of every library this notebook relies on, so the
# environment can be checked against the pins in the install cell.
for label, module in (
    ("NumPy", numpy),
    ("Librosa", librosa),
    ("Torch", torch),
    ("Torchvision", torchvision),
    ("Torchaudio", torchaudio),
    ("Pyannote.audio", pyannote.audio),
):
    print(f"{label} version:", module.__version__)

NumPy version: 1.26.4
Librosa version: 0.11.0
Torch version: 2.3.1+cu121
Torchvision version: 0.18.1+cu121
Torchaudio version: 2.3.1+cu121
Pyannote.audio version: 3.1.1


# 🔵 Cell 2 — Setup Paths & Create Folders

In [None]:
import os
import torch
# Prefer the GPU whenever torch can see one; otherwise fall back to the CPU.
device = "cpu" if not torch.cuda.is_available() else "cuda"
print("Using device:", device)

# EDIT THIS: path of the pre-cleaned audio file that will be diarized.
CLEAN_AUDIO_PATH = "arabic_cleaned.wav"

# Output folders (created if missing, left untouched if they already exist).
for output_dir in ("diarization_output", "segments"):
    os.makedirs(output_dir, exist_ok=True)

print("Using cleaned audio:", CLEAN_AUDIO_PATH)

Using device: cuda
Using cleaned audio: arabic_cleaned.wav


# 🔵 Cell 3 — Run Pyannote Speaker Diarization

In [None]:
import getpass
import os

import torch
from pyannote.audio import Pipeline

# Never hard-code the Hugging Face access token in the notebook: read it from
# the HUGGINGFACE_TOKEN environment variable, and prompt for it (without
# echoing) only when the variable is unset or empty.
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN") or getpass.getpass(
    "Hugging Face access token: "
)

# pyannote/speaker-diarization (For English)
# pyannote/speaker-diarization-3.1 (For Arabic)
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=HUGGINGFACE_TOKEN,
)

# pyannote.audio 3.x reports a failed download of a gated/private pipeline by
# returning None instead of raising; fail here with a clear message rather than
# with an AttributeError on the next line.
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization-3.1'. Check that the "
        "token is valid and that the model's user conditions have been "
        "accepted on Hugging Face."
    )

# Run inference on the GPU when available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline.to(device)

# CLEAN_AUDIO_PATH is defined in the path-setup cell.
diarization = pipeline(CLEAN_AUDIO_PATH)
print("Diarization complete.")

config.yaml:   0%|          | 0.00/469 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.91M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/399 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/221 [00:00<?, ?B/s]

  std = sequences.std(dim=-1, correction=1)


Diarization complete.


# 🔵 Cell 4 — Extract Segments & Clean them (start, end, speaker)

In [None]:
# Flatten the diarization result into plain dicts: one entry per speaker turn,
# with the turn boundaries converted to built-in floats.
segments = [
    {"speaker": speaker, "start": float(turn.start), "end": float(turn.end)}
    for turn, _, speaker in diarization.itertracks(yield_label=True)
]

print("Number of raw diarized segments:", len(segments))
# Preview only the first few turns; dumping the whole list floods the output.
segments[:5]

Number of raw diarized segments: 35


[{'speaker': 'SPEAKER_00',
  'start': 2.504244482173175,
  'end': 2.5551782682512734},
 {'speaker': 'SPEAKER_01',
  'start': 2.5551782682512734,
  'end': 2.826825127334465},
 {'speaker': 'SPEAKER_01',
  'start': 3.2512733446519526,
  'end': 4.422750424448218},
 {'speaker': 'SPEAKER_00',
  'start': 5.780984719864177,
  'end': 6.68081494057725},
 {'speaker': 'SPEAKER_00',
  'start': 7.224108658743633,
  'end': 9.634974533106961},
 {'speaker': 'SPEAKER_01',
  'start': 10.076400679117148,
  'end': 10.365025466893039},
 {'speaker': 'SPEAKER_01',
  'start': 10.415959252971138,
  'end': 12.02886247877759},
 {'speaker': 'SPEAKER_01',
  'start': 12.35144312393888,
  'end': 13.04753820033956},
 {'speaker': 'SPEAKER_01',
  'start': 13.33616298811545,
  'end': 15.916808149405773},
 {'speaker': 'SPEAKER_00',
  'start': 17.258064516129032,
  'end': 19.855687606112056},
 {'speaker': 'SPEAKER_01',
  'start': 20.38200339558574,
  'end': 20.687606112054333},
 {'speaker': 'SPEAKER_01',
  'start': 21.2308

In [None]:
# Clean and merge diarization segments directly from Pyannote
def clean_diarization(diarization, gap_threshold=0.5, min_duration=0.4):
    """Merge, filter and relabel the speaker turns of a diarization result.

    Processing steps, in order:

    1. Consecutive turns of the same speaker are merged when the pause
       between them is at most ``gap_threshold``.
    2. Merged turns shorter than ``min_duration`` are dropped.
    3. If the first remaining turn does not belong to ``'SPEAKER_00'``, that
       speaker's label is swapped with ``'SPEAKER_00'`` so the first speaker
       is always ``'SPEAKER_00'``. All other labels are left untouched.

    Args:
        diarization: Object exposing ``itertracks(yield_label=True)`` that
            yields ``(turn, track, speaker)`` tuples, where ``turn`` has
            ``start`` and ``end`` attributes.
        gap_threshold: Largest pause between two same-speaker turns that is
            still bridged by merging (same unit as the turn times).
        min_duration: Shortest merged turn that is kept.

    Returns:
        list[dict]: One ``{'speaker', 'start', 'end'}`` dict per kept turn,
        in the order the turns were yielded. Empty when nothing survives.
    """
    merged_segments = []

    for turn, _, speaker in diarization.itertracks(yield_label=True):
        # Built-in floats keep the result JSON-friendly.
        start, end = float(turn.start), float(turn.end)
        previous = merged_segments[-1] if merged_segments else None

        if (
            previous is not None
            and previous['speaker'] == speaker
            and start - previous['end'] <= gap_threshold
        ):
            # max() prevents a turn nested inside the previous one from
            # shrinking the merged segment.
            previous['end'] = max(previous['end'], end)
        else:
            merged_segments.append({'speaker': speaker, 'start': start, 'end': end})

    # Remove very short segments
    cleaned_segments = [
        seg for seg in merged_segments
        if seg['end'] - seg['start'] >= min_duration
    ]

    # Make the first speaker always SPEAKER_00 by swapping only the two labels
    # involved, so recordings with more than two speakers keep every speaker
    # distinct instead of being collapsed into SPEAKER_00.
    if cleaned_segments:
        first_speaker = cleaned_segments[0]['speaker']
        if first_speaker != 'SPEAKER_00':
            swap = {first_speaker: 'SPEAKER_00', 'SPEAKER_00': first_speaker}
            for seg in cleaned_segments:
                seg['speaker'] = swap.get(seg['speaker'], seg['speaker'])

    return cleaned_segments

# Usage
# Run the cleaner with its default thresholds on the pipeline result.
cleaned_segments = clean_diarization(diarization)

print("Number of cleaned segments:", len(cleaned_segments))
# One cleaned turn per line; nothing is printed when no turn survived.
if cleaned_segments:
    print("\n".join(str(seg) for seg in cleaned_segments))

Number of cleaned segments: 17
{'speaker': 'SPEAKER_00', 'start': 2.5551782682512734, 'end': 4.422750424448218}
{'speaker': 'SPEAKER_01', 'start': 5.780984719864177, 'end': 6.68081494057725}
{'speaker': 'SPEAKER_01', 'start': 7.224108658743633, 'end': 9.634974533106961}
{'speaker': 'SPEAKER_00', 'start': 10.076400679117148, 'end': 15.916808149405773}
{'speaker': 'SPEAKER_01', 'start': 17.258064516129032, 'end': 19.855687606112056}
{'speaker': 'SPEAKER_00', 'start': 21.230899830220714, 'end': 24.830220713073004}
{'speaker': 'SPEAKER_01', 'start': 26.731748726655347, 'end': 27.920203735144312}
{'speaker': 'SPEAKER_00', 'start': 28.480475382003398, 'end': 30.687606112054333}
{'speaker': 'SPEAKER_01', 'start': 32.164685908319186, 'end': 36.188455008488965}
{'speaker': 'SPEAKER_00', 'start': 36.83361629881154, 'end': 41.40067911714771}
{'speaker': 'SPEAKER_01', 'start': 42.89473684210526, 'end': 46.035653650254666}
{'speaker': 'SPEAKER_00', 'start': 46.74872665534805, 'end': 48.344651952461

# 🔵 Cell 5 — Save diarization as JSON

In [None]:
import json

# Destination of the cleaned diarization turns.
# NOTE(review): the file name says "english" while the input configured in the
# setup cell is "arabic_cleaned.wav" - confirm the intended name for each run.
json_path = "diarization_output/english.json"

# Explicit encoding keeps the written file independent of the platform default.
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(cleaned_segments, f, indent=4)

print("Saved diarization JSON →", json_path)

Saved diarization JSON â†’ diarization_output/noise_code_switch.json


# 🔵 Cell 6 — Cut Segments into Separate WAV Files

In [None]:
import ffmpeg

# Export one WAV clip per *cleaned* turn. Iterating over `cleaned_segments`
# (not the raw `segments`) keeps clip indices and speaker labels aligned with
# the saved diarization JSON and skips the fragments removed during cleaning.
for i, seg in enumerate(cleaned_segments):
    start = seg["start"]
    end = seg["end"]
    speaker = seg["speaker"]

    out_path = f"segments/seg_{i}_{speaker}.wav"

    (
        ffmpeg
        # ss/to select the [start, end] window of the source audio.
        .input(CLEAN_AUDIO_PATH, ss=start, to=end)
        # ac=1 / ar=16000: mono, 16 kHz output.
        .output(out_path, ac=1, ar=16000)
        .overwrite_output()
        .run(quiet=True)
    )

print("All segments saved to /segments folder.")

All segments saved to /segments folder.
