# Tune VAD: Segment Duration Visualization

Use this notebook to tune Silero VAD parameters and visualize the distribution of segment durations.

- Input: a local audio/video file.
- Output: an inline plot of the segment-duration series; optionally, a timestamps-only WebVTT file saved beside the input (`.vad.vtt`).

Notes:

- The code relies on modules in `src/gigacan` (FFmpeg is required).
- Prefer running in a virtualenv with deps from `requirements.txt`.


In [None]:
%matplotlib inline
import os, sys, tempfile
from statistics import mean, median

# Add src/ to the import path so the local package works.
# Notebook kernels normally do not define __file__, so fall back to the current
# working directory there. When this code runs as a plain script, resolve the
# directory from the real __file__ value (not the literal string "__file__",
# which would always resolve relative to the working directory).
if '__file__' in globals():
    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
else:
    CURRENT_DIR = os.getcwd()
SRC_DIR = os.path.join(CURRENT_DIR, 'src')
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

from gigacan import segmenter, transcriber

# File extensions (lower-case, with leading dot) that list_media() accepts.
AUDIO_EXTS = ('.mp3', '.wav', '.opus', '.m4a', '.mp4', '.webm', '.flv', '.mkv')


def list_media(folder: str):
    """Return the media files directly inside *folder*, sorted by file name.

    A relative *folder* is resolved against the current working directory.
    Only regular files whose lower-cased name ends with one of AUDIO_EXTS are
    returned, each as a path joined onto the resolved folder.

    This is a best-effort helper for populating a file picker: a missing
    folder, or any error raised while listing it, yields an empty list
    instead of an exception.
    """
    try:
        root = folder if os.path.isabs(folder) else os.path.join(os.getcwd(), folder)
        if not os.path.isdir(root):
            return []
        matches = []
        for name in sorted(os.listdir(root)):
            candidate = os.path.join(root, name)
            if not os.path.isfile(candidate):
                continue
            if name.lower().endswith(AUDIO_EXTS):
                matches.append(candidate)
        return matches
    except Exception:
        # Deliberately broad: the caller only needs "nothing to show".
        return []

def process(input_path: str, *, max_seg_seconds: float = 0.0, vad_merge_ms: int = 450, min_speech_ms: int = 200, save_vtt: bool = True):
    """Segment a media file and describe each segment by its duration.

    The input is converted to a temporary WAV via ``segmenter.extract_mono_wav``
    (called with ``sr=16000``), then passed to
    ``segmenter.try_silero_vad_segments``. If that returns nothing, the whole
    duration is cut with ``segmenter.fixed_window_segments`` instead, using
    *max_seg_seconds* as the window when it is positive and 30 seconds
    otherwise.

    Args:
        input_path: Path to an existing audio/video file.
        max_seg_seconds: Forwarded as ``max_seg_s`` to the VAD call; also the
            fallback window size when positive.
        vad_merge_ms: Forwarded as ``merge_gap_ms`` to the VAD call.
        min_speech_ms: Forwarded as ``min_speech_ms`` to the VAD call.
        save_vtt: When true, write ``<input without extension>.vad.vtt`` via
            ``transcriber.write_webvtt``.

    Returns:
        ``(segs, durations, entries, out_vtt)`` where *segs* holds
        ``(start, end)`` pairs, *durations* the non-negative ``end - start``
        values, *entries* ``(start, end, text)`` triples whose text is the
        duration formatted with three decimals, and *out_vtt* the written
        path or ``None``.

    Raises:
        FileNotFoundError: If *input_path* is not an existing file.
        RuntimeError: If the probed duration is not positive.
    """
    segmenter.check_ffmpeg()
    if not os.path.isfile(input_path):
        raise FileNotFoundError(f'Input not found: {input_path}')

    with tempfile.TemporaryDirectory() as workdir:
        wav_file = os.path.join(workdir, 'audio_16k_mono.wav')
        segmenter.extract_mono_wav(input_path, wav_file, sr=16000)

        media_seconds = segmenter.ffprobe_duration_seconds(wav_file)
        if media_seconds <= 0:
            raise RuntimeError('Could not determine media duration.')

        segs = segmenter.try_silero_vad_segments(
            wav_file,
            max_seg_s=float(max_seg_seconds),
            merge_gap_ms=int(vad_merge_ms),
            min_speech_ms=int(min_speech_ms),
        )
        if not segs:
            # No VAD result: fall back to fixed windows over the whole file.
            limit = float(max_seg_seconds)
            window = limit if limit > 0 else 30.0
            segs = segmenter.fixed_window_segments(media_seconds, max_seg_s=window)

        durations = []
        entries = []
        for start, end in segs:
            length = max(0.0, end - start)  # clamp inverted segments to zero
            durations.append(length)
            entries.append((start, end, f'{length:.3f}'))

        out_vtt = None
        if save_vtt:
            stem = os.path.splitext(input_path)[0]
            out_vtt = stem + '.vad.vtt'
            transcriber.write_webvtt(out_vtt, entries)

        return segs, durations, entries, out_vtt

def plot_duration_series(durations, *, title: str = 'Segment durations (s)'):
    """Draw segment durations against their 1-based segment index.

    When *durations* is non-empty, the series is plotted as a marked line and
    horizontal reference lines for its mean and median are added, together
    with a legend. An empty *durations* still produces a titled, labelled,
    gridded (but otherwise blank) figure.

    Args:
        durations: Segment lengths in seconds.
        title: Title shown above the plot.
    """
    # Imported lazily so merely defining this function does not need matplotlib.
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=(9, 4.5))

    if durations:
        positions = [idx for idx, _ in enumerate(durations, start=1)]
        ax.plot(positions, durations, '-o', markersize=3, linewidth=1.2, color='#4C78A8')

        avg = mean(durations)
        mid = median(durations)
        ax.axhline(avg, color='#E45756', linestyle='--', linewidth=1.2, label=f'mean={avg:.2f}s')
        ax.axhline(mid, color='#F2CF5B', linestyle=':', linewidth=1.2, label=f'median={mid:.2f}s')
        ax.legend(loc='upper right')

    ax.set_xlabel('Segment index (1-based)')
    ax.set_ylabel('Duration (seconds)')
    ax.set_title(title)
    ax.grid(True, linestyle=':', linewidth=0.5, alpha=0.6)

    fig.tight_layout()
    plt.show()


In [None]:
# Interactive controls (requires ipywidgets).
# HAS_W records whether the widget UI could be built; it is False when either
# ipywidgets or IPython.display fails to import.
try:
    import ipywidgets as widgets
    from IPython.display import display, Markdown
    HAS_W = True
except Exception:
    # Broad on purpose: any import-time failure means the widget UI is unusable.
    HAS_W = False

if HAS_W:
    # Folder text box feeds the file dropdown; [''] keeps the dropdown valid
    # when the folder has no media files.
    folder = widgets.Text(value='download/', description='Folder', layout=widgets.Layout(width='50%'))
    files = widgets.Dropdown(options=list_media(folder.value) or [''], description='File', layout=widgets.Layout(width='80%'))

    def _on_folder_change(c):
        """Refresh the file dropdown whenever the folder text changes."""
        files.options = list_media(c['new']) or ['']
    folder.observe(_on_folder_change, names='value')

    # VAD tuning parameters, forwarded to process() on each run.
    max_seg = widgets.FloatText(value=0.0, description='Max seg (s)', step=1.0)
    vad_merge = widgets.IntSlider(value=450, min=0, max=2000, step=50, description='Merge (ms)')
    min_speech = widgets.IntSlider(value=200, min=0, max=2000, step=50, description='Min speech (ms)')
    save_vtt_cb = widgets.Checkbox(value=True, description='Save VTT')

    run_btn = widgets.Button(description='Run VAD', button_style='primary')
    out_area = widgets.Output()

    def _run(_):
        """Run process() on the selected file and render summary, plot and VTT path."""
        with out_area:
            out_area.clear_output()
            path = files.value
            if not path or not os.path.isfile(path):
                display(Markdown('**Input not found** — select a file.'))
                return
            try:
                segs, durations, entries, out_vtt = process(
                    path,
                    max_seg_seconds=max_seg.value,
                    vad_merge_ms=vad_merge.value,
                    min_speech_ms=min_speech.value,
                    save_vtt=save_vtt_cb.value,
                )
            except Exception as e:
                display(Markdown(f'**Error:** {e}'))
                return
            # mean()/median() raise StatisticsError on an empty sequence, so only
            # report them when there is data; the plot helper copes with an
            # empty list on its own.
            if durations:
                summary = f'Segments: **{len(segs)}** — mean={mean(durations):.2f}s, median={median(durations):.2f}s'
            else:
                summary = f'Segments: **{len(segs)}**'
            display(Markdown(summary))
            plot_duration_series(durations, title=f'merge={vad_merge.value}ms, min_speech={min_speech.value}ms, max_seg={max_seg.value:.1f}s')
            if out_vtt:
                display(Markdown(f'Saved VTT: `{out_vtt}`'))

    run_btn.on_click(_run)
    display(widgets.VBox([
        widgets.HBox([folder, files]),
        widgets.HBox([max_seg, vad_merge, min_speech, save_vtt_cb]),
        run_btn,
        out_area
    ]))
else:
    print('ipywidgets not installed — using manual parameters below. You can still run the next cell.')

In [None]:
# Manual parameters (used when the widget UI is unavailable).
# Set the path to your media file and run this cell.
input_path = 'download/'  # e.g., 'download/001.opus'
max_seg_seconds = 0.0    # 0 disables splitting after merge
vad_merge_ms = 100       # merge pauses shorter than this (ms)
min_speech_ms = 200      # minimum speech duration to keep (ms)
save_vtt = True          # write <base>.vad.vtt beside the input

# Run only when the widgets cell did not build its UI. That cell records the
# outcome in HAS_W; checking sys.modules for 'ipywidgets' is unreliable because
# another package may have imported it, or the widgets cell may have failed
# after importing it. A missing HAS_W (cell never executed) counts as False.
if not globals().get('HAS_W', False):
    try:
        segs, durations, entries, out_vtt = process(
            input_path,
            max_seg_seconds=max_seg_seconds,
            vad_merge_ms=vad_merge_ms,
            min_speech_ms=min_speech_ms,
            save_vtt=save_vtt,
        )
        print(f'Segments: {len(segs)} — mean={mean(durations):.2f}s, median={median(durations):.2f}s')
        plot_duration_series(durations, title=f'merge={vad_merge_ms}ms, min_speech={min_speech_ms}ms, max_seg={max_seg_seconds:.1f}s')
        if out_vtt:
            print('Saved VTT:', out_vtt)
    except Exception as e:
        # Top-level notebook boundary: report the failure instead of a traceback.
        print('Error:', e)