# Step 3 · Cut Audio Segments

This notebook consumes VTT subtitles produced in step 2 and slices the matching audio in `download/` to produce segment clips in `cut/` together with a HuggingFace-ready `metadata.csv`.


## How to Use

1. Update the configuration cell with dataset metadata, input/output folders, and naming preferences.
2. Run the helper definitions cell to load the imports and utilities for parsing VTT cues and invoking ffmpeg, then run the configuration cell (it relies on `Path` imported by the helper cell).
3. Execute the processing cell to cut clips and regenerate `cut/metadata.csv`. Rerun after adjusting parameters.


In [3]:
from __future__ import annotations

import csv
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import List

FFMPEG_LOGLEVEL = "error"


@dataclass
class Segment:
    """One subtitle cue: its time span (in seconds) and its caption text."""

    # Cue boundaries expressed in seconds.
    start: float
    end: float
    # Caption text belonging to the cue.
    text: str


class ProcessingError(RuntimeError):
    """Signals that a segment could not be cut (bad cue timing or an ffmpeg failure)."""


def check_executable(binary: str) -> None:
    """Ensure the required binary can be launched before processing starts.

    Runs ``<binary> -version`` and discards its output.

    Args:
        binary: Executable name (looked up on PATH) or a path to the executable.

    Raises:
        FileNotFoundError: If the binary is missing, cannot be launched
            (for example because it is not executable), or exits with a
            non-zero status. A single exception type is kept so existing
            callers need only one ``except`` clause.
    """
    try:
        subprocess.run([binary, "-version"], capture_output=True, check=True)
    except FileNotFoundError as exc:
        raise FileNotFoundError(
            f"Executable '{binary}' is required but was not found. Install ffmpeg and expose it on PATH."
        ) from exc
    except (OSError, subprocess.CalledProcessError) as exc:
        # The path exists but could not be used (permission problem, not a
        # program, or a non-zero exit). Reporting "not found" here would send
        # the user looking in the wrong place, so include the real cause.
        raise FileNotFoundError(
            f"Executable '{binary}' is required but could not be run ({exc}). "
            "Check the ffmpeg installation and its permissions."
        ) from exc


def parse_timestamp(value: str) -> float:
    """Convert a WebVTT timestamp to seconds.

    Accepts ``HH:MM:SS.mmm`` as well as the short ``MM:SS.mmm`` form, because
    the hours field is optional in WebVTT. A comma decimal separator (SRT
    style) and surrounding whitespace are tolerated.

    Args:
        value: Timestamp text such as ``"00:01:02.500"`` or ``"01:02.500"``.

    Returns:
        The timestamp expressed in seconds.

    Raises:
        ValueError: If the text does not have two or three ``:``-separated
            fields, or a field is not numeric.
    """
    clean = value.replace(",", ".").strip()
    fields = clean.split(":")
    if len(fields) == 2:
        # Short form without hours: treat as 0 hours.
        fields.insert(0, "0")
    if len(fields) != 3:
        raise ValueError(
            f"Unrecognised timestamp {value!r}; expected [HH:]MM:SS.mmm"
        )
    hours, minutes, seconds = fields
    return (int(hours) * 3600) + (int(minutes) * 60) + float(seconds)


def parse_vtt_segments(vtt_path: Path) -> List[Segment]:
    """Extract ordered subtitle segments from a VTT file.

    The file is read as UTF-8 (a leading BOM is tolerated). A cue starts at a
    line containing ``-->`` and ends at the next blank line or at end of file;
    its text lines are joined with single spaces. Lines outside a cue (the
    ``WEBVTT`` header, cue identifiers, other header or comment lines) are
    ignored, and cues without text are dropped.

    Timing lines that do not split into exactly two non-empty timestamps are
    skipped rather than aborting the whole parse.

    Args:
        vtt_path: Path of the ``.vtt`` file to read.

    Returns:
        The cues in file order.

    Raises:
        ValueError: Propagated from ``parse_timestamp`` when a timestamp is
            present but cannot be parsed.
    """
    segments: List[Segment] = []
    start = end = None
    buffer: List[str] = []

    def flush() -> None:
        # Emit the cue collected so far, provided it has both timing and text.
        # Buffered lines are already stripped and non-empty.
        if start is None or end is None:
            return
        text = " ".join(buffer)
        if text:
            segments.append(Segment(start=start, end=end, text=text))

    for raw in vtt_path.read_text(encoding="utf-8-sig").splitlines():
        line = raw.strip()
        if not line:
            # A blank line terminates the current cue (if any).
            flush()
            start = end = None
            buffer = []
            continue
        if start is None and line.startswith("WEBVTT"):
            # Header line. Only skipped outside a cue so that caption text
            # which happens to begin with "WEBVTT" is not lost.
            continue
        if "-->" in line:
            parts = [frag.strip() for frag in line.split("-->")]
            if len(parts) != 2 or not parts[0] or not parts[1]:
                # Malformed timing line (extra arrows or a missing timestamp).
                continue
            start = parse_timestamp(parts[0])
            # Cue settings (e.g. "align:start") may follow the end timestamp.
            end = parse_timestamp(parts[1].split()[0])
            buffer = []
            continue
        if start is not None:
            buffer.append(line)

    # The last cue may not be followed by a blank line.
    flush()
    return segments


def slice_audio(
    ffmpeg_binary: str,
    input_audio: Path,
    output_audio: Path,
    start: float,
    end: float,
    audio_codec: str = "copy",
    overwrite: bool = False,
) -> None:
    """Slice a portion of audio using ffmpeg with a re-encode fallback.

    The clip is first produced with ``audio_codec`` (stream copy by default).
    If that ffmpeg run fails, a second attempt re-encodes with libopus at 96k.
    ffmpeg writes to a sibling ``<stem>.partial<suffix>`` file that is renamed
    onto ``output_audio`` only after a successful run, so a failed or
    interrupted run never leaves a truncated clip that a later call would
    mistake for a finished one.

    Args:
        ffmpeg_binary: ffmpeg executable name or path.
        input_audio: Source audio file.
        output_audio: Destination clip; parent folders are created as needed.
        start: Clip start in seconds.
        end: Clip end in seconds.
        audio_codec: Value passed to ``-c:a`` for the first attempt.
        overwrite: When False an existing ``output_audio`` is left untouched
            and the function returns immediately.

    Raises:
        ProcessingError: If ``end - start`` is not positive, or if both ffmpeg
            attempts fail.
    """
    duration = end - start
    if duration <= 0.0:
        raise ProcessingError(f"Invalid cue with non-positive duration: {start} → {end}")

    output_audio.parent.mkdir(parents=True, exist_ok=True)
    if output_audio.exists() and not overwrite:
        return

    # Keep the real suffix last so ffmpeg still infers the container from it.
    partial_output = output_audio.with_name(
        f"{output_audio.stem}.partial{output_audio.suffix}"
    )

    base_cmd = [
        ffmpeg_binary,
        "-hide_banner",
        # Never wait on stdin: there is nobody to answer a prompt in a notebook.
        "-nostdin",
        # Safe to force overwrite: the target is a private temp file, and the
        # "keep existing clip" case has already returned above.
        "-y",
        "-loglevel",
        FFMPEG_LOGLEVEL,
        "-ss",
        f"{start:.3f}",
        "-i",
        str(input_audio),
        "-t",
        f"{duration:.3f}",
    ]
    attempts = [
        base_cmd + ["-c:a", audio_codec, str(partial_output)],
        base_cmd + ["-c:a", "libopus", "-b:a", "96k", str(partial_output)],
    ]

    try:
        for cmd in attempts:
            result = subprocess.run(cmd, capture_output=True)
            if result.returncode == 0:
                partial_output.replace(output_audio)
                return
    finally:
        # Remove leftovers from a failed or interrupted attempt.
        if partial_output.exists():
            partial_output.unlink()

    # Both attempts failed; report the fallback's diagnostics.
    stderr = result.stderr.decode("utf-8", "ignore")
    raise ProcessingError(
        f"ffmpeg failed for segment {output_audio.name}:{stderr.strip()}"
    )

In [4]:
# --- Required metadata ---
# Prefix of every generated record id ("<prefix>-<counter>").
ID_PREFIX = "singlungfaaisai1"
# Values written verbatim into the "genre" and "album" metadata columns.
GENRE = "乘龍快婿"
ALBUM = "乘龍快婿第1部"

# --- File system layout ---
# Relative paths are interpreted against the notebook's working directory.
INPUT_AUDIO_DIR = Path("download")
INPUT_VTT_DIR = Path("vtt")
OUTPUT_DIR = Path("cut")
METADATA_PATH = OUTPUT_DIR / "metadata.csv"

# --- Behaviour toggles ---
# Extension used both to find source audio and to name the cut clips.
AUDIO_EXTENSION = ".opus"
# Zero-padding width for the per-file clip index and the record id counter.
ID_PADDING = 4
FFMPEG_BINARY = "ffmpeg"
# When False, clips that already exist on disk are kept as they are.
OVERWRITE_CLIPS = False

# Echo the resolved locations so a wrong working directory is spotted early.
print(
    "\n".join(
        f"{label:<22} {location.resolve()}"
        for label, location in (
            ("Audio input directory:", INPUT_AUDIO_DIR),
            ("VTT directory:", INPUT_VTT_DIR),
            ("Output directory:", OUTPUT_DIR),
            ("Metadata path:", METADATA_PATH),
        )
    )
)

Audio input directory: /Users/laufei/Documents/GitHub/gigacan/download
VTT directory:         /Users/laufei/Documents/GitHub/gigacan/vtt
Output directory:      /Users/laufei/Documents/GitHub/gigacan/cut
Metadata path:         /Users/laufei/Documents/GitHub/gigacan/cut/metadata.csv


In [5]:
# Fail fast when ffmpeg is unusable, and make sure the clip folder exists.
check_executable(FFMPEG_BINARY)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

fieldnames = ["audio", "id", "duration", "genre", "album", "text"]
records = []          # one metadata row per clip that was cut or already present
missing_vtt = []      # names of subtitle files that were expected but not found
skipped_segments = 0  # cues dropped: blank text, non-positive duration, ffmpeg failure
segment_counter = 0   # clips accepted so far; drives the sequential record ids

for audio_path in sorted(INPUT_AUDIO_DIR.glob(f"*{AUDIO_EXTENSION}")):
    if not audio_path.is_file():
        continue

    # Subtitles are matched to audio purely by file stem.
    vtt_path = INPUT_VTT_DIR / f"{audio_path.stem}.vtt"
    if not vtt_path.exists():
        missing_vtt.append(vtt_path.name)
        continue

    segments = parse_vtt_segments(vtt_path)
    if not segments:
        print(f"No segments found in {vtt_path.name}; skipping")
        continue

    # `index` counts every cue of the file, skipped ones included, so a clip's
    # file name does not depend on which other cues were dropped.
    for index, segment in enumerate(segments, start=1):
        text = " ".join(segment.text.split())  # collapse whitespace runs
        duration = round(segment.end - segment.start, 6)
        if not text or duration <= 0.0:
            skipped_segments += 1
            continue

        clip_name = f"{audio_path.stem}_{index:0{ID_PADDING}d}{AUDIO_EXTENSION}"
        clip_path = OUTPUT_DIR / clip_name

        try:
            slice_audio(
                ffmpeg_binary=FFMPEG_BINARY,
                input_audio=audio_path,
                output_audio=clip_path,
                start=segment.start,
                end=segment.end,
                audio_codec="copy",
                overwrite=OVERWRITE_CLIPS,
            )
        except ProcessingError as exc:
            print(f"Failed to process {clip_name}: {exc}")
            skipped_segments += 1
            continue

        # Only successfully produced clips consume an id.
        segment_counter += 1

        # Store the path relative to the working directory when possible;
        # otherwise keep it exactly as configured.
        try:
            relative_path = clip_path.relative_to(Path.cwd())
        except ValueError:
            relative_path = clip_path

        records.append(
            {
                "audio": relative_path.as_posix(),
                "id": f"{ID_PREFIX}-{segment_counter:0{ID_PADDING}d}",
                "duration": duration,
                "genre": GENRE,
                "album": ALBUM,
                "text": text,
            }
        )

# Regenerate the metadata file from scratch on every run.
with METADATA_PATH.open("w", encoding="utf-8", newline="") as handle:
    writer = csv.DictWriter(handle, fieldnames=fieldnames)
    writer.writeheader()
    for record in records:
        writer.writerow(record)

print(f"✅ Wrote {len(records)} clips to {METADATA_PATH}")
print(f"   Skipped segments: {skipped_segments}")
if missing_vtt:
    print("⚠️ Missing VTT files for:")
    print(*(f"  - {name}" for name in missing_vtt), sep="\n")

✅ Wrote 7797 clips to cut/metadata.csv
   Skipped segments: 0
⚠️ Missing VTT files for:
  - 第102集【乘龙怪婿 第1季】#粤语 #张颂文 [JsL4dOJ16bg].vtt
  - 第103集【乘龙怪婿 第1季】#粤语 #张颂文 [7K1-guJusjU].vtt
  - 第104集【乘龙怪婿 第1季】#粤语 #张颂文 [Ra4r4APHUos].vtt
  - 第105集【乘龙怪婿 第1季】#粤语 #张颂文 [AsNyGBYHRXY].vtt
  - 第106集【乘龙怪婿 第1季】#粤语 #张颂文 [Av-7DSu5a2Y].vtt
  - 第107集【乘龙怪婿 第1季】#粤语 #张颂文 [jrMFRV3TRW0].vtt
  - 第108集【乘龙怪婿 第1季】#粤语 #张颂文 [YNoa5PWsDo0].vtt
  - 第109集【乘龙怪婿 第1季】#粤语 #张颂文 [sFKg_szOEn8].vtt
  - 第110集【乘龙怪婿 第1季】#粤语 #张颂文 [y5L3d4TZOjg].vtt
  - 第111集【乘龙怪婿 第1季】#粤语 #张颂文 [kP7uR25mgT0].vtt
  - 第112集【乘龙怪婿 第1季】#粤语 #张颂文 [f3XVB5dPwRk].vtt
  - 第113集【乘龙怪婿 第1季】#粤语 #张颂文 [oI_JXr9TD_Y].vtt
  - 第114集【乘龙怪婿 第1季】#粤语 #张颂文 [kDCrudQ3_T8].vtt
  - 第115集【乘龙怪婿 第1季】#粤语 #张颂文 [PfOCj5Lnk9I].vtt
  - 第116集【乘龙怪婿 第1季】#粤语 #张颂文 [PTuP6Enfc4A].vtt
  - 第117集【乘龙怪婿 第1季】#粤语 #张颂文 [lhDehJB3aPk].vtt
  - 第118集【乘龙怪婿 第1季】#粤语 #张颂文 [Sd4PhBrBA-c].vtt
  - 第119集【乘龙怪婿 第1季】#粤语 #张颂文 [EY9yVJew9oA].vtt
  - 第120集【乘龙怪婿 第1季】#粤语 #张颂文 [humKdze0gq4].vtt
  - 第51集【乘龙怪婿 第1季】#粤语 #张颂文 [-_yyjBcEzq

In [6]:
# Sanity check: preview the first five metadata rows.
records[0:5]

[{'audio': 'cut/1080高清修复｜ 第10集【乘龙怪婿 第1季】#粤语 #张颂文 [I2q3X3ECt0M]_0001.opus',
  'id': 'singlungfaaisai1-0001',
  'duration': 30.0,
  'genre': '乘龍快婿',
  'album': '乘龍快婿第1部',
  'text': '喂，咁大間客棧話賣就賣喎，個價仲咁高，嚇，叫我去邊度霎時間揾咁多現銀？係咪啫？哎呀，唉，總之呢樣嘢咧，你唔使理咁多㗎啦，你淨係負責出面啫嘛。出面？你話出就出，我咪好冇面？嚇，你，去有得陣，打聽打聽，我萬金尤嘅招牌，響噹噹㗎噃。唉，係人都知道你萬姑娘嘅金漆招牌響叮噹㗎啦，咁先嚟揾你啫嘛。嗱噉，你幫手咧，誒喺我外父。'},
 {'audio': 'cut/1080高清修复｜ 第10集【乘龙怪婿 第1季】#粤语 #张颂文 [I2q3X3ECt0M]_0002.opus',
  'id': 'singlungfaaisai1-0002',
  'duration': 8.9,
  'genre': '乘龍快婿',
  'album': '乘龍快婿第1部',
  'text': '手上面買咗間客棧落嚟，最多咁啦，我畀5%嘅回扣你，點？哇唉，多咧，噉，真係唔方唔多。'},
 {'audio': 'cut/1080高清修复｜ 第10集【乘龙怪婿 第1季】#粤语 #张颂文 [I2q3X3ECt0M]_0003.opus',
  'id': 'singlungfaaisai1-0003',
  'duration': 30.0,
  'genre': '乘龍快婿',
  'album': '乘龍快婿第1部',
  'text': '人哋推銷個奶嘴，都有20%啦，何況咁大間客棧。你想點？最少都呢個數啦。哇，20%，重唔重手啲？唔重手㗎啦，嚇，等你做咗客棧嘅掌櫃，豬籠入水，大把有得你賺啦。正所謂小財唔出咧，就大財唔入，係嘛？咪咁肉赤啦。哇，食水咁深，你都算冇陰功咯喎。我算冇陰功，喂，嚇，比唔上你。'},
 {'audio': 'cut/1080高清修复｜ 第10集【乘龙怪婿 第1季】#粤语 #张颂文 [I2q3X3ECt0M]_0004.opus',
  'id': 'singlungfaaisai1-0004',
  'durati