<a href="https://colab.research.google.com/github/kumakuma131355-art/-/blob/main/%E5%AE%8C%E6%88%90%E7%89%88.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# === Cell 1: Settings ===
PARENT = '/content/drive/MyDrive/zundashorts'  # parent folder (cell 2 builds ROOT from it)
JOB    = 'job018'                              # job name (sub-folder under ROOT/ずんだ_ジョブ)

# Output video settings
FPS = 30
RESOLUTION = '1080x1920'  # "WIDTHxHEIGHT"; split on 'x' into ow/oh in cell 8

# (Optional) Set a URL only when audio should be auto-generated with VOICEVOX
# (it must be reachable from Colab).
# e.g. 'http://127.0.0.1:50021' does NOT work from Colab because it is local.
VOICEVOX_URL = None
SPEAKER_NAME = 'ずんだもん'  # a speaker whose name contains this text is picked automatically


In [None]:
# === Cell 2: Mount Drive & build paths ===
from pathlib import Path

from google.colab import drive
drive.mount('/content/drive')

# ROOT is the shared parent folder; JOBDIR is this job's working folder.
ROOT   = Path(PARENT)
JOBDIR = ROOT / 'ずんだ_ジョブ' / JOB
# Create the job folder (and any missing parents) if it does not exist yet.
JOBDIR.mkdir(parents=True, exist_ok=True)

print('ROOT  :', ROOT)
print('JOBDIR:', JOBDIR)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
ROOT  : /content/drive/MyDrive/zundashorts
JOBDIR: /content/drive/MyDrive/zundashorts/ずんだ_ジョブ/job018


In [None]:
# === セル3：インポート＆ユーティリティ ===
import os, csv, sys, io, shutil, subprocess, time, json
from pathlib import Path
from typing import List, Dict, Optional

def msg(*a):
    """Print the given values on one line, prefixed with "[INFO]"."""
    print("[INFO]", *a)


def warn(*a):
    """Print the given values on one line, prefixed with "[WARN]"."""
    print("[WARN]", *a)


def err(*a):
    """Print the given values on one line, prefixed with "[ERROR]"."""
    print("[ERROR]", *a)

def ffprobe_duration(path: Path) -> float:
    """Return the duration of an audio file in seconds, as reported by ffprobe.

    The duration of the first audio stream (``a:0``) is queried first. If that
    query fails for any reason (ffprobe error, or output that is not a number),
    the container-level ``format=duration`` is queried instead; an exception
    from that second query propagates to the caller.
    """
    output_opts = ["-of", "default=nw=1:nk=1"]
    try:
        stream_query = [
            "ffprobe", "-v", "error", "-select_streams", "a:0",
            "-show_entries", "stream=duration", *output_opts, str(path),
        ]
        stream_text = subprocess.check_output(stream_query).decode().strip()
        return float(stream_text)
    except Exception:
        # Fallback: ask for the duration of the whole container.
        format_query = [
            "ffprobe", "-v", "error", "-show_entries", "format=duration",
            *output_opts, str(path),
        ]
        format_text = subprocess.check_output(format_query).decode().strip()
        return float(format_text)

def run_ffmpeg(cmd: List[str]):
    """Echo a command line and run it, raising on a non-zero exit status.

    Args:
        cmd: Program and arguments, passed to ``subprocess.run`` as a list
            (no shell is involved).

    Raises:
        subprocess.CalledProcessError: if the process exits with a non-zero
            status (``check=True``).
    """
    import shlex  # local import so this cell does not depend on another cell's imports

    # Quote every argument for the log line: filter graphs and paths contain
    # spaces/quotes/semicolons, and the echoed command should be safe to
    # copy-paste into a shell. str() also tolerates Path elements.
    print("+", " ".join(shlex.quote(str(part)) for part in cmd))
    subprocess.run(cmd, check=True)


In [None]:
# === セル4：CSV検出・読み込み・ソフトバリデーション ===
# Values that soft_validate() accepts without a warning. Other values only
# produce a warning; processing continues.
# Emotion names (first CSV column).
ALLOWED_EMOTIONS = {"normal","joy","sad","angry","surprised","doya","think","insight","lecture","facepalm","sweat","sparkle","pointing","peeking"}
# Background motion names (fourth CSV column).
ALLOWED_MOVES = {"静止","ズームイン","ズームアウト","右へパン","左へパン","上へパン","下へパン","ズームインしながら右へパン"}
# Background image file names (third CSV column).
ALLOWED_BACKGROUNDS = {"background01.png","background02.png","background03.png","background04.png"}

def find_csv(jobdir: Path) -> Path:
    """Locate the script file 台本.csv for a job.

    Looks directly under ``jobdir`` first, then under ``jobdir/台本``.

    Raises:
        FileNotFoundError: if neither location contains the file.
    """
    direct = jobdir / '台本.csv'
    nested = jobdir / '台本' / '台本.csv'
    found = next((path for path in (direct, nested) if path.exists()), None)
    if found is None:
        raise FileNotFoundError(f"台本.csv が見つかりません。{direct} または {nested} に置いてください。")
    return found

def read_csv_rows(csv_path: Path) -> List[Dict]:
    """Read the script CSV and return one dict per usable data row.

    The file is read as UTF-8 (a BOM is tolerated). The first record is taken
    as the header. Records that are empty or whitespace-only are skipped
    silently; records that do not have exactly 4 columns are skipped with a
    warning. A header that differs from the expected one only produces a
    warning (emitted after all rows were read).

    Returns:
        A list of dicts with keys "emotion", "text", "bg" and "move".
        "text" keeps its leading whitespace (only the right side is trimmed);
        the other fields are trimmed on both sides.
    """
    expected_header = ["感情","セリフ","背景画像ファイル名","背景の動き"]
    parsed: List[Dict] = []
    with csv_path.open("r", encoding="utf-8-sig", newline="") as handle:
        reader = csv.reader(handle)
        header = next(reader, None)  # None when the file has no records at all
        # Record numbers are 1-based and the header is record 1.
        for i, record in enumerate(reader, start=2):
            if not any((cell or '').strip() for cell in record):
                continue  # blank or whitespace-only record
            if len(record) != 4:
                warn(f"{i}行目: 4列である必要があります。got={record}（続行）")
                continue
            emotion, text, bg, move = record
            parsed.append({
                "emotion": (emotion or "").strip(),
                "text":    (text or "").rstrip(),
                "bg":      (bg or "").strip(),
                "move":    (move or "").strip(),
            })
    if header != expected_header:
        warn(f"CSVヘッダが想定と違います。got={header} expected={expected_header}（続行）")
    return parsed

def soft_validate(rows: List[Dict]):
    """Emit warnings for rows that deviate from the recommended script format.

    Nothing is raised and nothing is modified: every finding is reported via
    ``warn`` and processing continues. Checks: total row count (8-12
    recommended), emotion / background / motion values against the ALLOWED_*
    sets (empty values are not checked), the "ずんだもん, " prefix of the line
    text, and the presence of 「」 brackets in the text.
    """
    count = len(rows)
    if count < 8 or count > 12:
        warn(f"推奨行数8〜12を外れています now={count}（続行）")

    for idx, row in enumerate(rows, start=1):
        emotion = row["emotion"]
        if emotion and emotion not in ALLOWED_EMOTIONS:
            warn(f"{idx}行目: 未許可の感情 '{emotion}'（続行）")

        text = row["text"]
        if not text.startswith("ずんだもん, "):
            warn(f"{idx}行目: セリフは 'ずんだもん, ' 開始推奨（続行）")
        if any(bracket in text for bracket in ("「", "」")):
            warn(f"{idx}行目: 鍵かっこ「」は非推奨（続行）")

        bg = row["bg"]
        if bg and bg not in ALLOWED_BACKGROUNDS:
            warn(f"{idx}行目: 背景は {sorted(ALLOWED_BACKGROUNDS)} 推奨 got={bg}（続行）")

        move = row["move"]
        if move and move not in ALLOWED_MOVES:
            warn(f"{idx}行目: 未許可の背景の動き '{move}'（続行）")

# Run: locate the script CSV, load its rows, and print soft-validation warnings.
CSV_PATH = find_csv(JOBDIR)
ROWS = read_csv_rows(CSV_PATH)
soft_validate(ROWS)
print("CSV:", CSV_PATH, " / rows:", len(ROWS))


[WARN] 推奨行数8〜12を外れています now=14（続行）
CSV: /content/drive/MyDrive/zundashorts/ずんだ_ジョブ/job018/台本/台本.csv  / rows: 14


In [None]:
# === セル5：付随ファイルの出力（VOICEVOX用.txt / 必要ファイルリスト.txt） ===
def write_voicevox_and_assets(csv_path: Path, rows: List[Dict]):
    """Write two helper text files next to the script CSV.

    * ``VOICEVOX用.txt`` - the text of every row whose text is non-blank,
      one per line, with a trailing newline.
    * ``必要ファイルリスト.txt`` - a checklist of the standing-picture files
      (close/open pair per distinct emotion), the distinct background files,
      and one ``NN_audio.wav`` entry per row.

    Both files are written as UTF-8 into ``csv_path.parent``.
    """
    target_dir = csv_path.parent

    # Text file for VOICEVOX.
    spoken = [row["text"] for row in rows if row["text"].strip()]
    (target_dir / "VOICEVOX用.txt").write_text("\n".join(spoken) + "\n", encoding="utf-8")

    # Checklist of required asset files.
    emotions = sorted({row["emotion"] for row in rows if row["emotion"]})
    backgrounds = sorted({row["bg"] for row in rows if row["bg"]})

    checklist = ["### 立ち絵素材 (zunda_{感情}_close.png / zunda_{感情}_open.png)"]
    for emotion in emotions:
        checklist.append(f"- zunda_{emotion}_close.png")
        checklist.append(f"- zunda_{emotion}_open.png")
    checklist.extend(["", "### 背景素材"])
    checklist.extend(f"- {name}" for name in backgrounds)
    checklist.extend(["", "### 音声素材 (セリフの行数分)"])
    # One audio file per row, numbered from 01.
    checklist.extend(f"- {number:02d}_audio.wav" for number in range(1, len(rows) + 1))

    (target_dir / "必要ファイルリスト.txt").write_text("\n".join(checklist), encoding="utf-8")
    msg("VOICEVOX用.txt / 必要ファイルリスト.txt を出力:", target_dir)

# Run: write the helper text files next to the script CSV.
write_voicevox_and_assets(CSV_PATH, ROWS)


[INFO] VOICEVOX用.txt / 必要ファイルリスト.txt を出力: /content/drive/MyDrive/zundashorts/ずんだ_ジョブ/job018/台本


In [None]:
# === セル6：背景→scene_XX.png 展開 ===
def find_background(jobdir: Path, name: str) -> Optional[Path]:
    """Find a background image by file name inside a job folder.

    Only the final path component of ``name`` is used (any directory part is
    dropped). The job folder itself, then ``背景/``, then ``backgrounds/`` are
    checked in that order.

    Returns:
        The first existing path, or None when no location has the file.
    """
    basename = Path(name).name  # drop any directory part
    search_dirs = (jobdir, jobdir / '背景', jobdir / 'backgrounds')
    candidates = (folder / basename for folder in search_dirs)
    return next((path for path in candidates if path.exists()), None)

def build_scene_images(jobdir: Path, rows: List[Dict]) -> List[Path]:
    """Copy each row's background image to ``images_from_bg/scene_NN.png``.

    ``NN`` is the 1-based row number. Rows with an empty background, or whose
    background file cannot be found, are skipped with a warning, so the
    returned list can be shorter than ``rows`` (its positions then no longer
    match row positions; the row number is only kept in the file name).

    Returns:
        The paths of the scene images that were written, in row order.
    """
    scene_dir = jobdir / 'images_from_bg'
    scene_dir.mkdir(parents=True, exist_ok=True)

    created: List[Path] = []
    for number, row in enumerate(rows, start=1):
        bg_name = row["bg"].strip()
        if not bg_name:
            warn(f"scene {number}: 背景が空（スキップ）")
            continue

        source = find_background(jobdir, bg_name)
        if not source:
            warn(f"scene {number}: 背景が見つからない -> {bg_name}（スキップ）")
            continue

        target = scene_dir / f"scene_{number:02d}.png"
        shutil.copyfile(source, target)
        created.append(target)

    msg(f"scene画像を作成: {len(created)} 枚 -> {scene_dir}")
    return created

# Run: expand the per-row backgrounds into images_from_bg/scene_NN.png.
SCENE_IMAGES = build_scene_images(JOBDIR, ROWS)


[INFO] scene画像を作成: 14 枚 -> /content/drive/MyDrive/zundashorts/ずんだ_ジョブ/job018/images_from_bg


In [None]:
# === セル7：（任意）VOICEVOXで音声自動生成 ===
import requests

def _find_speaker_id(url: str, name_contains: str) -> int:
    """Return a VOICEVOX style id for the speaker whose name matches.

    Queries ``{url}/speakers`` and returns the id of the first style of the
    first speaker whose name contains ``name_contains``. When no such speaker
    has any style, the first style of the first speaker that has one is used
    instead.

    Args:
        url: Base URL of the VOICEVOX engine.
        name_contains: Substring to look for in the speaker name.

    Raises:
        requests.HTTPError: if the engine answers with an error status.
        LookupError: if the engine reports no speaker with any style (the
            original code failed here with a bare IndexError/KeyError).
    """
    r = requests.get(f"{url}/speakers", timeout=10)
    r.raise_for_status()
    speakers = r.json()

    fallback_id = None
    for sp in speakers:
        styles = sp.get("styles") or []
        if not styles:
            continue  # a speaker without styles can never be selected
        if name_contains in sp.get("name", ""):
            return styles[0]["id"]
        if fallback_id is None:
            fallback_id = styles[0]["id"]

    if fallback_id is None:
        raise LookupError(f"no speaker style available from VOICEVOX at {url}")
    return fallback_id

def tts_with_voicevox(url: str, lines: List[str], out_dir: Path, speaker_name: str = "ずんだもん") -> List[Path]:
    """Synthesize one WAV per non-blank line with a VOICEVOX engine.

    For each line, ``/audio_query`` is called to build the query and
    ``/synthesis`` to render it. Output files are named ``NN_audio.wav`` where
    ``NN`` is the 1-based position of the line in ``lines``; blank lines are
    skipped but still consume their number.

    Returns:
        The paths of the WAV files that were written.

    Raises:
        requests.HTTPError: if any engine request answers with an error status.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    speaker_id = _find_speaker_id(url, speaker_name)

    written: List[Path] = []
    for number, raw in enumerate(lines, start=1):
        sentence = (raw or "").strip()
        if sentence:
            query_resp = requests.post(
                f"{url}/audio_query",
                params={"text": sentence, "speaker": speaker_id},
                timeout=30,
            )
            query_resp.raise_for_status()

            synth_resp = requests.post(
                f"{url}/synthesis",
                params={"speaker": speaker_id},
                json=query_resp.json(),
                timeout=60,
            )
            synth_resp.raise_for_status()

            target = out_dir / f"{number:02d}_audio.wav"
            target.write_bytes(synth_resp.content)
            written.append(target)
            time.sleep(0.05)  # short pause between requests

    msg(f"VOICEVOXで音声を生成: {len(written)} ファイル -> {out_dir}")
    return written

# Prefer existing audio (job/音声 first, then the job folder itself).
# Only when some files are missing AND VOICEVOX_URL is set is everything
# regenerated automatically.
# NOTE(review): files are collected positionally, so when a file in the middle
# (e.g. 05_audio.wav) is missing, AUDIO_FILES[i] no longer corresponds to
# ROWS[i] - confirm how downstream code pairs them.
AUDIO_FILES: List[Path] = []
for i in range(1, len(ROWS)+1):
    for cand in [JOBDIR/'音声'/f"{i:02d}_audio.wav", JOBDIR/f"{i:02d}_audio.wav"]:
        if cand.exists():
            AUDIO_FILES.append(cand); break

if len(AUDIO_FILES) < len(ROWS) and VOICEVOX_URL:
    lines = [r["text"] for r in ROWS]
    AUDIO_FILES = tts_with_voicevox(VOICEVOX_URL, lines, JOBDIR/'音声', SPEAKER_NAME)

print("音声数:", len(AUDIO_FILES))


音声数: 12


In [None]:
# === セル8：モーション＋立ち絵（黒帯回避＆連続ズーム）＋音声ギャップ対策 ===
# 前提：セル1〜7で以下が定義済み
#  - ROOT, JOBDIR, RESOLUTION, FPS
#  - ROWS: 台本CSVの各行 dict（"move": 背景の動き, "emotion": 感情）
#  - SCENE_IMAGES: 各行の背景画像Path配列（※不足してもOK：循環使用）
#  - AUDIO_FILES:  各行の音声ファイルPath配列（wav等）
#  - run_ffmpeg(cmd: list), ffprobe_duration(path), msg(*args), warn(*args)

from pathlib import Path
from typing import List, Dict

# ===== Tuning parameters =====
# Scale factor applied before cropping for pan moves, so that the image edge
# never becomes visible while panning.
PAN_OVERSCAN = 1.12

# Standing picture (default: small, bottom-right)
STAND_HEIGHT_RATIO = 0.45   # ratio of the frame height (e.g. 0.35-0.50)
STAND_ANCHOR = 'right'      # 'left' | 'center' | 'right'
STAND_MARGIN_X = 40
STAND_MARGIN_Y = 40

# Lip flap (switching period between the two mouth images)
MOUTH_PERIOD = 0.20         # seconds (= 5 Hz)

# Continuous zoom parameters (Ken Burns style)
ZOOM_IN_START   = 1.00
ZOOM_IN_END     = 1.12      # use 1.06-1.10 for a slower zoom
ZOOM_OUT_START  = 1.12
ZOOM_OUT_END    = 1.00
ZOOM_EASING     = "cos"     # "linear" or "cos" (cos = natural easing)
# ==========================

# Output width / height in pixels, parsed from "WIDTHxHEIGHT".
ow, oh = map(int, RESOLUTION.split('x'))

# Directories searched for standing pictures (job-specific first, then shared).
STAND_DIR_CANDIDATES = [
    JOBDIR / '立ち絵',
    JOBDIR / 'characters',
    ROOT  / 'ずんだ_素材' / '立ち絵',
    ROOT  / '立ち絵',
    ROOT  / 'characters',
    ROOT  / 'ずんだ_素材' / 'characters',
]

def resolve_stand_paths(emotion: str):
    """Resolve the (closed-mouth, open-mouth) PNG pair for an emotion.

    Every directory in STAND_DIR_CANDIDATES is checked, in order, for
    ``zunda_{emotion}_close.png`` and ``zunda_{emotion}_open.png``; both files
    must exist in the same directory. If no directory has the pair, the same
    search is repeated for the "normal" emotion.

    Returns:
        ``(close_path, open_path)``, or ``(None, None)`` when neither the
        requested emotion nor "normal" can be resolved.
    """
    for candidate_emotion in (emotion, "normal"):
        for folder in STAND_DIR_CANDIDATES:
            closed = folder / f"zunda_{candidate_emotion}_close.png"
            opened = folder / f"zunda_{candidate_emotion}_open.png"
            if closed.exists() and opened.exists():
                return closed, opened
    return None, None

def anchor_xy():
    """Return the overlay (x, y) position expressions for the standing picture.

    x depends on STAND_ANCHOR ('left', 'right', anything else = centered) and
    STAND_MARGIN_X; y always places the picture STAND_MARGIN_Y pixels above the
    bottom edge. The strings use W/H and w/h, which stand for the main and
    overlaid frame sizes inside ffmpeg's overlay filter.
    """
    x_by_anchor = {
        'left': str(STAND_MARGIN_X),
        'right': f"W-w-{STAND_MARGIN_X}",
    }
    horizontal = x_by_anchor.get(STAND_ANCHOR, "(W-w)/2")
    vertical = f"H-h-{STAND_MARGIN_Y}"
    return horizontal, vertical

def stand_overlay_block(stand_h: int, x_expr: str, y_expr: str) -> str:
    """Build the filter-graph part that overlays the lip-flapping character.

    Inputs 2 and 3 (closed / open mouth) are scaled to ``stand_h`` pixels high
    and overlaid on ``[bg]`` at the given position. Within every MOUTH_PERIOD
    the closed image is enabled for the first half and the open image for the
    second half. The final output label is ``[v]``.
    """
    half_period = MOUTH_PERIOD / 2.0
    position = f"x={x_expr}:y={y_expr}"
    stages = [
        f"[2:v]scale=-1:{stand_h}[sclose]",
        f"[3:v]scale=-1:{stand_h}[sopen]",
        f"[bg][sclose]overlay={position}:enable='lt(mod(t,{MOUTH_PERIOD}),{half_period})'[tmp]",
        f"[tmp][sopen]overlay={position}:enable='gte(mod(t,{MOUTH_PERIOD}),{half_period})'[v]",
    ]
    return ";".join(stages)

def build_bg_filter(move: str, dur: float, fps: int) -> str:
    """Build the background filter chain for one segment, ending in ``[bg]``.

    The background (input 0) is always scaled up and cropped so that it covers
    the whole ``ow`` x ``oh`` frame (no black bars) before the motion effect
    named by ``move`` is applied.

    Args:
        move: Motion name from the script CSV (e.g. "静止", "ズームイン",
            "右へパン"). Unknown names fall back to a still background.
        dur: Segment duration in seconds.
        fps: Output frame rate.

    Returns:
        A filter-graph fragment reading ``[0:v]`` and writing ``[bg]``.
    """
    frames = max(1, int(round(dur * fps)))
    # Pan expressions divide by the duration; never hand ffmpeg a zero divisor.
    pan_dur = dur if dur > 0 else 1.0 / max(fps, 1)

    # Always cover the frame first (avoids black bars).
    base_cover = (
        f"[0:v]scale={ow}:{oh}:force_original_aspect_ratio=increase,"
        f"crop={ow}:{oh},setsar=1"
    )

    def zoom_val_expr(frames: int, start: float, end: float, easing: str) -> str:
        """zoompan `z` expression going from start to end over `frames` frames."""
        if easing == "linear":
            t = f"(on/{frames})"
        else:
            # cos easing: smooth 0 -> 1
            t = f"(0.5-0.5*cos(PI*on/{frames}))"
        expr = f"({start}) + (({end})-({start}))*{t}"
        zmax = max(start, end)
        return f"clip({expr}, 1.0, {zmax})"

    def zoompan_chain(zexpr: str, x_expr: str) -> str:
        """Cover + zoompan chain with a vertically centered window."""
        # zoompan emits 25 fps unless told otherwise. `frames` is counted at
        # `fps`, so without the explicit fps option `on/frames` would stop
        # short of 1.0 and the zoom/pan would never reach its end value.
        return (
            f"{base_cover},"
            f"zoompan=z='{zexpr}':"
            f"x='{x_expr}':y='ih/2-(ih/zoom/2)':"
            f"d={frames}:s={ow}x{oh}:fps={fps},fps={fps},setsar=1[bg]"
        )

    centered_x = "iw/2-(iw/zoom/2)"

    if move == "静止":
        return f"{base_cover}[bg]"

    if move == "ズームイン":
        zexpr = zoom_val_expr(frames, ZOOM_IN_START, ZOOM_IN_END, ZOOM_EASING)
        return zoompan_chain(zexpr, centered_x)

    if move == "ズームアウト":
        zexpr = zoom_val_expr(frames, ZOOM_OUT_START, ZOOM_OUT_END, ZOOM_EASING)
        return zoompan_chain(zexpr, centered_x)

    # Pans: overscan first, then slide a frame-sized crop window across.
    pan_base = f"{base_cover},scale=iw*{PAN_OVERSCAN}:ih*{PAN_OVERSCAN},setsar=1"
    pan_windows = {
        "右へパン": (f"(iw-{ow})*t/{pan_dur}", f"(ih-{oh})/2"),
        "左へパン": (f"(iw-{ow})*(1 - t/{pan_dur})", f"(ih-{oh})/2"),
        "上へパン": (f"(iw-{ow})/2", f"(ih-{oh})*(1 - t/{pan_dur})"),
        "下へパン": (f"(iw-{ow})/2", f"(ih-{oh})*t/{pan_dur}"),
    }
    if move in pan_windows:
        x_expr, y_expr = pan_windows[move]
        return f"{pan_base},crop={ow}:{oh}:x='{x_expr}':y='{y_expr}'[bg]"

    if move == "ズームインしながら右へパン":
        zexpr = zoom_val_expr(frames, ZOOM_IN_START, ZOOM_IN_END, ZOOM_EASING)
        sliding_x = f"max(0,min(iw - iw/zoom, (iw - iw/zoom) * on / {frames}))"
        return zoompan_chain(zexpr, sliding_x)

    # Fallback for unknown motion names: still background.
    return f"{base_cover}[bg]"

def concat_segments_motion_with_stand(images: List[Path], audios: List[Path], out_path: Path, rows: List[Dict], fps: int = 30):
    """Render one video segment per audio file and concatenate them.

    Each audio file is paired with the script row (motion + emotion) and the
    background image it belongs to. The pairing uses the number embedded in
    the file names (``NN_audio`` for audio, ``scene_NN`` for images), so a
    missing audio file or a skipped scene no longer shifts every later pairing
    by one. Files that do not follow those naming patterns are paired by list
    position, and backgrounds are cycled when no image exists for a row.

    Segments are written to ``JOBDIR/_tmp_segments``; that folder is removed
    after the final file was written successfully.

    Args:
        images: Background images (may be fewer than the audio files).
        audios: Audio files; one segment is produced per entry.
        out_path: Path of the concatenated output video.
        rows: Script rows; "move" and "emotion" are read from each.
        fps: Output frame rate.

    Raises:
        subprocess.CalledProcessError: if an ffmpeg/ffprobe call fails.
    """
    import re
    import shutil

    # The audio list drives the number of segments.
    n = len(audios)
    if n == 0:
        warn("結合できるペアが0件（音声が不足）")
        return
    if len(images) == 0:
        warn("背景画像がありません。")
        return

    def _row_index(path, pattern):
        """0-based row index from the 1-based number in a file stem, or None."""
        m = re.fullmatch(pattern, Path(path).stem)
        if not m:
            return None
        idx = int(m.group(1)) - 1
        return idx if idx >= 0 else None

    # Map row index -> scene image for images that carry their row number.
    image_by_row = {}
    for candidate in images:
        idx = _row_index(candidate, r"scene_(\d+)")
        if idx is not None:
            image_by_row.setdefault(idx, candidate)

    tmp = JOBDIR / "_tmp_segments"
    tmp.mkdir(parents=True, exist_ok=True)
    segs = []
    x_expr, y_expr = anchor_xy()
    stand_h = int(oh * STAND_HEIGHT_RATIO)

    for pos, wav in enumerate(audios):
        row_idx = _row_index(wav, r"(\d+)_audio")
        if row_idx is None:
            row_idx = pos  # unnumbered file: pair by list position
        # Use the row's own scene image; otherwise cycle through the backgrounds.
        img = image_by_row.get(row_idx, images[row_idx % len(images)])
        row = rows[row_idx] if row_idx < len(rows) else {}
        move = (row.get("move") or "静止").strip()
        emo  = (row.get("emotion") or "normal").strip()
        dur = ffprobe_duration(wav)
        bgchain = build_bg_filter(move, dur, fps)

        close_png, open_png = resolve_stand_paths(emo)
        use_stand = bool(close_png and open_png)
        seg = tmp / f"{Path(wav).stem}.mp4"

        cmd = [
            "ffmpeg","-y",
            "-loop","1","-t", f"{dur:.3f}","-i", str(img),   # 0: background
            "-i", str(wav),                                 # 1: audio
        ]
        if use_stand:
            cmd += [
                "-i", str(close_png),                       # 2: character, mouth closed
                "-i", str(open_png),                        # 3: character, mouth open
            ]
            graph = bgchain + ";" + stand_overlay_block(stand_h, x_expr, y_expr)
            video_label = "[v]"
        else:
            # No character art available: output the background alone.
            graph = bgchain
            video_label = "[bg]"
        cmd += [
            "-filter_complex", graph,
            "-map", video_label, "-map","1:a:0",
            "-r", str(fps),
            "-c:v","libx264","-pix_fmt","yuv420p",
            "-c:a","aac","-b:a","192k",
            "-shortest", str(seg)
        ]
        run_ffmpeg(cmd)
        segs.append(seg)

    # Concatenate: copy the video, re-encode only the audio and regenerate PTS
    # to avoid audible gaps at segment boundaries.
    lst = tmp / "list.txt"
    # Inside the single-quoted `file` entries a literal quote must be written as '\''.
    entries = [p.as_posix().replace("'", "'\\''") for p in segs]
    lst.write_text("".join(f"file '{entry}'\n" for entry in entries), encoding="utf-8")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    run_ffmpeg([
        "ffmpeg","-y","-fflags","+genpts",
        "-f","concat","-safe","0","-i", str(lst),
        "-c:v","copy",
        "-af","aresample=async=1:min_hard_comp=0.100:first_pts=0",
        "-c:a","aac","-b:a","192k",
        str(out_path)
    ])

    # Clean up the temporary segments.
    shutil.rmtree(tmp, ignore_errors=True)
    msg("🎬 完成:", out_path)

# Run: render all segments and write the concatenated video to output/final.mp4.
OUT_MP4 = JOBDIR / 'output' / 'final.mp4'
concat_segments_motion_with_stand(SCENE_IMAGES, AUDIO_FILES, OUT_MP4, ROWS, fps=FPS)


+ ffmpeg -y -loop 1 -t 3.733 -i /content/drive/MyDrive/zundashorts/ずんだ_ジョブ/job018/images_from_bg/scene_01.png -i /content/drive/MyDrive/zundashorts/ずんだ_ジョブ/job018/音声/01_audio.wav -i /content/drive/MyDrive/zundashorts/ずんだ_素材/立ち絵/zunda_normal_close.png -i /content/drive/MyDrive/zundashorts/ずんだ_素材/立ち絵/zunda_normal_open.png -filter_complex [0:v]scale=1080:1920:force_original_aspect_ratio=increase,crop=1080:1920,setsar=1,zoompan=z='clip((1.0) + ((1.12)-(1.0))*(0.5-0.5*cos(PI*on/112)), 1.0, 1.12)':x='iw/2-(iw/zoom/2)':y='ih/2-(ih/zoom/2)':d=112:s=1080x1920,fps=30,setsar=1[bg];[2:v]scale=-1:864[sclose];[3:v]scale=-1:864[sopen];[bg][sclose]overlay=x=W-w-40:y=H-h-40:enable='lt(mod(t,0.2),0.1)'[tmp];[tmp][sopen]overlay=x=W-w-40:y=H-h-40:enable='gte(mod(t,0.2),0.1)'[v] -map [v] -map 1:a:0 -r 30 -c:v libx264 -pix_fmt yuv420p -c:a aac -b:a 192k -shortest /content/drive/MyDrive/zundashorts/ずんだ_ジョブ/job018/_tmp_segments/01_audio.mp4
+ ffmpeg -y -loop 1 -t 3.797 -i /content/drive/MyDrive/zundashorts/ずん

In [None]:
# === セル9：サマリー ===
# Print the key paths and counts produced by the previous cells.
print("\n=== SUMMARY ===")
print("台本.csv        :", CSV_PATH)
print("VOICEVOX用.txt  :", CSV_PATH.parent / "VOICEVOX用.txt")
print("必要ファイル…   :", CSV_PATH.parent / "必要ファイルリスト.txt")
print("scene画像数     :", len(SCENE_IMAGES))
print("音声数          :", len(AUDIO_FILES))
print("出力            :", OUT_MP4)



=== SUMMARY ===
台本.csv        : /content/drive/MyDrive/zundashorts/ずんだ_ジョブ/job018/台本/台本.csv
VOICEVOX用.txt  : /content/drive/MyDrive/zundashorts/ずんだ_ジョブ/job018/台本/VOICEVOX用.txt
必要ファイル…   : /content/drive/MyDrive/zundashorts/ずんだ_ジョブ/job018/台本/必要ファイルリスト.txt
scene画像数     : 14
音声数          : 12
出力            : /content/drive/MyDrive/zundashorts/ずんだ_ジョブ/job018/output/final.mp4


In [None]:
# === セル10：BGMミックス v10.2_ultimate（WAVに正規化→合成→映像へ多重化） ===
# ポイント：
# 1) 映像だけ抽出（video_only.mp4）
# 2) VO（ナレーション）を48kHzステレオPCMに正規化（vo.wav）
#    - 音声が無い動画でも anullsrc で無音生成
# 3) BGMを48kHzステレオPCMに正規化（bgm.wav）
#    - まずループ版を試行（-stream_loop）
#    - 失敗したら“無限ループなし”で whole_dur パディング→トリムに自動フォールバック
# 4) （任意）ダッキングしてミックス → mix.wav
# 5) 映像＋mix.wav を多重化して final_with_bgm.mp4（動画長で固定）

from pathlib import Path, PurePosixPath
import subprocess, json, shlex, os

# ------- Tuning parameters -------
BGM_VOL = 0.5             # value passed to ffmpeg's volume filter for the BGM
USE_DUCKING = True        # try the sidechaincompress mix before the plain mix
FADE_IN = 0.6             # BGM fade-in length in seconds (0 disables it)
FADE_OUT = 0.8            # BGM fade-out length in seconds (0 disables it)
BGM_NAME_HINT = None      # e.g. "main": file names containing it are preferred
A_SR = 48000              # sample rate used for vo.wav / bgm.wav / mix.wav
A_LAYOUT = "stereo"       # channel layout everything is normalized to
# Values passed to the sidechaincompress filter used for ducking.
DUCK_THRESHOLD = 0.05
DUCK_RATIO = 6
DUCK_ATTACK = 20
DUCK_RELEASE = 250
# To force a specific BGM file (optional), e.g. ROOT / "ずんだ_素材/BGM/main.mp3"
# BGM_FORCE_PATH = ROOT / "ずんだ_素材/立ち絵/main_bgm.mp3"
# -----------------------------

def try_run(cmd:list, check=True):
    """Run a command and report the outcome instead of raising.

    If a ``run_ffmpeg`` helper is defined at module level it is used (it echoes
    the command and does not capture output); otherwise the command is run via
    ``subprocess.run`` with output captured.

    Args:
        cmd: Program and arguments as a list.
        check: Passed to ``subprocess.run`` on the fallback path only;
            ``run_ffmpeg`` always checks the exit status.

    Returns:
        ``(returncode, error_text)``. ``(0, "")`` on success via
        ``run_ffmpeg``; on failure the exit status plus whatever stderr/stdout
        was captured (empty when nothing was captured).
    """
    runner = globals().get("run_ffmpeg")
    try:
        if callable(runner):
            # run_ffmpeg raises CalledProcessError on failure. It has to be
            # caught here too, otherwise callers never see rc != 0 and their
            # fallback branches are unreachable.
            runner(cmd)
            return 0, ""
        out = subprocess.run(cmd, check=check, capture_output=True, text=True)
        return out.returncode, out.stderr
    except subprocess.CalledProcessError as e:
        return e.returncode, e.stderr or e.stdout or ""

def ffprobe_v_duration(path:Path):
    """Return the duration of a video file in seconds, as reported by ffprobe.

    The duration of the first video stream (``v:0``) is read from ffprobe's
    JSON output. If that fails for any reason (ffprobe error, missing stream
    or field, non-numeric value), the container-level ``format=duration`` is
    used instead; an exception from that second query propagates.
    """
    try:
        stream_query = [
            "ffprobe", "-v", "error", "-select_streams", "v:0",
            "-show_entries", "stream=duration", "-of", "json", str(path),
        ]
        raw_json = subprocess.check_output(stream_query).decode("utf-8", "ignore")
        first_stream = json.loads(raw_json)["streams"][0]
        return float(first_stream["duration"])
    except Exception:
        # Fallback: duration of the whole container as plain text.
        format_query = [
            "ffprobe", "-v", "error",
            "-show_entries", "format=duration",
            "-of", "default=nokey=1:noprint_wrappers=1", str(path),
        ]
        raw_text = subprocess.check_output(format_query).decode("utf-8", "ignore")
        return float(raw_text.strip())

def has_audio_stream(path:Path)->bool:
    """Return True when ffprobe reports at least one audio stream in ``path``.

    ffprobe exits with status 0 and prints nothing when the selected stream
    (``a:0``) does not exist, so the exit status alone cannot tell the two
    cases apart; the output must be non-empty as well.

    Returns:
        False when ffprobe fails or lists no audio stream, True otherwise.
    """
    try:
        listed = subprocess.check_output([
            "ffprobe","-v","error","-select_streams","a:0",
            "-show_entries","stream=index","-of","csv=p=0", str(path)
        ])
    except subprocess.CalledProcessError:
        return False
    return bool(listed.strip())

# File suffixes (lower case) treated as audio.
AUDIO_EXTS = {".mp3",".wav",".m4a",".ogg",".flac"}
def list_audio(dirpath:Path):
    """Return the sorted entries of ``dirpath`` whose suffix is an audio suffix.

    The suffix comparison ignores case. An empty list is returned when
    ``dirpath`` is falsy (e.g. None) or does not exist.
    """
    if dirpath and dirpath.exists():
        return sorted(
            entry for entry in dirpath.iterdir()
            if entry.suffix.lower() in AUDIO_EXTS
        )
    return []

def choose_bgm(cands, hint=None):
    """Pick the most BGM-looking file from ``cands`` by file-name score.

    Scoring on the lower-cased stem: +100 if it contains ``hint`` (when a hint
    is given), +50/+40/+30/+20 for containing "main"/"bgm"/"loop"/"music", and
    a small bonus for stems shorter than 10 characters. Ties are broken by
    file name in ascending order.

    Returns:
        The best-scoring path, or None when ``cands`` is empty or None.
    """
    if not cands:
        return None

    keyword_points = {"main":50,"bgm":40,"loop":30,"music":20}
    wanted = hint.lower() if hint else None

    def score_of(path):
        stem = path.stem.lower()
        total = 100 if (wanted and wanted in stem) else 0
        total += sum(points for word, points in keyword_points.items() if word in stem)
        total += max(0, 10 - len(stem))  # shorter names rank slightly higher
        return total

    # Highest score first; equal scores fall back to name order.
    return min(cands, key=lambda path: (-score_of(path), path.name))

def pick_bgm(jobdir:Path, root:Path, hint=None):
    """Collect audio files from the job and shared folders and pick one as BGM.

    Folders are scanned in this order: job ``BGM``/``bgm``, the job folder,
    shared ``ずんだ_素材/BGM``, shared ``ずんだ_素材/立ち絵``, ``root/BGM`` and
    ``root``. All audio files found are handed to ``choose_bgm``.

    Returns:
        The chosen path, or None when no audio file was found.
    """
    shared = root / "ずんだ_素材"
    search_dirs = (
        jobdir / "BGM", jobdir / "bgm", jobdir,
        shared / "BGM",
        shared / "立ち絵",
        root / "BGM", root,
    )
    found = [audio for folder in search_dirs for audio in list_audio(folder)]
    return choose_bgm(found, hint=hint)

# --- Preparation ---
# Every step below records whether it succeeded in THIS run. Later steps check
# those flags in addition to file existence, so that leftovers from a previous
# run (vo.wav / bgm.wav / mix.wav / video_only.mp4) are never mistaken for
# fresh results. The ffmpeg error text is kept in `ff_err` (not `err`) so the
# err() logging helper defined earlier in the notebook is not overwritten.
out_dir = JOBDIR / "output"
out_dir.mkdir(parents=True, exist_ok=True)
total_v_dur = ffprobe_v_duration(OUT_MP4)

video_only = out_dir / "video_only.mp4"
vo_wav     = out_dir / "vo.wav"
bgm_wav    = out_dir / "bgm.wav"
mix_wav    = out_dir / "mix.wav"
final_bgm  = out_dir / "final_with_bgm.mp4"

# 1) Extract the video stream only (stream copy)
rc, ff_err = try_run([
    "ffmpeg","-y","-hide_banner","-loglevel","error",
    "-i", str(OUT_MP4),
    "-an","-c:v","copy", str(video_only)
])
video_ok = (rc == 0)
if not video_ok:
    warn("映像の抽出に失敗：", ff_err)

# 2) Normalize the voice-over to WAV (generate silence when there is no audio)
if has_audio_stream(OUT_MP4):
    rc, ff_err = try_run([
        "ffmpeg","-y","-hide_banner","-loglevel","error",
        "-i", str(OUT_MP4),
        "-vn","-ac", "2", "-ar", str(A_SR),
        "-c:a","pcm_s16le",
        "-t", f"{total_v_dur:.3f}",
        str(vo_wav)
    ])
else:
    rc, ff_err = try_run([
        "ffmpeg","-y","-hide_banner","-loglevel","error",
        "-f","lavfi","-t", f"{total_v_dur:.3f}",
        "-i", f"anullsrc=channel_layouts={A_LAYOUT}:sample_rate={A_SR}",
        "-ac","2","-ar", str(A_SR), "-c:a","pcm_s16le",
        str(vo_wav)
    ])
vo_ok = (rc == 0)
if not vo_ok:
    warn("VO正規化に失敗：", ff_err)

# 3) Choose the BGM -> convert to WAV (looped version first, padded version on failure)
if "BGM_FORCE_PATH" in globals() and BGM_FORCE_PATH and Path(BGM_FORCE_PATH).exists():
    bgm_src = Path(BGM_FORCE_PATH)
else:
    bgm_src = pick_bgm(JOBDIR, ROOT, hint=BGM_NAME_HINT)

bgm_ok = False
if not bgm_src:
    warn("BGMが見つかりません。job/BGM/ か 共有の ずんだ_素材/BGM・立ち絵 に音源を置いてね。")
else:
    # Fade filters shared by both variants (each is skipped when its length is 0).
    bgm_fades = (
        (f",afade=t=in:st=0:d={FADE_IN}" if FADE_IN>0 else "")
        + (f",afade=t=out:st={max(0.0,total_v_dur-FADE_OUT):.3f}:d={FADE_OUT}" if FADE_OUT>0 else "")
    )

    # 3a) Looped version (-stream_loop): loop + volume + fades -> 48 kHz stereo PCM
    rc, ff_err = try_run([
        "ffmpeg","-y","-hide_banner","-loglevel","error",
        "-stream_loop","-1","-i", str(bgm_src),
        "-t", f"{total_v_dur:.3f}",
        "-af", f"volume={BGM_VOL}" + bgm_fades,
        "-ac","2","-ar", str(A_SR), "-c:a","pcm_s16le",
        str(bgm_wav)
    ], check=True)
    bgm_ok = (rc == 0)

    if not bgm_ok:
        # 3b) Padded version (no infinite loop: pad the remainder with silence, then trim)
        rc2, err2 = try_run([
            "ffmpeg","-y","-hide_banner","-loglevel","error",
            "-i", str(bgm_src),
            "-af", f"aformat=sample_fmts=fltp:sample_rates={A_SR}:channel_layouts={A_LAYOUT},"
                   f"apad=whole_dur={total_v_dur:.3f},"
                   f"atrim=0:{total_v_dur:.3f},"
                   f"volume={BGM_VOL}"
                   + bgm_fades,
            "-ac","2","-ar", str(A_SR), "-c:a","pcm_s16le",
            str(bgm_wav)
        ], check=True)
        bgm_ok = (rc2 == 0)
        if not bgm_ok:
            warn("BGM正規化に失敗：", err2)

# 4) Duck and mix -> mix.wav
mix_ok = False
if vo_ok and bgm_ok and vo_wav.exists() and bgm_wav.exists():
    if USE_DUCKING:
        # Compress the BGM with the voice-over as side chain, then amix with the voice-over.
        rc, ff_err = try_run([
            "ffmpeg","-y","-hide_banner","-loglevel","error",
            "-i", str(vo_wav), "-i", str(bgm_wav),
            "-filter_complex",
            f"[1:a][0:a]sidechaincompress="
            f"threshold={DUCK_THRESHOLD}:ratio={DUCK_RATIO}:attack={DUCK_ATTACK}:release={DUCK_RELEASE}[duck];"
            f"[0:a][duck]amix=inputs=2:duration=longest:dropout_transition=0,"
            f"aresample=async=1:first_pts=0",
            "-ac","2","-ar", str(A_SR), "-c:a","pcm_s16le",
            str(mix_wav)
        ])
        mix_ok = (rc == 0)
        if not mix_ok:
            warn("ダッキング合成に失敗。ダッキング無しで再試行するのだ。", ff_err)
    # Plain mix when ducking is disabled, failed, or left no usable file.
    if (not mix_ok) or (not mix_wav.exists()) or mix_wav.stat().st_size == 0:
        rc, ff_err = try_run([
            "ffmpeg","-y","-hide_banner","-loglevel","error",
            "-i", str(vo_wav), "-i", str(bgm_wav),
            "-filter_complex",
            "[0:a][1:a]amix=inputs=2:duration=longest:dropout_transition=0,aresample=async=1:first_pts=0",
            "-ac","2","-ar", str(A_SR), "-c:a","pcm_s16le",
            str(mix_wav)
        ])
        mix_ok = (rc == 0)
        if not mix_ok:
            warn("シンプル合成にも失敗：", ff_err)

# 5) Mux video + mixed audio (length fixed to the video duration)
video_ready = video_ok and video_only.exists()
mix_ready = mix_ok and mix_wav.exists()
if video_ready and mix_ready:
    rc, ff_err = try_run([
        "ffmpeg","-y","-hide_banner","-loglevel","error",
        "-i", str(video_only), "-i", str(mix_wav),
        "-map","0:v:0","-map","1:a:0",
        "-c:v","copy", "-c:a","aac","-b:a","192k",
        "-t", f"{total_v_dur:.3f}",
        "-movflags","+faststart",
        str(final_bgm)
    ])
    if rc == 0:
        msg("🎵 BGM合成完了（ultimate）：", final_bgm)
    else:
        warn("最終多重化に失敗：", ff_err)
else:
    warn("前段の生成物が不足：", video_ready, mix_ready)


+ ffmpeg -y -hide_banner -loglevel error -i /content/drive/MyDrive/zundashorts/ずんだ_ジョブ/job018/output/final.mp4 -an -c:v copy /content/drive/MyDrive/zundashorts/ずんだ_ジョブ/job018/output/video_only.mp4
+ ffmpeg -y -hide_banner -loglevel error -i /content/drive/MyDrive/zundashorts/ずんだ_ジョブ/job018/output/final.mp4 -vn -ac 2 -ar 48000 -c:a pcm_s16le -t 57.430 /content/drive/MyDrive/zundashorts/ずんだ_ジョブ/job018/output/vo.wav
+ ffmpeg -y -hide_banner -loglevel error -stream_loop -1 -i /content/drive/MyDrive/zundashorts/ずんだ_素材/BGM/BGM.mp3 -t 57.430 -af volume=0.5,afade=t=in:st=0:d=0.6,afade=t=out:st=56.630:d=0.8 -ac 2 -ar 48000 -c:a pcm_s16le /content/drive/MyDrive/zundashorts/ずんだ_ジョブ/job018/output/bgm.wav
+ ffmpeg -y -hide_banner -loglevel error -i /content/drive/MyDrive/zundashorts/ずんだ_ジョブ/job018/output/vo.wav -i /content/drive/MyDrive/zundashorts/ずんだ_ジョブ/job018/output/bgm.wav -filter_complex [1:a][0:a]sidechaincompress=threshold=0.05:ratio=6:attack=20:release=250[duck];[0:a][duck]amix=inputs=2:du