# 使用 gradio_client 的 Index-TTS2 语音互译方案

本 Notebook 演示如何在 Google Colab 中通过 `gradio_client` 直接调用 Index-TTS2 接口，实现从视频或音频到目标语音的端到端翻译与配音流程，并自动截取不超过 10 秒的原声作为音色参考。

## 步骤概览

1. 安装语音识别与 TTS 所需依赖。
2. 配置 Index-TTS2 服务地址以及可选参数。
3. 上传源视频或音频并提取人声轨道。
4. 通过 Faster-Whisper 自动识别原文并翻译为英文。
5. 从原音频自动裁切 ≤10 秒参考片段。
6. 读取 Index-TTS2 的 Gradio Schema，自动组装调用参数。
7. 调用接口并下载合成结果。

In [None]:
# Install the pinned Gradio client, audio and speech-recognition dependencies;
# the "+cu121" PyTorch wheels are resolved through the extra index URL.
# NOTE(review): later cells import `handle_file` from gradio_client and read
# `parameter_has_default` from the API schema - confirm that the pinned
# gradio_client release actually provides both.
!pip -q install gradio_client==0.8.1 librosa==0.10.2.post1 soundfile==0.12.1 faster-whisper==1.0.0 torch==2.4.1+cu121 torchaudio==2.4.1+cu121 --extra-index-url https://download.pytorch.org/whl/cu121


## 配置 Index-TTS2 与识别参数

在下方单元格中填写 Index-TTS2 服务地址、API 名称，以及需要覆盖的其他参数。`CUSTOM_PAYLOAD` 可用于向接口传入额外字段（如情绪控制模式等）。

In [None]:
# --- Index-TTS2 service -----------------------------------------------------
# Base URL handed to the Gradio client; replace the placeholder host.
INDEX_TTS2_ENDPOINT = "https://<your-indextts2-host>"
# Name of the Gradio endpoint that is looked up in the schema and invoked.
INDEX_TTS2_API_NAME = "/gen_single"
# Values that take precedence over schema-derived inputs, keyed by parameter name.
CUSTOM_PAYLOAD = dict(emo_control_method="Same as the voice reference")

# --- Speech recognition -----------------------------------------------------
# Model identifier passed to Faster-Whisper.
WHISPER_MODEL = "large-v3"
# "auto" (case-insensitive) lets Whisper detect the language; any other value
# is forwarded as the language code.
SOURCE_LANGUAGE = "auto"
# When true, a second Whisper pass with task="translate" supplies the TTS text.
USE_TRANSLATION = True
# When true, CUDA is used if torch reports it as available.
USE_CUDA = True


## 上传源媒体

支持常见的视频与音频格式，文件会被保存到会话目录中以便后续处理。

In [None]:
from google.colab import files
from pathlib import Path

# Session directory that holds the upload and every derived artifact.
work_dir = Path("indextts2_work")
work_dir.mkdir(exist_ok=True)

uploads = files.upload()
if not uploads:
    # Without this guard an empty upload result (presumably what a cancelled
    # dialog returns - confirm) surfaces as a bare StopIteration below.
    raise RuntimeError("未上传任何文件")

# Only the first uploaded file is processed; any others are ignored.
input_name, input_bytes = next(iter(uploads.items()))
# Keep just the file-name component so the copy always lands inside work_dir.
input_path = work_dir / Path(input_name).name
input_path.write_bytes(input_bytes)
print(input_path)


## 提取标准化音频

将源文件转为单声道 16 kHz WAV，以便语音识别与后续参考裁剪。

In [None]:
import shutil
import subprocess

# Extensions treated as audio-only containers; anything else is assumed to be
# a video and gets its video stream dropped with "-vn".
AUDIO_SUFFIXES = {".wav", ".flac", ".mp3", ".m4a", ".aac", ".ogg", ".opus", ".wma", ".webm"}
audio_path = work_dir / "source.wav"

ffmpeg_cmd = ["ffmpeg", "-y", "-i", str(input_path)]
if input_path.suffix.lower() not in AUDIO_SUFFIXES:
    ffmpeg_cmd.append("-vn")

# WAV uploads go through ffmpeg as well, so the result is always mono 16 kHz
# rather than a verbatim copy of whatever the user supplied.
# If the upload itself is named "source.wav", input and output are the same
# file: convert into a sibling file first, then move it into place.
same_file = input_path.resolve() == audio_path.resolve()
ffmpeg_target = work_dir / "source.normalized.wav" if same_file else audio_path
ffmpeg_cmd += ["-ac", "1", "-ar", "16000", str(ffmpeg_target)]
subprocess.run(ffmpeg_cmd, check=True)
if same_file:
    # Overwrites the uploaded file with its normalized version.
    ffmpeg_target.replace(audio_path)
print(audio_path)


## 语音识别与翻译

使用 Faster-Whisper 获取原文，并在需要时生成英文译文作为合成文本。

In [None]:
import torch
from faster_whisper import WhisperModel

def _join_segment_text(segment_iter):
    """Join the stripped text of every segment with single spaces."""
    return " ".join(piece.text.strip() for piece in segment_iter).strip()


# Use the GPU only when it is both requested and actually available.
use_cuda = USE_CUDA and torch.cuda.is_available()
if use_cuda:
    device, compute_type = "cuda", "float16"
else:
    device, compute_type = "cpu", "int8"
model = WhisperModel(WHISPER_MODEL, device=device, compute_type=compute_type)

# "auto" means no language is forced on the model.
language_arg = SOURCE_LANGUAGE if SOURCE_LANGUAGE.lower() != "auto" else None

# First pass: transcript in the source language.
segments, info = model.transcribe(
    str(audio_path), task="transcribe", beam_size=5, language=language_arg
)
source_text = _join_segment_text(segments)

# Optional second pass over the same audio with task="translate".
translation_text = ""
if USE_TRANSLATION:
    translated_segments, _ = model.transcribe(
        str(audio_path), task="translate", beam_size=5, language=language_arg
    )
    translation_text = _join_segment_text(translated_segments)

# A non-empty translation wins; otherwise the transcript is synthesized.
text_for_tts = translation_text or source_text
print("原文:", source_text)
print("合成文本:", text_for_tts)


## 裁剪音色参考

自动截取前 10 秒内的单声道片段，并保存为 Index-TTS2 可直接上传的 WAV 文件。

In [None]:
import librosa
import soundfile as sf

# Load the normalized audio as mono; sr=None asks librosa not to resample.
ref_audio, ref_sr = librosa.load(str(audio_path), mono=True, sr=None)

# Keep at most 10 seconds' worth of samples, or the whole clip if shorter.
max_samples = min(int(ref_sr * 10), len(ref_audio))
if not max_samples:
    raise RuntimeError("参考音频过短，无法裁剪")

ref_path = work_dir / "reference.wav"
ref_clip = ref_audio[:max_samples]
sf.write(ref_path, ref_clip, ref_sr)
# Report where the clip was written and its duration in seconds.
print(ref_path, max_samples / ref_sr)


## 读取 Index-TTS2 接口结构

自动解析目标 Endpoint 的参数顺序，确保传入文本与参考音频字段齐全。

In [None]:
from gradio_client import Client

# `handle_file` is not exported by every gradio_client release, so resolve it
# defensively instead of failing the whole cell with an ImportError.
try:
    from gradio_client import handle_file
except ImportError:
    try:
        from gradio_client import file as handle_file
    except ImportError:
        def handle_file(path):
            """Fallback file wrapper: hand the client a plain path string.

            NOTE(review): presumably older clients accept a bare path for file
            inputs - confirm against the pinned gradio_client release.
            """
            return str(path)

client = Client(INDEX_TTS2_ENDPOINT)
api_schema = client.view_api(return_format="dict")
named_endpoints = api_schema.get("named_endpoints", {})

endpoint_schema = named_endpoints.get(INDEX_TTS2_API_NAME)
if endpoint_schema is None and named_endpoints:
    # Fall back to the first named endpoint, and retarget the configured name
    # so the later predict call hits the endpoint whose schema was read.
    fallback_name, endpoint_schema = next(iter(named_endpoints.items()))
    print(f"未找到接口 {INDEX_TTS2_API_NAME}，改用 {fallback_name}")
    INDEX_TTS2_API_NAME = fallback_name
if endpoint_schema is None:
    # NOTE(review): the later call still passes api_name, so a schema taken
    # from an unnamed endpoint may not match the endpoint actually invoked.
    unnamed_values = list(api_schema.get("unnamed_endpoints", {}).values())
    if unnamed_values:
        endpoint_schema = unnamed_values[0]
if endpoint_schema is None:
    raise RuntimeError("无法解析 Index-TTS2 接口结构")

# Ordered parameter descriptions used to assemble the positional arguments.
parameters = endpoint_schema.get("parameters", [])
print(len(parameters))


## 调用 Index-TTS2 并获取结果

根据 Schema 自动组装请求参数，调用后将结果下载到本地目录并直接在 Colab 中试听。

In [None]:
from gradio_client import utils as gr_utils
from IPython.display import Audio, display

# Sub-directory of the session folder that receives the synthesized files.
output_dir = work_dir.joinpath("tts_output")
output_dir.mkdir(exist_ok=True)

def assemble_inputs(param_list, text_value, reference_file, overrides):
    """Build the positional argument list for an endpoint from its schema.

    Each parameter is resolved by the first rule that applies:

    1. an explicit override, matched on the parameter name or on its label;
    2. text components ("Textbox", "TextArea") receive ``text_value``;
    3. file components ("Audio", "File", "UploadButton") receive
       ``reference_file`` wrapped with ``handle_file``;
    4. the default declared by the schema, when it declares one.

    Args:
        param_list: Parameter description dicts in call order.
        text_value: Text placed in every text component.
        reference_file: Path of the file sent to every file component.
        overrides: Mapping of parameter name or label to the value to send.

    Returns:
        A list of values in the same order as ``param_list``.

    Raises:
        RuntimeError: If a parameter matches none of the rules above.
    """
    values = []
    for param in param_list:
        label = param.get("label")
        name = param.get("parameter_name") or label
        component = param.get("component")
        if name in overrides:
            values.append(overrides[name])
        elif label is not None and label in overrides:
            # Also honour overrides keyed by the label when the schema
            # additionally provides a parameter name.
            values.append(overrides[label])
        elif component in {"Textbox", "TextArea"}:
            values.append(text_value)
        elif component in {"Audio", "File", "UploadButton"}:
            values.append(handle_file(reference_file))
        elif param.get("parameter_has_default", False):
            values.append(param.get("parameter_default"))
        else:
            raise RuntimeError(f"缺少必要字段: {name}")
    return values

def harvest_files(data, target_dir):
    """Gather every file referenced by a response into ``target_dir``.

    The response is walked recursively through dicts, lists, tuples and sets.
    Strings starting with "http" are downloaded via the Gradio client helper;
    strings naming an existing local file are copied into ``target_dir``.
    Each distinct reference is handled once, so a URL stored in a dict is no
    longer fetched and reported twice.

    Args:
        data: Arbitrary, possibly nested, response object.
        target_dir: Directory that receives the files (str or path-like).

    Returns:
        A list of saved file paths in discovery order.
    """
    import os
    import shutil

    target = str(target_dir)
    collected = []
    seen = set()  # references already processed, to avoid duplicates

    def save(ref):
        if ref in seen:
            return
        seen.add(ref)
        if ref.startswith("http"):
            collected.append(gr_utils.download_tmp_copy_of_file(ref, dir=target))
            return
        if not os.path.isfile(ref):
            return
        os.makedirs(target, exist_ok=True)
        stem, ext = os.path.splitext(os.path.basename(ref))
        dest = os.path.join(target, stem + ext)
        counter = 1
        # Two different sources may share a base name within one response.
        while dest in collected:
            dest = os.path.join(target, f"{stem}_{counter}{ext}")
            counter += 1
        if os.path.abspath(ref) != os.path.abspath(dest):
            shutil.copyfile(ref, dest)
        collected.append(dest)

    def visit(node):
        if isinstance(node, dict):
            # A dict with an http "url"/"path" describes one file: fetch it
            # once and do not treat its sibling location field as another.
            url = None
            for key in ("url", "path"):
                value = node.get(key)
                if isinstance(value, str) and value.startswith("http"):
                    url = value
                    break
            if url is not None:
                save(url)
            for key, value in node.items():
                if url is not None and key in ("url", "path"):
                    continue
                visit(value)
        elif isinstance(node, (list, tuple, set)):
            for value in node:
                visit(value)
        elif isinstance(node, str):
            save(node)

    visit(data)
    return collected

# Turn the parsed schema into positional arguments, applying the overrides.
request_values = assemble_inputs(
    param_list=parameters,
    text_value=text_for_tts,
    reference_file=ref_path,
    overrides=CUSTOM_PAYLOAD,
)

# Invoke the configured endpoint and show the raw response for inspection.
raw_response = client.predict(*request_values, api_name=INDEX_TTS2_API_NAME)
print(raw_response)

# Save every file referenced by the response, then play each one inline.
output_files = harvest_files(data=raw_response, target_dir=output_dir)
print(output_files)
for item in output_files:
    display(Audio(item))
