In [4]:
import requests
import numpy as np
import soundfile as sf

# === 1. 加载音频文件，转 float32 PCM ===
wav_path = "/data/kimi_deployment/tests/test_audios/asr_example_cantonese.wav"
audio_data, sample_rate = sf.read(wav_path, dtype="float32")

# 转为二进制流（float32）
pcm_bytes = audio_data.astype(np.float32).tobytes()

# === 2. 构造请求头 + 发送 POST 请求 ===
headers = {
    "name": "self_record27.wav",
    #'language':  'zh',
    'use_custom_language_classifier': 'true'
}

response = requests.post("http://127.0.0.1:8000/transcribe_websocket", headers=headers, data=pcm_bytes)

# === 3. 打印结果 ===
print(response.status_code)
print(response.text)

200
{"result":[{"status":0,"text":"我安安陪人supermarket会买野。","language":"Auto detection","prompt":"请转写以下音频","start":0,"end":5.6285625}],"info":null,"status":0}


In [5]:
import websocket
import json
import soundfile as sf
import numpy as np
import time

# 1. 连接
ws = websocket.create_connection("ws://127.0.0.1:9091")

# 2. 发送初始化参数
options = {
  "uid": "test_user",
  "token": "xxx",
  "name": "test.wav",
  "initial_prompt": "",
  "version": "1.0",
  "model": "faster_whisper",
  "user_id": "abc123",
  "type_name": "developer"
}
ws.send(json.dumps(options))



# 改成 int16 格式 PCM
audio_data, sample_rate = sf.read("/data/kimi_deployment/tests/test_audios/asr_example_cantonese.wav")
pcm_bytes = (audio_data * 32768.0).astype(np.int16).tobytes()
# 4. 发送音频帧
ws.send(pcm_bytes, opcode=websocket.ABNF.OPCODE_BINARY)
 # 模拟实时流式传输，可以根据需要调整


ws.send(b"END_OF_AUDIO")

# 6. 获取返回
while True:
    try:
        msg = ws.recv()
        time.sleep(1)
        print("收到返回：", msg)
    except:
        break

ws.close()

收到返回： {"uid": "test_user", "message": "SERVER_READY", "code": 0, "status": "SERVER_READY", "backend": "vvVoiceServer"}
收到返回： {"uid": "test_user", "code": 0, "status": "RESULT", "segments": [{"start": "0.000", "end": "5.629", "text": "\u6211\u5b89\u5b89\u966a\u4ebasupermarket\u4f1a\u4e70\u91ce\u3002"}], "is_end": true}
收到返回： {"uid": "test_user", "code": 0, "status": "RESULT", "segments": [{"start": "0.000", "end": "5.629", "text": "\u6211\u5b89\u5b89\u966a\u4ebasupermarket\u4f1a\u4e70\u91ce\u3002"}], "is_end": true}
收到返回： {"uid": "test_user", "code": 0, "status": "RESULT", "segments": [{"start": "0.000", "end": "5.629", "text": "\u6211\u5b89\u5b89\u966a\u4ebasupermarket\u4f1a\u4e70\u91ce\u3002"}], "is_end": true}
收到返回： {"uid": "test_user", "code": 0, "status": "RESULT", "segments": [{"start": "0.000", "end": "5.629", "text": "\u6211\u5b89\u5b89\u966a\u4ebasupermarket\u4f1a\u4e70\u91ce\u3002"}], "is_end": true}
收到返回： {"uid": "test_user", "code": 0, "status": "RESULT", "segments": [{"star

In [6]:
import soundfile as sf
import websocket
import time
import numpy as np

def send_audio_ws(audio_path, ws_url, frame_duration=0.4):
    # 1. 读取音频（float32），并检查采样率
    audio_data, sample_rate = sf.read(audio_path)
    assert sample_rate == 16000, "音频采样率必须为 16kHz"

    # 2. 转换为 int16 PCM 格式
    int16_audio = (audio_data * 32768.0).astype(np.int16)
    pcm_bytes = int16_audio.tobytes()

    # 3. 计算帧大小（按时间分帧，如每 0.5 秒）
    frame_size = int(sample_rate * frame_duration)  # 0.5秒 * 16000 = 8000 samples
    num_frames = len(int16_audio) // frame_size

    # 4. 连接 WebSocket
    ws = websocket.create_connection(ws_url)

    # 5. 发送初始化配置（你实际项目可能需要先发 JSON 配置）
    config = {
        "uid": "user001",
        "model": "small.en",
        "task": "transcribe",
        "token": "mock_token_123",
        "version": "v1",
        "use_vad": True,
        "name": "demo.wav"
    }
    ws.send(json.dumps(config))

    time.sleep(0.1)  # 等待服务端准备好

    # 6. 分帧发送 PCM 数据
    for i in range(num_frames + 1):
        start = i * frame_size
        end = start + frame_size
        frame = int16_audio[start:end]
        if len(frame) == 0:
            continue
        ws.send(frame.tobytes(), opcode=websocket.ABNF.OPCODE_BINARY)
        time.sleep(frame_duration)  # 可调：模拟实时发送

    

    # 7. 通知服务端音频结束
    ws.send(b"END_OF_AUDIO")

    # 8. 等待识别结果
    while True:
        try:
            result = ws.recv()
            print("[结果] ", result)
            if '"is_end": true' in result:
                print("✅ 服务端识别结束")
                break
        except Exception as e:
            print(f"[错误] {e}")
            break

    ws.close()

In [7]:
send_audio_ws(
    "/data/kimi_deployment/tests/test_audios/asr_example_cantonese.wav",
    "ws://localhost:9091"
)

[结果]  {"uid": "user001", "message": "SERVER_READY", "code": 0, "status": "SERVER_READY", "backend": "vvVoiceServer"}
[结果]  {"uid": "user001", "code": 0, "status": "RESULT", "segments": [{"start": "0.000", "end": "0.800", "text": "\u6211\u60f3\u8981\u4e70samsung\u7684\u88e4\u5b50\u3002"}], "is_end": false}
[结果]  {"uid": "user001", "code": 0, "status": "RESULT", "segments": [{"start": "0.000", "end": "1.200", "text": "\u6211\u4eec"}], "is_end": false}
[结果]  {"uid": "user001", "code": 0, "status": "RESULT", "segments": [{"start": "0.000", "end": "1.200", "text": "\u6211\u4eec"}], "is_end": false}
[结果]  {"uid": "user001", "code": 0, "status": "RESULT", "segments": [{"start": "0.000", "end": "1.200", "text": "\u6211\u4eec"}], "is_end": false}
[结果]  {"uid": "user001", "code": 0, "status": "RESULT", "segments": [{"start": "0.000", "end": "1.600", "text": "\u7f57\u6069\u6069"}], "is_end": false}
[结果]  {"uid": "user001", "code": 0, "status": "RESULT", "segments": [{"start": "0.000", "end": "1.6

In [None]:
import json
import time
import random
import threading
import numpy as np
import soundfile as sf
import websocket

WS_URL = "ws://127.0.0.1:9091"

# ========= 可调参数 =========
FRAME_MS = 40            # 每帧 40ms（常见 20~50ms）
JITTER_MS = 10           # 发送抖动 ±10ms
INSERT_SILENCE_MS = 600  # 段与段之间插入静音的时长（毫秒）
REALTIME = True          # True = 按时延真实发送；False = 不等待（更快）
LANGUAGE = "zh"          # 或 "en" 等；不需要可设为 None
INITIAL_PROMPT = "请转写以下音频"  # 不需要可设为空字符串
UID = "user_multi_001"

# 多段音频（顺序播放）
SEGMENTS = [
    "/path/to/seg1.wav",
    "/path/to/seg2.wav",
    "/path/to/seg3.wav",
]

def _resample_linear(x, src_sr, dst_sr):
    if src_sr == dst_sr:
        return x
    n = int(round(len(x) * dst_sr / src_sr))
    return np.interp(np.linspace(0, len(x), n, endpoint=False), np.arange(len(x)), x).astype(np.float32)

def _load_pcm16(path, target_sr=16000, mono=True):
    audio, sr = sf.read(path, always_2d=False)
    if audio.ndim == 2 and mono:
        audio = audio.mean(axis=1)
    audio = _resample_linear(audio.astype(np.float32), sr, target_sr)
    # 归一 & 转 PCM16
    pcm16 = (np.clip(audio, -1.0, 1.0) * 32767.0).astype(np.int16)
    return pcm16, target_sr

def _send_stream(ws, pcm16, sr, frame_ms=FRAME_MS, jitter_ms=JITTER_MS, realtime=REALTIME):
    bytes_per_sample = 2
    samples_per_ms = sr // 1000
    frame_n = frame_ms * samples_per_ms

    idx = 0
    while idx < len(pcm16):
        end = min(len(pcm16), idx + frame_n)
        frame = pcm16[idx:end]
        if len(frame) == 0:
            break
        ws.send(frame.tobytes(), opcode=websocket.ABNF.OPCODE_BINARY)
        idx = end

        if realtime:
            # 模拟抖动
            sleep_ms = frame_ms + random.randint(-jitter_ms, jitter_ms)
            time.sleep(max(0.0, sleep_ms / 1000.0))

def _send_silence(ws, ms, sr=16000, frame_ms=FRAME_MS, jitter_ms=JITTER_MS, realtime=REALTIME):
    samples = (sr * ms) // 1000
    if samples <= 0:
        return
    silence = np.zeros(samples, dtype=np.int16)
    _send_stream(ws, silence, sr, frame_ms, jitter_ms, realtime)

def _recv_loop(ws, stop_event):
    ws.settimeout(30)
    last = None
    while not stop_event.is_set():
        try:
            msg = ws.recv()
            if not msg:
                break
            # 去重打印
            if msg != last:
                print("← [RECV]", msg)
                last = msg
            # 简单结束检测
            if '"status":"RESULT"' in msg and '"is_end": true' in msg:
                # 等服务端清尾
                time.sleep(0.1)
                stop_event.set()
                break
        except Exception as e:
            # 正常超时/断开都到这里
            break

In [78]:
import websocket
import sounddevice as sd
import numpy as np
import threading
import json
import time

# 音频参数
SAMPLE_RATE = 16000
FRAME_DURATION = 0.5  # 每帧0.5秒
FRAME_SIZE = int(SAMPLE_RATE * FRAME_DURATION)

# WebSocket 地址（可用 SSH 隧道或公网IP）
WS_URL = "ws://localhost:9090"  # 或 "ws://your_ip:9090"

# 初始化连接并发送配置信息
def init_ws():
    ws = websocket.create_connection(WS_URL)
    config = {
        "uid": "user001",
        "model": "small.en",
        "task": "transcribe",
        "token": "mock_token_123",
        "version": "v1",
        "use_vad": True,
        "name": "mic_test.wav"
    }
    ws.send(json.dumps(config))
    return ws

# 音频采集线程
def record_and_send(ws):
    def callback(indata, frames, time_info, status):
        # 转 float32 → int16 PCM → bytes
        pcm_bytes = (indata[:, 0] * 32768).astype(np.int16).tobytes()
        ws.send(pcm_bytes, opcode=websocket.ABNF.OPCODE_BINARY)

    with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype='float32',
                        blocksize=FRAME_SIZE, callback=callback):
        print("🎙️ 开始说话，按 Ctrl+C 停止...")
        try:
            while True:
                time.sleep(0.1)
        except KeyboardInterrupt:
            print("⏹️ 停止录音")
            ws.send(b"END_OF_AUDIO")

# 接收转写结果
def receive(ws):
    while True:
        try:
            msg = ws.recv()
            print("📜 识别结果：", msg)
            if '"is_end": true' in msg:
                break
        except Exception as e:
            print("[错误]", e)
            break

# 主函数
if __name__ == "__main__":
    ws = init_ws()

    # 启动接收线程
    recv_thread = threading.Thread(target=receive, args=(ws,))
    recv_thread.start()

    # 开始录音并发送
    record_and_send(ws)

    ws.close()

📜 识别结果： {"uid": "user001", "message": "SERVER_READY", "code": 0, "status": "SERVER_READY", "backend": "vvVoiceServer"}


PortAudioError: Error querying device -1

📜 识别结果： {"uid": "user001", "code": 1002, "status": "TIMEOUT", "message": "Recv Time out!"}
📜 识别结果： 
[错误] Connection to remote host was lost.


In [77]:
import sounddevice as sd
print(sd.query_devices())




In [8]:
import whisper_live.transcriber
import inspect

print(inspect.getfile(whisper_live.transcriber))

/root/miniconda3/envs/whisperlive/lib/python3.8/site-packages/whisper_live/transcriber.py


In [8]:
import sys
sys.path.insert(0, "/data/WhisperLive/whisper_live")

from transcriber import WhisperModel  # 现在就是本地的了
model1 = WhisperModel("/root/ASR_TTS_improvement/models/ct2-whisper-lora",
                      device="cuda", compute_type="float16",with_language_classifier = True)


TypeError: __init__() got an unexpected keyword argument 'with_language_classifier'

In [11]:
import os
import subprocess

pid = os.getpid()
result = subprocess.run(["lsof", "-p", str(pid)], stdout=subprocess.PIPE, text=True)
for line in result.stdout.splitlines():
    if ".so" in line:
        print(line)

pt_main_t 320204 root  mem       REG              0,199 563259240  14026813 /usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.6
pt_main_t 320204 root  mem       REG              0,199  90853824  14026823 /usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.6
pt_main_t 320204 root  mem       REG                8,2   1942744 113515753 /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.535.183.01
pt_main_t 320204 root  mem       REG               8,33     21481  26747989 /root/miniconda3/envs/whisper/lib/python3.8/site-packages/av.libs/libuuid-f64cda11.so.1.3.0
pt_main_t 320204 root  mem       REG               8,33    498873  26678390 /root/miniconda3/envs/whisper/lib/python3.8/site-packages/av/audio/codeccontext.cpython-38-x86_64-linux-gnu.so
pt_main_t 320204 root  mem       REG               8,33    268752  26676864 /root/miniconda3/envs/whisper/lib/python3.8/site-packages/charset_normalizer/md__mypyc.cpython-38-x86_64-linux-gnu.so
pt_main_t 320204 root  mem       REG               8,33    416

In [4]:
import ctranslate2

print("CTranslate2 version:", ctranslate2.__version__)
print("CTranslate2 available devices:", ctranslate2.get_supported_compute_types("cuda"))




CTranslate2 version: 4.4.0
CTranslate2 available devices: {'int8_bfloat16', 'bfloat16', 'int8_float16', 'float32', 'float16', 'int8_float32', 'int8'}


In [9]:

import os
import torch

print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

# 当前 CUDA 版本（PyTorch 编译时所用）
print("Compiled CUDA version:", torch.version.cuda)

# 当前运行时实际加载的 CUDA 库
torch.cuda.init()
print("Loaded CUDA version:", torch._C._cuda_getCompiledVersion())
print("CUDA device name:", torch.cuda.get_device_name(0))
print("LD_LIBRARY_PATH:", os.environ.get("LD_LIBRARY_PATH"))
print("PATH:", os.environ.get("PATH"))
print("PYTHONPATH:", os.environ.get("PYTHONPATH"))

import ctypes.util

print("libcublas path:", ctypes.util.find_library("cublas"))
print("libcudnn path:", ctypes.util.find_library("cudnn"))

import os
import subprocess

pid = os.getpid()
print(f"Python PID: {pid}")

# 显示所有当前 Python 进程加载的共享库
print("\nLoaded shared libraries (filtered by cublas):")
subprocess.run(f"lsof -p {pid} | grep cublas", shell=True)

Torch version: 2.3.1+cu118
CUDA available: True
Compiled CUDA version: 11.8
Loaded CUDA version: 11080
CUDA device name: NVIDIA A40
LD_LIBRARY_PATH: /root/miniconda3/envs/whisper/lib:/usr/local/cuda-11.8/lib64
PATH: /root/miniconda3/envs/whisper/bin:/root/.vscode-server/bin/848b80aeb52026648a8ff9f7c45a9b0a80641e2e/bin/remote-cli:/usr/local/cuda-11.8/bin:/root/miniconda3/envs/whisper/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
PYTHONPATH: third_party/Matcha-TTS
libcublas path: libcublas.so.11
libcudnn path: libcudnn.so.8
Python PID: 320204

Loaded shared libraries (filtered by cublas):


CompletedProcess(args='lsof -p 320204 | grep cublas', returncode=0)