In [None]:
# coding: utf-8
import os
import json
import torch
from funasr import AutoModel

# Use the project's test audio. The location can be overridden through the
# HEARSIGHT_TEST_AUDIO environment variable so the notebook is not tied to a
# single machine; when the variable is unset or empty, the original path is used.
audio_path = os.environ.get("HEARSIGHT_TEST_AUDIO") or (
    r"C:\Users\ke\Documents\projects\python_projects\HearSight\backend\tests\datas\大语言模型进化论：从“听懂指令”到“学会思考”，AI如何与人类对齐？.m4a"
)
# isfile (rather than exists) so that a directory at this path is rejected here
# instead of failing later inside the model.
assert os.path.isfile(audio_path), f"文件不存在: {audio_path}"

# Minimal model setup (original note: same configuration as the project).
# Run on GPU when torch reports CUDA as available, otherwise on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel(
    model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
    model_revision="v2.0.4",
    vad_model="fsmn-vad",
    vad_model_revision="v2.0.4",
    punc_model="ct-punc-c",
    punc_model_revision="v2.0.4",
    device=device,
    disable_update=True,
)

# Inference: keep the raw return value so the following cells can inspect it.
res = model.generate(input=audio_path, cache={}, batch_size_s=300)

print("=== RAW TYPE ===")
print(type(res))


funasr version: 1.2.6.
Downloading Model from https://www.modelscope.cn to directory: C:\Users\ke\.cache\modelscope\hub\models\iic\speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch


2025-08-16 14:40:50,852 - modelscope - INFO - Use user-specified model revision: v2.0.4


Downloading Model from https://www.modelscope.cn to directory: C:\Users\ke\.cache\modelscope\hub\models\iic\speech_fsmn_vad_zh-cn-16k-common-pytorch


2025-08-16 14:40:56,401 - modelscope - INFO - Use user-specified model revision: v2.0.4


Downloading Model from https://www.modelscope.cn to directory: C:\Users\ke\.cache\modelscope\hub\models\iic\punc_ct-transformer_zh-cn-common-vocab272727-pytorch


2025-08-16 14:40:57,967 - modelscope - INFO - Use user-specified model revision: v2.0.4
rtf_avg: 0.079: 100%|[34m██████████[0m| 1/1 [00:03<00:00,  4.00s/it]                                                                                          
rtf_avg: 0.017: 100%|[34m██████████[0m| 5/5 [00:02<00:00,  1.76it/s]
rtf_avg: 0.031: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  1.87s/it]
rtf_avg: 0.033: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  1.99s/it]
rtf_avg: 0.033: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  1.99s/it]
rtf_avg: 0.032: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  1.96s/it]
rtf_avg: 0.030: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  1.80s/it]
rtf_avg: 0.033: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  2.00s/it]
rtf_avg: -1.649: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  1.66s/it]
rtf_avg: 0.031, time_speech:  529.810, time_escape: 16.199: 100%|[31m██████████[0m| 1/1 [00:16<00:00, 16.84s/it]

=== RAW TYPE ===
<class 'list'>





In [6]:

# Safely unwrap the first result: use res[0] when `res` is a non-empty list,
# otherwise fall back to an empty dict.
if isinstance(res, list) and res:
    item = res[0]
else:
    item = {}

print("\n=== ITEM KEYS ===")
print([*item.keys()])



=== ITEM KEYS ===
['key', 'text', 'timestamp']


In [7]:

# Basic fields: show the key, the type of "text", and a preview of at most
# 120 characters (with "..." appended when the text is longer).
print("\n=== BASIC FIELDS ===")
print("key:", item.get("key"))
probe_text = item.get("text")
print("text type:", type(probe_text))
if isinstance(probe_text, str):
    probe_suffix = "..." if len(probe_text) > 120 else ""
    print("text sample:", probe_text[:120] + probe_suffix)



=== BASIC FIELDS ===
key: 大语言模型进化论：从“听懂指令”到“学会思考”，AI如何与人类对齐？
text type: <class 'str'>
text sample: 想象一下啊，人工智能现在哎好像越来越能听懂我们说话了。有时候甚至你给他举个例子，他就能学会一个新任务，让背后到底是怎么回事呢？今天我们就来聊聊你分享的这些资料，看看这些大语言模型啊是怎么一步步进化。从原来那种需要好多定制训练，现在能更灵活的...


In [8]:

# Focus: "timestamp" is expected to be a list of [start_ms, end_ms] pairs.
ts = item.get("timestamp")
print("\n=== TIMESTAMP FIELD ===")
print("timestamp type:", type(ts))
if not isinstance(ts, list):
    print("timestamp 缺失或类型非 list。")
else:
    print("timestamp length:", len(ts))
    # Only the first entry is inspected to decide whether the list holds pairs.
    first_is_pair = bool(ts) and isinstance(ts[0], (list, tuple)) and len(ts[0]) == 2
    if not first_is_pair:
        print("timestamp 不是二维 [start_ms, end_ms] 数组，请再贴样例我再适配。")
    else:
        print("timestamp[0] (ms):", ts[0])

        def ms_to_s(ms: int) -> float:
            """Convert a value assumed to be in milliseconds to seconds, rounded to 3 decimals."""
            return round(ms / 1000.0, 3)

        ts_s = [[ms_to_s(p[0]), ms_to_s(p[1])] for p in ts]
        print("timestamp[0] (s):", ts_s[0])

        # Show the first five converted intervals as a sample.
        print("\nfirst_5_timestamp_intervals_s:")
        for pair in ts_s[:5]:
            print(pair)

        # Optional: export a simplified structure for further processing.
        out = {
            "key": item.get("key"),
            "text": item.get("text"),
            "timestamp_ms": ts,         # raw values (milliseconds)
            "timestamp_s": ts_s[:50],   # first 50 only, to keep the file small
        }
        os.makedirs("results", exist_ok=True)
        with open("results/paraformer_raw_probe.json", "w", encoding="utf-8") as f:
            json.dump(out, f, ensure_ascii=False, indent=2)
        print('\nSaved results/paraformer_raw_probe.json')


=== TIMESTAMP FIELD ===
timestamp type: <class 'list'>
timestamp length: 2264
timestamp[0] (ms): [350, 590]
timestamp[0] (s): [0.35, 0.59]

first_5_timestamp_intervals_s:
[0.35, 0.59]
[0.65, 0.89]
[0.91, 1.01]
[1.01, 1.25]
[1.25, 1.45]

Saved results/paraformer_raw_probe.json


In [None]:
# coding: utf-8
import os
import re
import json
import torch
from typing import List, Tuple, Dict, Any
from funasr import AutoModel

# 1) Load the model and run inference (same configuration as the earlier cell).
# The audio location can be overridden through the HEARSIGHT_TEST_AUDIO
# environment variable so the notebook is not tied to a single machine; when
# the variable is unset or empty, the original path is used.
AUDIO_PATH = os.environ.get("HEARSIGHT_TEST_AUDIO") or (
    r"C:\Users\ke\Documents\projects\python_projects\HearSight\backend\tests\datas\大语言模型进化论：从“听懂指令”到“学会思考”，AI如何与人类对齐？.m4a"
)
# isfile (rather than exists) so that a directory at this path is rejected here
# instead of failing later inside the model.
assert os.path.isfile(AUDIO_PATH), f"文件不存在: {AUDIO_PATH}"

# Run on GPU when torch reports CUDA as available, otherwise on CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel(
    model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
    model_revision="v2.0.4",
    vad_model="fsmn-vad",
    vad_model_revision="v2.0.4",
    punc_model="ct-punc-c",
    punc_model_revision="v2.0.4",
    device=DEVICE,
    disable_update=True,
)

res = model.generate(input=AUDIO_PATH, cache={}, batch_size_s=300)
# Unwrap defensively: an empty or non-list result yields empty text/timestamps
# rather than an exception, and None values are normalised to empty containers.
item: Dict[str, Any] = res[0] if isinstance(res, list) and res else {}
text: str = item.get("text", "") or ""
timestamps: List[List[int]] = item.get("timestamp", []) or []

# 2) 快速近似：文本分句 + 映射到有声时间累计进度

def split_cn_sentences(text: str) -> List[Tuple[int, int, str]]:
    """
    Split Chinese text into sentences on sentence-ending punctuation.

    A sentence boundary is a run of one or more terminators (。！？；) together
    with any closing quotes/brackets (”’」』）) that immediately follow. Treating
    the run as a single boundary keeps sequences such as "？！" or "。”" attached
    to the sentence they close, instead of producing a punctuation-only
    fragment or pushing the closing quote onto the next sentence.

    Args:
        text: The text to split.

    Returns:
        A list of (start_idx, end_idx, sentence) tuples, where end_idx is
        exclusive and the sentence keeps its trailing punctuation. Pieces that
        are empty or whitespace-only are dropped. Text after the last
        terminator is returned as a final sentence. Empty input yields [].
    """
    if not text:
        return []
    spans: List[Tuple[int, int, str]] = []
    start = 0
    for m in re.finditer(r'[。！？；]+[”’」』）]*', text):
        end = m.end()
        sent = text[start:end]
        if sent.strip():
            spans.append((start, end, sent))
        # Advance even when the piece was dropped so indices stay aligned.
        start = end
    if start < len(text):  # trailing text without a terminator
        sent = text[start:]
        if sent.strip():
            spans.append((start, len(text), sent))
    return spans

def map_charpos_to_time_ms(pos: int, text_len: int, seg_ms: List[Tuple[int, int]]) -> int:
    """
    Map a character position to a time in milliseconds.

    The position is converted to a fraction pos / text_len of the total
    "voiced" duration (the sum of the positive-length intervals in seg_ms), and
    the result is the point inside the intervals where that much voiced time
    has accumulated. Gaps between intervals are skipped, not interpolated.

    Args:
        pos: Character offset; values outside 0..text_len are clamped so the
            result never falls outside the voiced intervals.
        text_len: Total number of characters.
        seg_ms: (start, end) intervals, in the order they should be consumed.
            Intervals with end <= start carry no voiced time and are ignored.

    Returns:
        The mapped time as an int. Returns 0 when text_len <= 0 or seg_ms is
        empty, and the start of the first interval when no interval has a
        positive duration.
    """
    if text_len <= 0 or not seg_ms:
        return 0

    # Use the same set of intervals for the total and for the scan below, so
    # non-positive durations cannot skew the target.
    voiced = [(s, e) for s, e in seg_ms if e > s]
    if not voiced:
        return seg_ms[0][0]
    total_voiced = sum(e - s for s, e in voiced)

    # Clamp the progress so out-of-range positions map to the first/last
    # voiced instant instead of extrapolating beyond them.
    progress = min(max(pos / text_len, 0.0), 1.0)
    target = total_voiced * progress  # voiced time that must have elapsed

    acc = 0.0
    for s, e in voiced:
        dur = e - s
        if acc + dur >= target:
            return int(round(s + (target - acc)))
        acc += dur
    # Only reachable through floating-point drift; fall back to the final end.
    return voiced[-1][1]

def sentences_with_times(text: str, timestamps_ms: List[List[int]]) -> List[Dict[str, Any]]:
    """
    Approximate a time range (in seconds) for every sentence of `text`.

    Args:
        text: The full transcript.
        timestamps_ms: [start, end] pairs (the original docstring describes
            these as VAD windows in milliseconds). Pairs with non-numeric
            members, or whose end is not after their start once truncated to
            int, are ignored.

    Returns:
        A list of {"sentence", "start_s", "end_s"} dicts, one per sentence, with
        times rounded to 3 decimals. Returns [] when the text is empty or no
        usable interval remains.
    """
    numeric = (int, float)
    intervals = []
    for begin, finish in timestamps_ms:
        if not (isinstance(begin, numeric) and isinstance(finish, numeric)):
            continue
        begin_ms, finish_ms = int(begin), int(finish)
        if finish_ms > begin_ms:
            intervals.append((begin_ms, finish_ms))

    if not (text and intervals):
        return []

    total_chars = len(text)
    results = []
    for first_idx, last_idx, sentence in split_cn_sentences(text):
        start_ms = map_charpos_to_time_ms(first_idx, total_chars, intervals)
        # Never let a sentence end before it starts.
        end_ms = max(start_ms, map_charpos_to_time_ms(last_idx, total_chars, intervals))
        results.append({
            "sentence": sentence,
            "start_s": round(start_ms / 1000.0, 3),
            "end_s": round(end_ms / 1000.0, 3),
        })
    return results

# 3) Run the approximation, preview it, and persist the result.
sentence_spans = sentences_with_times(text, timestamps)

print(f"total sentences: {len(sentence_spans)}")
# Preview only the first five entries.
for i, sp in zip(range(5), sentence_spans):
    print(i, sp)

os.makedirs("results", exist_ok=True)
with open("results/paraformer_sentence_times.json", "w", encoding="utf-8") as f:
    json.dump(sentence_spans, f, ensure_ascii=False, indent=2)
print("Saved to results/paraformer_sentence_times.json")


funasr version: 1.2.6.
Downloading Model from https://www.modelscope.cn to directory: C:\Users\ke\.cache\modelscope\hub\models\iic\speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch


2025-08-16 16:11:49,224 - modelscope - INFO - Use user-specified model revision: v2.0.4


Downloading Model from https://www.modelscope.cn to directory: C:\Users\ke\.cache\modelscope\hub\models\iic\speech_fsmn_vad_zh-cn-16k-common-pytorch


2025-08-16 16:11:54,947 - modelscope - INFO - Use user-specified model revision: v2.0.4


Downloading Model from https://www.modelscope.cn to directory: C:\Users\ke\.cache\modelscope\hub\models\iic\punc_ct-transformer_zh-cn-common-vocab272727-pytorch


2025-08-16 16:11:57,465 - modelscope - INFO - Use user-specified model revision: v2.0.4
rtf_avg: 0.072: 100%|[34m██████████[0m| 1/1 [00:03<00:00,  3.61s/it]                                                                                          
rtf_avg: 0.014: 100%|[34m██████████[0m| 5/5 [00:02<00:00,  2.04it/s]
rtf_avg: 0.027: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  1.64s/it]
rtf_avg: 0.027: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  1.65s/it]
rtf_avg: 0.027: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  1.61s/it]
rtf_avg: 0.027: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  1.64s/it]
rtf_avg: 0.027: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  1.61s/it]
rtf_avg: 0.029: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  1.74s/it]
rtf_avg: -1.381: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  1.39s/it]
rtf_avg: 0.026, time_speech:  529.810, time_escape: 13.836: 100%|[31m██████████[0m| 1/1 [00:14<00:00, 14.42s/it]

total sentences: 70
0 {'sentence': '想象一下啊，人工智能现在哎好像越来越能听懂我们说话了。', 'start_s': 0.35, 'end_s': 5.75}
1 {'sentence': '有时候甚至你给他举个例子，他就能学会一个新任务，让背后到底是怎么回事呢？', 'start_s': 5.75, 'end_s': 13.084}
2 {'sentence': '今天我们就来聊聊你分享的这些资料，看看这些大语言模型啊是怎么一步步进化。', 'start_s': 13.084, 'end_s': 20.357}
3 {'sentence': '从原来那种需要好多定制训练，现在能更灵活的学习和推理的。', 'start_s': 20.357, 'end_s': 25.746}
4 {'sentence': '这里面呢有几个关键的技术点，让AI变得越来越聪明了。', 'start_s': 25.746, 'end_s': 30.738}
Saved to results/paraformer_sentence_times.json



