In [None]:
# Switch into the project workspace first: later cells resolve relative paths
# (".", "sentence_manager/default_config.yaml", "data/wavs/...") against the
# current working directory.
%cd /path/to/workspace

/root/Ming-Lite-UniAudio


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


# Initialize

In [None]:
import warnings
from peft import PeftModel
import torch
from transformers import AutoProcessor
import os
import sys
from IPython.display import display
import ipynbname
notebook_path = ipynbname.path()
current_dir = os.path.dirname(notebook_path)
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

from modeling_bailingmm import BailingMMNativeForConditionalGeneration
import random
import numpy as np
from loguru import logger
from sentence_manager.sentence_manager import SentenceNormalizer
import re
import yaml

def seed_everything(seed=1895):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU,
    current CUDA device, all CUDA devices), then switches cuDNN to
    deterministic mode with auto-tuning (benchmark) disabled.

    Args:
        seed: Integer seed handed to each generator. Defaults to 1895.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Same two flags as before, set through a local alias of the backend.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all RNGs once, using seed_everything's default seed.
seed_everything()
# Install a global "ignore" filter: every Python warning is suppressed from here on.
warnings.filterwarnings("ignore")

class MingAudio:
    """Inference wrapper bundling a Ming-UniAudio model with its processor and text normalizer.

    Public entry points:
      * ``speech_understanding`` - chat messages (text + audio) -> generated text.
      * ``speech_generation``    - text + prompt audio -> waveform (TTS).
      * ``speech_edit``          - chat messages (audio + instruction) ->
        ``(edited_speech, edited_text)``.
    """

    # Processor outputs that are cast to bfloat16 before being passed to the
    # model, which is itself loaded in bfloat16.
    _BF16_INPUT_KEYS = ("pixel_values", "pixel_values_videos", "audio_feats")

    def __init__(self, model_path, lora_path=None, device="cuda:0", use_grouped_gemm=True,
                 processor_path="."):
        """Load the model, optional LoRA adapter, processor and text normalizer.

        Args:
            model_path: Identifier or path handed to
                ``BailingMMNativeForConditionalGeneration.from_pretrained``.
            lora_path: Optional adapter location; when given, the model is
                wrapped with ``PeftModel.from_pretrained``.
            device: Device the model and all prepared inputs are moved to.
            use_grouped_gemm: When true and the loaded config does not already
                enable grouped GEMM, ``model.model.fuse_experts()`` is called.
            processor_path: Location passed to ``AutoProcessor.from_pretrained``.
                Defaults to ``"."`` (the current working directory), which is
                what this class always used before the parameter existed.
        """
        self.device = device
        self.model = BailingMMNativeForConditionalGeneration.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
        ).to(self.device)

        if use_grouped_gemm and not self.model.config.llm_config.use_grouped_gemm:
            self.model.model.fuse_experts()

        if lora_path is not None:
            self.model = PeftModel.from_pretrained(self.model, lora_path)
        # Re-apply dtype/device after the optional fuse/LoRA steps above.
        self.model = self.model.eval().to(torch.bfloat16).to(self.device)
        # NOTE: trust_remote_code=True executes Python code shipped alongside
        # the processor; only point processor_path at a trusted location.
        self.processor = AutoProcessor.from_pretrained(processor_path, trust_remote_code=True)
        self.tokenizer = self.processor.tokenizer
        self.sample_rate = self.processor.audio_processor.sample_rate
        self.patch_size = self.processor.audio_processor.patch_size
        self.normalizer = self.init_tn_normalizer(tokenizer=self.tokenizer)

    def init_tn_normalizer(self, config_file_path=None, tokenizer=None):
        """Load the sentence-manager YAML config and build the text normalizer.

        The parsed config is stored on ``self.sentence_manager_config``. Its
        ``split_token`` list is created if missing and, when a tokenizer with
        an EOS token is supplied, extended with the regex-escaped EOS token.

        Args:
            config_file_path: YAML file to read. Defaults to
                ``sentence_manager/default_config.yaml`` relative to the
                current working directory.
            tokenizer: Optional tokenizer providing ``eos_token``.

        Returns:
            A ``SentenceNormalizer`` built from the config's ``text_norm``
            section (an empty dict when the section is absent).

        Raises:
            AssertionError: If ``split_token`` exists but is not a list.
        """
        if config_file_path is None:
            config_file_path = "sentence_manager/default_config.yaml"
        # Explicit encoding: do not let the platform locale decide how the
        # YAML file is decoded.
        with open(config_file_path, 'r', encoding='utf-8') as f:
            self.sentence_manager_config = yaml.safe_load(f)
        if "split_token" not in self.sentence_manager_config:
            self.sentence_manager_config["split_token"] = []
        assert isinstance(self.sentence_manager_config["split_token"], list), \
            "split_token in the sentence manager config must be a list"
        # re.escape(None) would raise, so only register an EOS token that exists.
        eos_token = getattr(tokenizer, "eos_token", None) if tokenizer is not None else None
        if eos_token:
            self.sentence_manager_config["split_token"].append(re.escape(eos_token))
        normalizer = SentenceNormalizer(self.sentence_manager_config.get("text_norm", {}))

        return normalizer

    def _prepare_inputs(self, messages, forced_prefix=None):
        """Turn chat messages into model inputs on ``self.device``.

        Args:
            messages: Chat messages accepted by the processor's
                ``apply_chat_template`` / ``process_vision_info``.
            forced_prefix: Optional text whose token ids are appended after
                the templated prompt, so generation continues from it.

        Returns:
            The processor output, with ``input_ids``/``attention_mask``
            extended when ``forced_prefix`` is given and the tensors listed in
            ``_BF16_INPUT_KEYS`` cast to bfloat16.
        """
        text = self.processor.apply_chat_template(messages, add_generation_prompt=True)
        image_inputs, video_inputs, audio_inputs = self.processor.process_vision_info(messages)

        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            audios=audio_inputs,
            return_tensors="pt",
        ).to(self.device)

        if forced_prefix is not None:
            input_ids = inputs['input_ids']
            prefix_ids = torch.tensor(
                [self.tokenizer.encode(forced_prefix)], device=input_ids.device
            )
            inputs['input_ids'] = torch.cat([input_ids, prefix_ids], dim=1)
            # ones_like keeps the mask on the same device as input_ids; a bare
            # torch.ones(shape) would be allocated on the CPU.
            inputs['attention_mask'] = torch.ones_like(
                inputs['input_ids'], dtype=inputs['attention_mask'].dtype
            )
        for k in inputs.keys():
            if k in self._BF16_INPUT_KEYS:
                inputs[k] = inputs[k].to(dtype=torch.bfloat16)
        return inputs

    def speech_understanding(self, messages, lang=None):
        """Generate a text response for chat messages that include audio.

        Args:
            messages: Chat messages (text and audio parts) for the processor.
            lang: Optional language tag. When given, the tokens of
                ``f'{lang}\\t'`` are appended to the prompt so the model
                continues after that tag.

        Returns:
            The decoded text of the newly generated tokens for the single
            input sequence (special tokens skipped).
        """
        inputs = self._prepare_inputs(
            messages, forced_prefix=None if lang is None else f'{lang}\t'
        )
        logger.info(f"input: {self.tokenizer.decode(inputs['input_ids'].cpu().numpy().tolist()[0])}")

        generated_ids = self.model.generate(
            **inputs,
            max_new_tokens=512,
            eos_token_id=self.processor.gen_terminator,
        )
        # Drop the prompt tokens so only the continuation is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)
        ]
        output_text = self.processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]

        return output_text

    def speech_generation(
        self,
        text,
        prompt_wav_path,
        prompt_text,
        lang='zh',
        output_wav_path='out.wav'
    ):
        """Synthesize speech for ``text`` using a reference prompt recording.

        The text is normalized with ``self.normalizer`` and then forwarded to
        ``self.model.generate_tts`` together with the prompt audio/text, the
        processor's patch size and sample rate, and the target device.

        Args:
            text: Text to synthesize.
            prompt_wav_path: Path of the prompt (reference) audio file.
            prompt_text: Transcript belonging to the prompt audio.
            lang: Language code forwarded to ``generate_tts``.
            output_wav_path: Output path forwarded to ``generate_tts``.

        Returns:
            Whatever ``generate_tts`` returns (logged as ``waveform`` by callers).
        """
        text = self.normalizer.normalize(text)
        waveform = self.model.generate_tts(
            text=text,
            prompt_wav_path=prompt_wav_path,
            prompt_text=prompt_text,
            patch_size=self.patch_size,
            tokenizer=self.tokenizer,
            lang=lang,
            output_wav_path=output_wav_path,
            sample_rate=self.sample_rate,
            device=self.device
        )

        return waveform

    def speech_edit(
        self,
        messages,
        output_wav_path='out.wav',
        use_cot=True
    ):
        """Edit the speech described by ``messages`` via ``model.generate_edit``.

        Args:
            messages: Chat messages holding the source audio and the edit
                instruction.
            output_wav_path: Output path forwarded to ``generate_edit``.
            use_cot: When true, the tokens of ``'<answer>'`` are appended to
                the prompt before generation.

        Returns:
            The ``(edited_speech, edited_text)`` pair from ``generate_edit``.
        """
        inputs = self._prepare_inputs(
            messages, forced_prefix='<answer>' if use_cot else None
        )
        logger.info(f"input: {self.tokenizer.decode(inputs['input_ids'].cpu().numpy().tolist()[0])}")

        edited_speech, edited_text = self.model.generate_edit(
            **inputs,
            tokenizer=self.tokenizer,
            output_wav_path=output_wav_path
        )
        return edited_speech, edited_text


if __name__ == "__main__":
    # Alternative: pass a local checkpoint directory instead of the model id below.
    # model = MingAudio("/path/to/model")
    # Load the base model; the example cells below call its methods through `model`.
    model = MingAudio("inclusionAI/Ming-UniAudio-16B-A3B")


[2025-10-27 15:05:24,009] [INFO] [add_hooks.py:71:__init__] If you are working on AIStudio, please set 'ANTMONITOR_TFEVENT_PATH' env.
2025-10-27 15:05:24,930 - datasets - INFO - PyTorch version 2.6.0 available.
BailingMMNativeForConditionalGeneration has generative capabilities, as `prepare_inputs_for_generation` is explicitly defined. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of the model checkpoint at inclusionAI/Ming-UniAudio-16B-A3B were not used when initializing BailingMMNativeForConditionalGeneration: ['audio.decoder.semantic_model.conv1.bias', 'audio.decoder.semantic_model.conv1.weight', 'audio.decoder.semantic_model.conv2.bias', 'audio.decoder.semantic_model.conv2.weight', 'audio.decoder.semantic_model.positional_embedding']
- This IS expected if you are initializing BailingMMNativeForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BailingMMNativeForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
BailingMMNativeForConditionalGeneration has generative capabilities, as `prepare_inputs_for_generation` is exp

# Examples

In [None]:

    # ASR: transcribe one clip; no `lang` is passed, so no language tag is
    # appended to the prompt.
    asr_instruction = {
        "type": "text",
        "text": "Please recognize the language of this speech and transcribe it. Format: oral.",
    }
    asr_clip = {"type": "audio", "audio": "data/wavs/BAC009S0915W0292.wav"}
    messages = [{"role": "HUMAN", "content": [asr_instruction, asr_clip]}]

    response = model.speech_understanding(messages=messages)
    logger.info(f"Generated Response: {response}")


[32m2025-09-30 20:54:26.673[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeech_understanding[0m:[36m56[0m - [1minput: <role>HUMAN</role>Please recognize the language of this speech and transcribe it. Format: oral.<audio><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch></audio><role>ASSISTANT</role>[0m
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
[32m2025-09-30 20:54:28.285[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m17[0m - [1mGenerated Response: Chinese	现在是不是也该长点心了吧[0m


In [None]:

    # Dialect ASR
    # Every demo clip is transcribed with the same instruction; only the audio
    # file and the dialect tag passed as `lang` differ. The tag is appended to
    # the prompt by speech_understanding, so the model continues after it.
    dialect_demos = [
        ("data/wavs/chuanyu_demo.wav", "川渝"),
        ("data/wavs/hunan_demo.wav", "湖南"),
        ("data/wavs/minnan_demo.wav", "闽南"),
        ("data/wavs/shanghai_demo.wav", "上海"),
        ("data/wavs/yueyu_demo.wav", "Canton"),
    ]

    for wav_path, dialect in dialect_demos:
        messages = [
            {
                "role": "HUMAN",
                "content": [
                    {
                        "type": "text",
                        "text": "Please recognize the language of this speech and transcribe it. Format: oral.",
                    },
                    {"type": "audio", "audio": wav_path},
                ],
            }
        ]

        response = model.speech_understanding(messages=messages, lang=dialect)
        logger.info(f"Generated Response: {response}")

[32m2025-10-01 21:47:27.460[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeech_understanding[0m:[36m56[0m - [1minput: <role>HUMAN</role>Please recognize the language of this speech and transcribe it. Format: oral.<audio><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch></audio><role>ASSISTANT</role>川渝	[0m
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
[32m2025-10-01 21:47:28.944[0m | [1mINFO    [0m

In [None]:

    # TTS: synthesize the target sentence using the prompt recording and its
    # transcript; the result is written to data/output/tts.wav.
    tts_request = dict(
        text='我们的愿景是构建未来服务业的数字化基础设施，为世界带来更多微小而美好的改变。',
        prompt_wav_path='data/wavs/10002287-00000094.wav',
        prompt_text='在此奉劝大家别乱打美白针。',
        output_wav_path='data/output/tts.wav',
    )
    waveform = model.speech_generation(**tts_request)
    logger.info(f"waveform: {waveform}")

evaluating zh


  0%|          | 0/300 [00:00<?, ?it/s]

<role>HUMAN</role>Please translate the text to speech.
在此奉劝大家别乱打美白针。我们的愿景是构建未来服务业的数字化基础设施，为世界带来更多微小而美好的改变。<role>ASSISTANT</role><audio><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch>


 32%|███▏      | 96/300 [00:13<00:27,  7.37it/s]
[32m2025-09-30 20:56:01.995[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mwaveform: tensor([[ 5.7267e-04,  1.2557e-03,  8.2468e-04,  ..., -2.2724e-06,
         -1.6844e-06, -2.2619e-05]])[0m


StopInfo: 96 299


In [None]:
    # load sft model
    # Drop the previously loaded model before building the next one: Python
    # evaluates MingAudio(...) before rebinding `model`, so without this both
    # checkpoints would be held in memory at the same time.
    model = None
    torch.cuda.empty_cache()  # hand the freed blocks back to the CUDA allocator
    model = MingAudio("inclusionAI/Ming-UniAudio-16B-A3B-Edit")
    # Ins: insert a word into the utterance at a given index. use_cot is left
    # at its default, so '<answer>' is appended to the prompt.
    messages = [
        {
            "role": "HUMAN",
            "content": [
                {"type": "audio", "audio": "data/wavs/00004768-00000024.wav", "target_sample_rate": 16000},
                {
                    "type": "text",
                    "text": "<prompt>Please recognize the language of this speech and transcribe it. And insert '实现' before the character or word at index 3.\n</prompt>",
                },
            ],
        },
    ]

    response = model.speech_edit(messages=messages, output_wav_path="data/output/ins.wav")
    logger.info(f"Generated Response: {response}")

[32m2025-09-30 20:57:03.366[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeech_edit[0m:[36m119[0m - [1minput: <role>HUMAN</role><audio><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch></audio><prompt>Please recognize the language of this speech and transcribe it. And insert '实现' before the character or word at index 3.
</prompt><role>ASSISTANT</role><answer>[0m
[32m2025-09-30 20:57:10.166[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mGenerated Response: (tensor([[[-0.0006, -0.0005, -0.0001,  ...,  0.0016,  0.0013,  0.00

In [None]:

    # Del: remove the characters/words in an index range from the utterance.
    # use_cot keeps its default, so '<answer>' is appended to the prompt.
    source_audio = {"type": "audio", "audio": "data/wavs/00004768-00000024.wav", "target_sample_rate": 16000}
    edit_instruction = {
        "type": "text",
        "text": "<prompt>Please recognize the language of this speech and transcribe it. And delete the characters or words from index 5 to index 8.\n</prompt>",
    }
    messages = [{"role": "HUMAN", "content": [source_audio, edit_instruction]}]

    response = model.speech_edit(messages=messages, output_wav_path="data/output/del.wav")
    logger.info(f"Generated Response: {response}")

[32m2025-09-30 21:01:13.133[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeech_edit[0m:[36m119[0m - [1minput: <role>HUMAN</role><audio><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch></audio><prompt>Please recognize the language of this speech and transcribe it. And delete the characters or words from index 5 to index 8.
</prompt><role>ASSISTANT</role><answer>[0m
[32m2025-09-30 21:01:17.725[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mGenerated Response: (tensor([[[-0.0020, -0.0023, -0.0018,  ..., -0.0025, -0.0023, -0

In [None]:

    # Sub: replace one word of the utterance with another.
    # use_cot keeps its default, so '<answer>' is appended to the prompt.
    source_audio = {"type": "audio", "audio": "data/wavs/00004768-00000024.wav", "target_sample_rate": 16000}
    edit_instruction = {
        "type": "text",
        "text": "<prompt>Please recognize the language of this speech and transcribe it. And substitute '盘活' with '创造'.\n</prompt>",
    }
    messages = [{"role": "HUMAN", "content": [source_audio, edit_instruction]}]

    response = model.speech_edit(messages=messages, output_wav_path="data/output/sub.wav")
    logger.info(f"Generated Response: {response}")

[32m2025-09-30 21:02:49.760[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeech_edit[0m:[36m119[0m - [1minput: <role>HUMAN</role><audio><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch></audio><prompt>Please recognize the language of this speech and transcribe it. And substitute '盘活' with '创造'.
</prompt><role>ASSISTANT</role><answer>[0m
[32m2025-09-30 21:02:55.734[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mGenerated Response: (tensor([[[-0.0007, -0.0006, -0.0002,  ...,  0.0024,  0.0020,  0.0019]]],
       device='cuda

In [None]:

    # Denoise: ask the model to clean up a noisy recording.
    # use_cot=False, so no '<answer>' prefix is appended to the prompt.
    source_audio = {"type": "audio", "audio": "data/wavs/noreverb_fileid_0.wav", "target_sample_rate": 16000}
    edit_instruction = {
        "type": "text",
        "text": "<prompt>Please recognize the language of this speech and transcribe it. And denoise the audio.\n</prompt>",
    }
    messages = [{"role": "HUMAN", "content": [source_audio, edit_instruction]}]

    response = model.speech_edit(messages=messages, use_cot=False, output_wav_path="data/output/denoise.wav")
    logger.info(f"Generated Response: {response}")

[32m2025-09-30 21:05:08.922[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeech_edit[0m:[36m119[0m - [1minput: <role>HUMAN</role><audio><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><aud

In [None]:
    # time_stretch: change the speaking speed of the utterance.
    # use_cot=False, so no '<answer>' prefix is appended to the prompt.
    source_audio = {"type": "audio", "audio": "data/wavs/00004768-00000024.wav", "target_sample_rate": 16000}
    edit_instruction = {
        "type": "text",
        "text": "<prompt>Please recognize the language of this speech and transcribe it. And adjusts the speed to 0.7.\n</prompt>",
    }
    messages = [{"role": "HUMAN", "content": [source_audio, edit_instruction]}]

    response = model.speech_edit(messages=messages, use_cot=False, output_wav_path="data/output/time_stretch.wav")
    logger.info(f"Generated Response: {response}")

[32m2025-09-30 21:07:10.332[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeech_edit[0m:[36m119[0m - [1minput: <role>HUMAN</role><audio><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch></audio><prompt>Please recognize the language of this speech and transcribe it. And adjusts the speed to 0.7.
</prompt><role>ASSISTANT</role>[0m
[32m2025-09-30 21:07:17.855[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mGenerated Response: (tensor([[[ 0.0004,  0.0006,  0.0008,  ..., -0.0003, -0.0005, -0.0004]]],
       device='cuda:0'), '<g

In [None]:
    # pitch_shift: shift the pitch of the utterance by a number of steps.
    # use_cot=False, so no '<answer>' prefix is appended to the prompt.
    source_audio = {"type": "audio", "audio": "data/wavs/00004768-00000024.wav", "target_sample_rate": 16000}
    edit_instruction = {
        "type": "text",
        "text": "<prompt>Please recognize the language of this speech and transcribe it. And shifts the pitch by 3 steps.\n</prompt>",
    }
    messages = [{"role": "HUMAN", "content": [source_audio, edit_instruction]}]

    response = model.speech_edit(messages=messages, use_cot=False, output_wav_path="data/output/pitch_shift.wav")
    logger.info(f"Generated Response: {response}")

[32m2025-09-30 21:08:56.162[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeech_edit[0m:[36m119[0m - [1minput: <role>HUMAN</role><audio><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch></audio><prompt>Please recognize the language of this speech and transcribe it. And shifts the pitch by 3 steps.
</prompt><role>ASSISTANT</role>[0m
[32m2025-09-30 21:09:01.875[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mGenerated Response: (tensor([[[-3.8837e-04, -2.6293e-04,  1.8646e-04,  ...,  4.6090e-05,
           6.5749e-05,  1.7663

In [None]:

    # vol: change the loudness of the utterance.
    # use_cot=False, so no '<answer>' prefix is appended to the prompt.
    source_audio = {"type": "audio", "audio": "data/wavs/00004768-00000024.wav", "target_sample_rate": 16000}
    edit_instruction = {
        "type": "text",
        "text": "<prompt>Please recognize the language of this speech and transcribe it. And adjusts the volume to 0.6.\n</prompt>",
    }
    messages = [{"role": "HUMAN", "content": [source_audio, edit_instruction]}]

    response = model.speech_edit(messages=messages, use_cot=False, output_wav_path="data/output/vol.wav")
    logger.info(f"Generated Response: {response}")

[32m2025-09-30 21:10:09.258[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeech_edit[0m:[36m119[0m - [1minput: <role>HUMAN</role><audio><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch></audio><prompt>Please recognize the language of this speech and transcribe it. And adjusts the volume to 0.6.
</prompt><role>ASSISTANT</role>[0m
[32m2025-09-30 21:10:14.946[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mGenerated Response: (tensor([[[-9.3645e-04, -9.3372e-04, -6.7022e-04,  ...,  2.7338e-05,
           2.5592e-05, -3.1757e-

In [None]:

    # add sound: mix a background sound (rain) into the utterance.
    # use_cot=False, so no '<answer>' prefix is appended to the prompt.
    source_audio = {"type": "audio", "audio": "data/wavs/00004768-00000024.wav", "target_sample_rate": 16000}
    edit_instruction = {
        "type": "text",
        "text": "<prompt>Please recognize the language of this speech and transcribe it. And add rain to audio.\n</prompt>",
    }
    messages = [{"role": "HUMAN", "content": [source_audio, edit_instruction]}]

    response = model.speech_edit(messages=messages, use_cot=False, output_wav_path="data/output/add_sound.wav")
    logger.info(f"Generated Response: {response}")

[32m2025-09-30 21:13:06.239[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeech_edit[0m:[36m119[0m - [1minput: <role>HUMAN</role><audio><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch></audio><prompt>Please recognize the language of this speech and transcribe it. And add rain to audio.
</prompt><role>ASSISTANT</role>[0m
[32m2025-09-30 21:13:11.905[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mGenerated Response: (tensor([[[-0.0146,  0.0316,  0.0304,  ..., -0.0022, -0.0320, -0.0388]]],
       device='cuda:0'), '<gen_audi

In [None]:

    # Context ASR: same transcription request as plain ASR, but the instruction
    # also lists words and phrases that may occur in the recording.
    context_instruction = {
        "type": "text",
        "text": "Please recognize the language of this speech and transcribe it. Format: oral. This audio may contains the following words or phrases:Project Almanac,Dean Israelite,Jonny Weston,Sofia Black D' Elia,Sam Lerner,Jessie,Quinn,C G I,reverse engineer,timeline",
    }
    context_clip = {"type": "audio", "audio": "data/wavs/11302-4_1712960-1908016.wav"}
    messages = [{"role": "HUMAN", "content": [context_instruction, context_clip]}]

    response = model.speech_understanding(messages=messages)
    logger.info(f"Generated Response: {response}")


[32m2025-09-30 21:11:39.334[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeech_understanding[0m:[36m56[0m - [1minput: <role>HUMAN</role>Please recognize the language of this speech and transcribe it. Format: oral. This audio may contains the following words or phrases:Project Almanac,Dean Israelite,Jonny Weston,Sofia Black D' Elia,Sam Lerner,Jessie,Quinn,C G I,reverse engineer,timeline<audio><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPa

In [None]:

    # emotion: re-render the utterance with a different emotion.
    # use_cot=False, so no '<answer>' prefix is appended to the prompt.
    source_audio = {"type": "audio", "audio": "data/wavs/emotion_00004753-00000079.wav", "target_sample_rate": 16000}
    edit_instruction = {
        "type": "text",
        "text": "<prompt>Please recognize the language of this speech and transcribe it. And change the emotion to happy mood.\n</prompt>",
    }
    messages = [{"role": "HUMAN", "content": [source_audio, edit_instruction]}]

    response = model.speech_edit(messages=messages, use_cot=False, output_wav_path="data/output/emotion.wav")
    logger.info(f"Generated Response: {response}")

[32m2025-09-30 21:21:42.112[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeech_edit[0m:[36m119[0m - [1minput: <role>HUMAN</role><audio><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch></audio><prompt>Please recognize the language of this speech and transcribe it. And change the emotion to happy mood.
</prompt><role>ASSISTANT</role>[0m
[32m2025-09-30 21:21:47.384[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mGenerated Response: (tensor([[[0.0021, 0.0027, 0.0026,  ..., 0.000

In [None]:

    # dialect conversion: re-render the utterance with a Chengdu accent.
    # use_cot=False, so no '<answer>' prefix is appended to the prompt.
    source_audio = {"type": "audio", "audio": "data/wavs/emotion_00004753-00000079.wav", "target_sample_rate": 16000}
    edit_instruction = {
        "type": "text",
        "text": "<prompt>Please recognize the language of this speech and transcribe it. And change the accent of the speech to Chengdu.\n</prompt>",
    }
    messages = [{"role": "HUMAN", "content": [source_audio, edit_instruction]}]

    response = model.speech_edit(messages=messages, use_cot=False, output_wav_path="data/output/dialect_conversion.wav")
    logger.info(f"Generated Response: {response}")

[32m2025-09-30 21:24:02.801[0m | [1mINFO    [0m | [36m__main__[0m:[36mspeech_edit[0m:[36m119[0m - [1minput: <role>HUMAN</role><audio><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch><audioPatch></audio><prompt>Please recognize the language of this speech and transcribe it. And change the accent of the speech to Chengdu.
</prompt><role>ASSISTANT</role>[0m
[32m2025-09-30 21:24:08.595[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mGenerated Response: (tensor([[[ 0.0005,  0.0007,  0.0010