In [None]:
import os
from pathlib import Path

cwd = Path.cwd()
if cwd.name == "notebooks":
    os.chdir(cwd.parent)

In [2]:
import soundfile as sf
# Assuming the KimiAudio class is available after installation
from kimia_infer.api.kimia import KimiAudio
import torch # Ensure torch is imported if needed for device placement

# --- 1. Load Model ---
# Load the model from Hugging Face Hub
# Make sure you are logged in (`huggingface-cli login`) if the repo is private.
model_id = "moonshotai/Kimi-Audio-7B-Instruct" # Or "Kimi/Kimi-Audio-7B"
device = "cuda" if torch.cuda.is_available() else "cpu" # Example device placement
# Note: The KimiAudio class might handle model loading differently.
# You might need to pass the model_id directly or download checkpoints manually
# and provide the local path as shown in the original readme_kimia.md.
# Please refer to the main Kimi-Audio repository for precise loading instructions.
# Example assuming KimiAudio takes the HF ID or a local path:
try:
    model = KimiAudio(model_path=model_id, load_detokenizer=True) # May need device argument
    model.to(device) # Example device placement
except Exception as e:
    print(f"Automatic loading from HF Hub might require specific setup.")
    print(f"Refer to Kimi-Audio docs. Trying local path example (update path!). Error: {e}")
    # Fallback example:
    # model_path = "/path/to/your/downloaded/kimia-hf-ckpt" # IMPORTANT: Update this path if loading locally
    # model = KimiAudio(model_path=model_path, load_detokenizer=True)
    # model.to(device) # Example device placement

# --- 2. Define Sampling Parameters ---
sampling_params = {
    "audio_temperature": 0.8,
    "audio_top_k": 10,
    "text_temperature": 0.0,
    "text_top_k": 5,
    "audio_repetition_penalty": 1.0,
    "audio_repetition_window_size": 64,
    "text_repetition_penalty": 1.0,
    "text_repetition_window_size": 16,
}

# --- 3. Example 1: Audio-to-Text (ASR) ---
# TODO: Provide actual example audio files or URLs accessible to users
# E.g., download sample files first or use URLs
# wget https://path/to/your/asr_example.wav -O asr_example.wav
# wget https://path/to/your/qa_example.wav -O qa_example.wav
asr_audio_path = "asr_example.wav" # IMPORTANT: Make sure this file exists
qa_audio_path = "qa_example.wav" # IMPORTANT: Make sure this file exists

messages_asr = [
    {"role": "user", "message_type": "text", "content": "Please transcribe the following audio:"},
    {"role": "user", "message_type": "audio", "content": asr_audio_path}
]

# Generate only text output
# Note: Ensure the model object and generate method accept device placement if needed
_, text_output = model.generate(messages_asr, **sampling_params, output_type="text")
print(">>> ASR Output Text: ", text_output)
# Expected output: "这并不是告别，这是一个篇章的结束，也是新篇章的开始。" (Example)

[32m2026-01-18 15:53:28.598[0m | [1mINFO    [0m | [36mkimia_infer.api.kimia[0m:[36m__init__[0m:[36m16[0m - [1mLoading kimi-audio main model[0m


Fetching 64 files:   0%|          | 0/64 [00:00<?, ?it/s]

config.yaml: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

configuration_moonshot_kimia.py: 0.00B [00:00, ?B/s]

generation_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

audio_detokenizer/model.pt:   0%|          | 0.00/19.0G [00:00<?, ?B/s]

model-1-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-10-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-12-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-11-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-13-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-14-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-15-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-16-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-17-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-18-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-19-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-2-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-20-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-21-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-22-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-23-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-24-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-25-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-26-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-27-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-28-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-29-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-3-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-30-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-31-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-32-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-33-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-34-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-35-of-35.safetensors:   0%|          | 0.00/62.4M [00:00<?, ?B/s]

model-36-of-36.safetensors:   0%|          | 0.00/3.62G [00:00<?, ?B/s]

model-4-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-5-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-6-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-7-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-8-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model-9-of-35.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

modeling_moonshot_kimia.py: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tiktoken.model:   0%|          | 0.00/2.56M [00:00<?, ?B/s]

tokenization_kimia.py: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocoder/model.pt:   0%|          | 0.00/965M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

whisper-large-v3/model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

[32m2026-01-18 15:58:59.614[0m | [1mINFO    [0m | [36mkimia_infer.api.kimia[0m:[36m__init__[0m:[36m25[0m - [1mLooking for resources in /nfs/home/dasaro/.cache/huggingface/hub/models--moonshotai--Kimi-Audio-7B-Instruct/snapshots/9a82a84c37ad9eb1307fb6ed8d7b397862ef9e6b[0m
[32m2026-01-18 15:58:59.615[0m | [1mINFO    [0m | [36mkimia_infer.api.kimia[0m:[36m__init__[0m:[36m26[0m - [1mLoading whisper model[0m
`torch_dtype` is deprecated! Use `dtype` instead!
using normal flash attention


Loading checkpoint shards:   0%|          | 0/36 [00:00<?, ?it/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

[32m2026-01-18 15:59:42.590[0m | [1mINFO    [0m | [36mkimia_infer.api.prompt_manager[0m:[36m__init__[0m:[36m20[0m - [1mLooking for resources in /nfs/home/dasaro/.cache/huggingface/hub/models--moonshotai--Kimi-Audio-7B-Instruct/snapshots/9a82a84c37ad9eb1307fb6ed8d7b397862ef9e6b[0m
[32m2026-01-18 15:59:42.592[0m | [1mINFO    [0m | [36mkimia_infer.api.prompt_manager[0m:[36m__init__[0m:[36m21[0m - [1mLoading whisper model[0m
[32m2026-01-18 15:59:48.021[0m | [1mINFO    [0m | [36mkimia_infer.api.prompt_manager[0m:[36m__init__[0m:[36m30[0m - [1mLoading text tokenizer[0m
[32m2026-01-18 15:59:48.320[0m | [1mINFO    [0m | [36mkimia_infer.api.kimia[0m:[36m__init__[0m:[36m41[0m - [1mLoading detokenizer[0m
Detected CUDA files, patching ldflags
Emitting ninja build file /nfs/home/dasaro/research/FairMLLM-Emotion-Recognition/.kimi_venv/lib/python3.10/site-packages/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/build/build.ninja...
If 

[1/3] /usr/local/cuda/bin/nvcc --generate-dependencies-with-compile --dependency-output anti_alias_activation_cuda.cuda.o.d -DTORCH_EXTENSION_NAME=anti_alias_activation_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /nfs/home/dasaro/research/FairMLLM-Emotion-Recognition/.kimi_venv/lib/python3.10/site-packages/torch/include -isystem /nfs/home/dasaro/research/FairMLLM-Emotion-Recognition/.kimi_venv/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /nfs/home/dasaro/research/FairMLLM-Emotion-Recognition/.kimi_venv/lib/python3.10/site-packages/torch/include/TH -isystem /nfs/home/dasaro/research/FairMLLM-Emotion-Recognition/.kimi_venv/lib/python3.10/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /nfs/home/dasaro/.pyenv/versions/3.10.12/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__

NameError: name 'model' is not defined