```
pip install torch pillow transformers soundfile librosa vector-quantize-pytorch vocos accelerate
```

In [1]:
import torch
from PIL import Image
import os
from transformers import AutoModel, AutoTokenizer

# Load the full omni model. init_vision / init_audio / init_tts all default to True:
#   - vision-only model: set init_audio=False and init_tts=False
#   - audio-only model:  set init_vision=False
path = "/Volumes/SSD/code-ssd/AI_Project/inferfence/model/MiniCPM-o-26"
#path = "model/MiniCPM3-4B"
model = AutoModel.from_pretrained(
    path,
    trust_remote_code=True,
    attn_implementation='sdpa', # sdpa or flash_attention_2
    torch_dtype=torch.bfloat16,
    init_vision=True,
    init_audio=True,
    init_tts=True,
    local_files_only=True
)

# Pick the device by availability instead of hard-coding 'mps', so the cell does
# not crash on machines without the Metal backend. MPS stays first so behaviour
# is unchanged wherever the original cell worked.
# getattr guards against torch builds that predate torch.backends.mps.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True,local_files_only=True)



  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00,  8.00it/s]


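注意：需要修改模型目录中的 modeling_minicpmo.py，把其中的 `cuda` 设备改为 `cpu` 或 `mps`。
(Note: modeling_minicpmo.py in the model directory must be edited so that its `cuda` device references are changed to `cpu` or `mps`.)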

In [2]:

# Initialize the TTS processor and vocos, which are needed in addition to
# what vision-only mode sets up.
model.init_tts()


```
 pip install moviepy==1.0.3 
 pip install transformers==4.44.2
 注意：请安装以上固定版本，否则会出现找不到类的错误。
```

In [6]:
import math
import numpy as np
from PIL import Image
from moviepy.editor import VideoFileClip
import tempfile
import librosa
import soundfile as sf

def get_video_chunk_content(video_path, flatten=True):
    """Split a video into one-second units of (marker, frame, audio chunk).

    The audio track is written to a temporary WAV file and reloaded as a
    16 kHz mono array. For each unit ``i`` in ``range(ceil(duration))`` one
    frame is taken at ``t = i + 1`` and paired with the audio samples
    ``[sr*i, sr*(i+1))``.

    Args:
        video_path: Path of the video file to open with ``VideoFileClip``.
        flatten: If True, return one flat list
            ``["<unit>", image, audio, "<unit>", image, audio, ...]``;
            otherwise return a list of ``["<unit>", image, audio]`` triples.

    Returns:
        A list built as described above, where ``image`` is a ``PIL.Image``
        and ``audio`` is a slice of the loaded audio array (the last slice
        may be shorter than ``sr`` samples).
    """
    video = VideoFileClip(video_path)
    # The clip is always closed, including on errors; the original never
    # released it.
    try:
        print('video_duration:', video.duration)

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_audio_file:
            temp_audio_file_path = temp_audio_file.name
            video.audio.write_audiofile(temp_audio_file_path, codec="pcm_s16le", fps=16000)
            # Load while the temp file still exists (it is deleted on exit).
            audio_np, sr = librosa.load(temp_audio_file_path, sr=16000, mono=True)
        num_units = math.ceil(video.duration)

        # 1 frame + 1s audio chunk
        contents = []
        for i in range(num_units):
            # NOTE(review): for the final unit, i+1 can be at or past the clip
            # duration; kept as in the original - confirm moviepy's behaviour.
            frame = video.get_frame(i+1)
            image = Image.fromarray((frame).astype(np.uint8))
            audio = audio_np[sr*i:sr*(i+1)]
            if flatten:
                contents.extend(["<unit>", image, audio])
            else:
                contents.append(["<unit>", image, audio])
        return contents
    finally:
        video.close()

# The demo assets are read from the local model directory.
path = "/Volumes/SSD/code-ssd/AI_Project/inferfence/model/MiniCPM-o-26"
video_path = f"{path}/assets/Skiing.mp4"

# Voice-clone prompt: load the reference audio as 16 kHz mono and pass it
# to get_sys_prompt via ref_audio.
ref_audio_path = f"{path}/assets/demo.wav"
ref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
sys_msg = model.get_sys_prompt(ref_audio=ref_audio, mode='omni', language='en')
print(sys_msg)
# Alternative: use the default prompt (no ref_audio)
# sys_msg = model.get_sys_prompt(mode='omni', language='en')


{'role': 'user', 'content': ['You are a helpful assistant. You can accept video, audio and text input and output voice and text. Clone the voice in the provided audio prompt.', array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
       -3.2558626e-05, -1.0545147e-05,  2.7744500e-06], dtype=float32), 'As an assistant, you will speak using this voice style.']}


In [None]:

# To save the TTS result, set generate_audio=True and provide output_audio_path.
generate_audio = True
output_audio_path = 'output.wav'

# Conversation: the system prompt followed by a single user turn that carries
# the per-second video units.
contents = get_video_chunk_content(video_path)
msg = dict(role="user", content=contents)
msgs = [sys_msg, msg]

res = model.chat(
    msgs=msgs,
    tokenizer=tokenizer,
    # omni / audio options
    omni_input=True,  # must be True for omni inference
    use_tts_template=True,
    generate_audio=generate_audio,
    output_audio_path=output_audio_path,
    # decoding options
    sampling=True,
    temperature=0.5,
    max_new_tokens=4096,
    # image handling options
    max_slice_nums=1,
    use_image_id=False,
    return_dict=True,
)
print(res)