# Inference with NVILA

## Install this package

```
git clone git@github.com:kehanlu/DeSTA2.git
cd DeSTA2
pip install -e .
```


In [None]:
import os
from huggingface_hub import snapshot_download, hf_hub_download

## Download LLM ##
# Only the "llm" sub-folder of the NVILA snapshot is used below, so the
# download is restricted to it instead of fetching every file in the repo.
path = snapshot_download(
    repo_id="Efficient-Large-Model/NVILA-8B",
    cache_dir="/NeMo/.cache",
    allow_patterns=["llm/*"],
)
llm_path = os.path.join(path, "llm")  # We only use the LLM part of the model
print(llm_path)

## Download Qformer ckpt ##
# Two checkpoints are published; swap the filename to use the 10-epoch one:
# qformer_ckpt_path = hf_hub_download(repo_id="kehanlu/share", filename="DeSTA2/250218-09-desta-nvila-8B.pth") # from 10 epoch
qformer_ckpt_path = hf_hub_download(
    repo_id="kehanlu/share",
    filename="DeSTA2/250218-09-desta-nvila-8B-5ep.pth",  # from 5 epoch
)
print(qformer_ckpt_path)



In [None]:
from desta.modeling_desta import Desta2Config, DestaModel
import torch
# Configure DeSTA2 with the downloaded NVILA LLM directory and the
# whisper-medium checkpoint, then instantiate the model.
config = Desta2Config(
    llama_model_id=llm_path,
    whisper_model_id="openai/whisper-medium",
    prompt_size=64,
)
model = DestaModel(config)

# Load the connector weights onto the CPU first: without map_location,
# torch.load restores tensors to the device they were saved from, which fails
# (or wastes GPU memory) when that device does not match this machine.
# load_state_dict copies the values into the existing parameters, and the
# whole model is moved to the target device afterwards.
# NOTE(review): torch.load unpickles the downloaded file; only load
# checkpoints from a source you trust (consider weights_only=True if the
# installed PyTorch version supports it and the file is a plain state dict).
connector_state = torch.load(qformer_ckpt_path, map_location="cpu")
model.speech_perception.connector.load_state_dict(connector_state)

# Keep the original target ("cuda:1") when a second GPU exists; otherwise fall
# back so the notebook does not crash on single-GPU or CPU-only machines.
# NOTE(review): CPU inference is untested here — confirm it is supported.
if torch.cuda.device_count() > 1:
    device = "cuda:1"
elif torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
model.to(device)

In [None]:
# The system prompt can be changed freely.
# Audio is referenced inline in the user message as: [[Audio:/path/to/audio]]

system_prompt = "Follow the instruction."
# Spelled out line by line so the whitespace-only middle line stays visible.
user_prompt = (
    "[[Audio:/NeMo/workspace/DeSTA2/assets/audios/4_0_d47.wav]]\n"
    "             \n"
    "Transcribe the speech.\n"
)

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]

# Generation settings forwarded to chat_multiple_audios; sampling is disabled.
generation_kwargs = dict(
    do_sample=False,
    temperature=1.0,
    top_p=1.0,
    max_new_tokens=256,
    audio_template="<start_audio>!!!!!!!!{transcription}<end_audio>",
    audio_placeholder="!!!!!!!!",
)

outputs = model.chat_multiple_audios(messages, **generation_kwargs)

# Decode the first returned sequence back to text and show it.
decoded_text = model.tokenizer.decode(outputs[0])
print(decoded_text)