In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import torch
from vllm import AsyncLLMEngine, LLM, SamplingParams
from transformers import AutoTokenizer
from snac import SNAC
from orpheus_vllm import special_tokens as ST

In [7]:
# --- Run configuration -------------------------------------------------------
# Execution target and numeric precision handed to the vLLM engine.
device = "cuda"
dtype = torch.bfloat16

# Local directory holding the merged checkpoint that vLLM loads.
model_path = "/mnt/d/OrpheusTTS-checkpoints/merged_checkpoint"

# Hugging Face repo id of the tokenizer. The model-loading cell later rebinds
# this same name to the loaded tokenizer object.
tokenizer = "canopylabs/orpheus-3b-0.1-ft"

In [37]:
# This cell was executed late in the session but sits above the cell that
# creates `snac_model`, so on a fresh kernel (Restart & Run All) the name does
# not exist yet. Guard the move so the cell is a harmless no-op in that case;
# the model-loading cell already places the vocoder on the CPU.
if "snac_model" in globals():
    snac_model = snac_model.to("cpu")

In [8]:
# Build the vLLM engine from the merged checkpoint using the configured
# precision and device.
vllm_model = LLM(
    model=model_path,
    max_model_len=8192,
    dtype=dtype,
    device=device,
)

# `tokenizer` starts out as a repo-id string and is rebound to the loaded
# tokenizer object here. Only load while it is still a string, so re-running
# this cell does not pass an already-loaded tokenizer to `from_pretrained`.
if isinstance(tokenizer, str):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer)

# SNAC vocoder used to turn generated codes into audio; kept on the CPU.
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cpu")

# Special-token ids wrapped around every tokenized prompt (prefix / suffix).
start_token = [ST.SOH]
end_tokens = [ST.EOT, ST.EOH, ST.SOA, ST.SOS]

INFO 10-21 10:15:34 __init__.py:207] Automatically detected platform cuda.


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 10-21 10:15:41 config.py:549] This model supports multiple tasks: {'generate', 'embed', 'classify', 'score', 'reward'}. Defaulting to 'generate'.
INFO 10-21 10:15:41 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/mnt/d/OrpheusTTS-checkpoints/merged_checkpoint', speculative_config=None, tokenizer='/mnt/d/OrpheusTTS-checkpoints/merged_checkpoint', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_m

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 10-21 10:16:11 model_runner.py:1115] Loading model weights took 6.1801 GB
INFO 10-21 10:16:13 worker.py:267] Memory profiling takes 1.62 seconds
INFO 10-21 10:16:13 worker.py:267] the current vLLM instance can use total_gpu_memory (11.99GiB) x gpu_memory_utilization (0.90) = 10.79GiB
INFO 10-21 10:16:13 worker.py:267] model weights take 6.18GiB; non_torch_memory takes 0.04GiB; PyTorch activation peak memory takes 1.48GiB; the rest of the memory reserved for KV Cache is 3.10GiB.
INFO 10-21 10:16:13 executor_base.py:111] # cuda blocks: 1812, # CPU blocks: 2340
INFO 10-21 10:16:13 executor_base.py:116] Maximum concurrency for 8192 tokens per request: 3.54x
INFO 10-21 10:16:13 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utili

Capturing CUDA graph shapes: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:16<00:00,  2.11it/s]

INFO 10-21 10:16:30 model_runner.py:1562] Graph capturing finished in 17 secs, took 0.02 GiB
INFO 10-21 10:16:30 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 19.15 seconds



  state_dict = torch.load(model_path, map_location="cpu")


In [15]:
# Test prompts: the same word repeated one to four times.
t1 = "Бишкек"
t2 = "Бишкек Бишкек"
t3 = "Бишкек Бишкек Бишкек"
t4 = "Бишкек Бишкек Бишкек Бишкек"

prompts = [t1, t2, t3, t4]

# Tokenize to plain Python lists (no padding/truncation, no tensors) so each
# prompt keeps its own length.
input_ids = tokenizer(
    prompts,
    add_special_tokens=True,
    padding=False,
    truncation=False,
    return_tensors=None,
)["input_ids"]

# Wrap every prompt with the special-token prefix and suffix.
input_ids = [start_token + ids + end_tokens for ids in input_ids]

sampling_params = SamplingParams(
    n=1,  # num_return_sequences
    temperature=0.6,
    top_p=0.95,
    max_tokens=1200,  # max_new_tokens
    stop_token_ids=[ST.EOS],  # eos_token_id
    repetition_penalty=1.1,
    detokenize=False,  # only token ids are needed downstream
)

# Pre-tokenized prompts are passed through `prompts` as token-prompt dicts;
# the separate `prompt_token_ids=` keyword is the deprecated legacy form.
generated_ids = vllm_model.generate(
    prompts=[{"prompt_token_ids": ids} for ids in input_ids],
    sampling_params=sampling_params,
)

Processed prompts: 100%|███████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00,  1.15s/it, est. speed input: 13.96 toks/s, output: 127.57 toks/s]


In [29]:
# Marker token ids used when post-processing generated sequences: everything
# up to the last SOS is discarded and EOS tokens are filtered out.
sos_token, eos_token = ST.SOS, ST.EOS

# Value subtracted from every generated token id before the per-slot layer
# offsets are removed.
# NOTE(review): presumably the vocabulary id of the first audio code — confirm.
codec_offset = 128266

In [34]:
def _redistribute_and_decode(normalized_tokens):
    """
    Helper function to demultiplex the flat list of codes into the
    three layers required by the SNAC vocoder and decode to audio.
    """
    layer_1, layer_2, layer_3 = [], [], []
    
    # The number of 7-token blocks
    num_blocks = len(normalized_tokens) // 7

    for i in range(num_blocks):
        base_idx = 7 * i
        layer_1.append(normalized_tokens[base_idx])
        layer_2.append(normalized_tokens[base_idx + 1] - 4096)
        layer_3.append(normalized_tokens[base_idx + 2] - (2 * 4096))
        layer_3.append(normalized_tokens[base_idx + 3] - (3 * 4096))
        layer_2.append(normalized_tokens[base_idx + 4] - (4 * 4096))
        layer_3.append(normalized_tokens[base_idx + 5] - (5 * 4096))
        layer_3.append(normalized_tokens[base_idx + 6] - (6 * 4096))
        
    # Convert the Python lists to the required tensor format for the vocoder
    codes = [
        torch.tensor(layer_1).unsqueeze(0),
        torch.tensor(layer_2).unsqueeze(0),
        torch.tensor(layer_3).unsqueeze(0)
    ]
    
    with torch.no_grad(): # Good practice when running inference
        audio_hat = snac_model.decode(codes)
    return audio_hat

In [39]:
# Convert every generation result into an audio tensor, in request order.
batch_audio = []
for request_output in generated_ids:
    # Raw token ids of the first (only) completion of this request.
    token_seq = request_output.outputs[0].token_ids

    # Scan backwards for the last SOS marker; the audio codes start right
    # after it. Without any SOS the whole sequence is used.
    tail_start = 0
    for pos in range(len(token_seq) - 1, -1, -1):
        if token_seq[pos] == sos_token:
            tail_start = pos + 1
            break

    # Drop every EOS token from the tail.
    audio_tokens = [tok for tok in token_seq[tail_start:] if tok != eos_token]

    # The codec consumes codes in groups of 7: keep only complete groups.
    usable_len = len(audio_tokens) - (len(audio_tokens) % 7)
    if usable_len == 0:
        # Not even one complete group: record an empty placeholder instead.
        batch_audio.append(torch.zeros((1, 0)))
        continue

    # Shift token ids down by the codec offset, then demultiplex and decode.
    codes = [tok - codec_offset for tok in audio_tokens[:usable_len]]
    batch_audio.append(_redistribute_and_decode(codes))

In [41]:
#@title Display Audio
from IPython.display import display, Audio

# Show each prompt followed by a player for its synthesized audio (24 kHz).
for i, samples in enumerate(batch_audio):
    print(prompts[i])
    waveform = samples.detach().squeeze().to("cpu").numpy()
    if waveform.size == 0:
        # The decode loop stores an empty placeholder when a generation yields
        # no complete code frame; an empty array cannot be rendered as audio.
        print("(no audio generated)")
        continue
    display(Audio(waveform, rate=24000))

Бишкек


Бишкек Бишкек


Бишкек Бишкек Бишкек


Бишкек Бишкек Бишкек Бишкек


In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
# NOTE(review): the execution counter restarts at this cell — this looks like
# the start of a separate session/notebook; confirm it is meant to live here.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from orpheus_vllm import OrpheusOfflineModel

In [3]:
# Text to convert to speech
text = "Бишкек Кыргызстандын борбору жана эң чоң шаары болуп саналат."

# Checkpoint location and engine construction. The device falls back to the
# CPU when CUDA is not available.
model_path = "/mnt/d/OrpheusTTS-checkpoints/merged_checkpoint"  # Update this path
model = OrpheusOfflineModel(
    tokenizer='canopylabs/orpheus-3b-0.1-ft',
    device="cuda" if torch.cuda.is_available() else "cpu",
    dtype=torch.bfloat16,
    model_path=model_path,
)

INFO 10-21 11:11:53 __init__.py:207] Automatically detected platform cuda.


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 10-21 11:11:58 config.py:549] This model supports multiple tasks: {'classify', 'reward', 'embed', 'score', 'generate'}. Defaulting to 'generate'.
INFO 10-21 11:11:58 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/mnt/d/OrpheusTTS-checkpoints/merged_checkpoint', speculative_config=None, tokenizer='/mnt/d/OrpheusTTS-checkpoints/merged_checkpoint', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_m

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 10-21 11:12:27 model_runner.py:1115] Loading model weights took 6.1801 GB
INFO 10-21 11:12:29 worker.py:267] Memory profiling takes 1.48 seconds
INFO 10-21 11:12:29 worker.py:267] the current vLLM instance can use total_gpu_memory (11.99GiB) x gpu_memory_utilization (0.90) = 10.79GiB
INFO 10-21 11:12:29 worker.py:267] model weights take 6.18GiB; non_torch_memory takes 0.04GiB; PyTorch activation peak memory takes 1.48GiB; the rest of the memory reserved for KV Cache is 3.10GiB.
INFO 10-21 11:12:29 executor_base.py:111] # cuda blocks: 1812, # CPU blocks: 2340
INFO 10-21 11:12:29 executor_base.py:116] Maximum concurrency for 8192 tokens per request: 3.54x
INFO 10-21 11:12:30 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utili

Capturing CUDA graph shapes: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:14<00:00,  2.34it/s]

INFO 10-21 11:12:45 model_runner.py:1562] Graph capturing finished in 15 secs, took -0.03 GiB
INFO 10-21 11:12:45 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 17.33 seconds



  state_dict = torch.load(model_path, map_location="cpu")


In [4]:
# Run text-to-speech through the OrpheusOfflineModel wrapper in three chained
# steps: prepare prompts from `text`, generate, then parse the output as speech.
# NOTE(review): the return types of these methods are defined in orpheus_vllm
# and are not visible here; the display cell below treats each item of
# `batch_audio` as a torch tensor — confirm against the module.
ids = model.prepare_prompts(text)
generated_ids = model.generate(ids)
batch_audio = model.parse_output_as_speech(generated_ids)

  generated_ids = model.generate(ids)
Processed prompts: 100%|███████████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.61s/it, est. speed input: 29.74 toks/s, output: 197.93 toks/s]


In [None]:
# Ad-hoc inspection of the parsed audio (this cell has no execution count).
batch_audio

In [5]:
from IPython.display import display, Audio

# Render one audio player per generated clip at a 24 kHz sample rate.
for samples in batch_audio:
    waveform = samples.detach().squeeze().to("cpu").numpy()
    display(Audio(waveform, rate=24000))