In [2]:
# Cell 1 — Install dependencies and import libraries

!pip install -q snac ipywebrtc transformers accelerate

from snac import SNAC
import torch
import numpy as np
from IPython.display import display, Audio

from transformers import AutoModelForCausalLM, AutoTokenizer


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/260.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m256.0/260.7 kB[0m [31m8.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m260.7/260.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Cell 2 — Hugging Face login & model loading

import os
from getpass import getpass

from huggingface_hub import login

# Authenticate without embedding the secret in the notebook: prefer the
# HF_TOKEN environment variable and fall back to a hidden interactive prompt,
# so the token never ends up in the saved file or its outputs.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Orpheus TTS model (LLaMA-based)
model_name = "canopylabs/orpheus-3b-0.1-ft"
print("*** Loading Orpheus model:", model_name)

# `dtype` replaces the `torch_dtype` keyword, which the installed transformers
# release reports as deprecated ("`torch_dtype` is deprecated! Use `dtype` instead!").
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.bfloat16,
    device_map=None
).to(device)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Qwen decoder model (runs in place of the Orpheus decoder stack in the hybrid path)
print("*** Loading Qwen decoder...")
qwen_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-0.6B",
    dtype=torch.bfloat16,
    device_map=None
).to(device)

# SNAC codec for audio decode
print("*** Loading SNAC codec...")
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
snac_device = device  # you can set "cpu" here if GPU RAM is tight
snac_model = snac_model.to(snac_device)

# The two hidden sizes differ, which is why bridge projections are needed later.
print("Orpheus hidden size:", model.config.hidden_size)
print("Qwen hidden size:", qwen_model.config.hidden_size)


Using device: cuda
*** Loading Orpheus model: canopylabs/orpheus-3b-0.1-ft


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/898 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.32G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/180 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/5.41M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/22.8M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

*** Loading Qwen decoder...


config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

*** Loading SNAC codec...


config.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/79.5M [00:00<?, ?B/s]

Orpheus hidden size: 3072
Qwen hidden size: 1024


In [4]:
# Cell 3 — Add Orpheus-style custom tokens to the tokenizer

# Number of <custom_token_N> entries to register: 7 blocks of 4096 ids.
# NOTE(review): presumably one block of 4096 codes per position of the
# 7-token audio frame used later in this notebook — confirm against the Orpheus spec.
NUM_CUSTOM = 4096 * 7  # enough for all code streams

# Token names start at index 10, i.e. <custom_token_10> ... <custom_token_{10 + NUM_CUSTOM - 1}>.
custom_tokens = [f"<custom_token_{i}>" for i in range(10, 10 + NUM_CUSTOM)]
# `added` is the value returned by add_tokens; the recorded run printed 0,
# which suggests the checkpoint's tokenizer already contained these tokens.
added = tokenizer.add_tokens(custom_tokens)
print(f"Added {added} <custom_token_...> tokens")

# Resize embedding & LM head to match new vocab size.
# Left as the cell's last expression, so the resulting Embedding module is displayed.
model.resize_token_embeddings(len(tokenizer))


Added 0 <custom_token_...> tokens


Embedding(156940, 3072)

In [5]:
# Cell 4 — Bridge modules and hybrid forward / generate

import torch.nn as nn
from tqdm import trange
ORP_HIDDEN = model.config.hidden_size      # 3072 for Orpheus 3B
QWEN_HIDDEN = qwen_model.config.hidden_size  # 1024 for Qwen 0.6B


class BridgeIn(nn.Module):
    """Linear adapter that maps ORP_HIDDEN-wide features to QWEN_HIDDEN-wide features."""

    def __init__(self):
        super().__init__()
        # Single affine projection; sizes come from the module-level constants.
        self.proj = nn.Linear(in_features=ORP_HIDDEN, out_features=QWEN_HIDDEN)

    def forward(self, x):
        """Apply the projection to the last dimension of `x`."""
        projected = self.proj(x)
        return projected


class BridgeOut(nn.Module):
    """Linear adapter that maps QWEN_HIDDEN-wide features back to ORP_HIDDEN-wide features."""

    def __init__(self):
        super().__init__()
        # Single affine projection; sizes come from the module-level constants.
        self.proj = nn.Linear(in_features=QWEN_HIDDEN, out_features=ORP_HIDDEN)

    def forward(self, x):
        """Apply the projection to the last dimension of `x`."""
        projected = self.proj(x)
        return projected


# Instantiate both adapters on the compute device in bfloat16, the same dtype
# the Orpheus and Qwen models were loaded with.
# NOTE(review): no weights are loaded for these modules in the visible cells,
# so they keep nn.Linear's random initialisation — confirm this is intended.
bridge_in  = BridgeIn().to(device=device, dtype=torch.bfloat16)
bridge_out = BridgeOut().to(device=device, dtype=torch.bfloat16)

@torch.no_grad()
def hybrid_forward(input_ids, attention_mask=None, past_key_values=None, use_cache=True):
    """
    Run one pass of the hybrid stack and return logits from the Orpheus LM head.

    Pipeline: Orpheus token embeddings -> bridge_in -> Qwen model ->
    bridge_out -> Orpheus final norm -> Orpheus LM head.

    Args:
        input_ids: token ids looked up in the Orpheus embedding table
            (assumed to be [batch, seq] — TODO confirm with callers).
        attention_mask: forwarded unchanged to the Qwen model.
        past_key_values: cache object from a previous call, or None.
        use_cache: forwarded unchanged to the Qwen model.

    Returns:
        Tuple (logits, past_key_values): the LM-head output and the cache
        object returned by the Qwen model.
    """
    # Step 1 — Orpheus embeddings
    x = model.model.embed_tokens(input_ids)

    # Step 2 — Bridge into Qwen space
    x = bridge_in(x)

    # Step 3 — Run Qwen on the projected embeddings, with cache handling as requested
    qwen_out = qwen_model(
        inputs_embeds=x,
        attention_mask=attention_mask,
        past_key_values=past_key_values,
        use_cache=use_cache,
        output_hidden_states=True
    )

    # Step 4 — Take the last entry of the returned hidden states
    x = qwen_out.hidden_states[-1]

    # Step 5 — Bridge back to the Orpheus width
    x = bridge_out(x)

    # Step 6 — Orpheus norm + head
    x = model.model.norm(x)
    logits = model.lm_head(x)

    return logits, qwen_out.past_key_values


@torch.no_grad()
def hybrid_generate(
    input_ids,
    attention_mask,
    max_new_tokens=200,
    temperature=1.0,
    eos_token_id=128258,
    **kwargs
):
    """
    Sample tokens autoregressively from hybrid_forward using its KV cache.

    The first step feeds the whole prompt together with its attention mask
    (prefill); every later step feeds only the newly sampled token and a mask
    extended by one column of ones, so the mask always spans cached + new
    positions.

    Args:
        input_ids: prompt token ids, [batch, prompt_len].
        attention_mask: mask matching input_ids (None means "attend to everything").
        max_new_tokens: upper bound on the number of sampled tokens.
        temperature: divisor applied to the logits before softmax.
        eos_token_id: generation stops when every row samples this id in the same step.
        **kwargs: accepted for call-site compatibility and ignored.

    Returns:
        Tensor [batch, prompt_len + n_generated] holding prompt and sampled tokens.
    """
    if attention_mask is None:
        attention_mask = torch.ones_like(input_ids)

    generated = input_ids
    running_mask = attention_mask
    step_input = input_ids  # prefill: the first forward pass must see the whole prompt
    past_kv = None

    for _ in trange(max_new_tokens, desc="Hybrid generation"):
        logits, past_kv = hybrid_forward(
            step_input,
            running_mask,
            past_key_values=past_kv,
            use_cache=True
        )

        next_token_logits = logits[:, -1, :]

        probs = torch.softmax(next_token_logits / temperature, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)

        generated = torch.cat([generated, next_token], dim=1)

        # Grow the mask by one attended position and feed only the new token next time.
        new_column = running_mask.new_ones((running_mask.shape[0], 1))
        running_mask = torch.cat([running_mask, new_column], dim=1)
        step_input = next_token

        if (next_token == eos_token_id).all():
            break

    return generated



# def hybrid_generate(input_ids, attention_mask, max_new_tokens=200, temperature=0.7, top_p=0.95):
#     """
#     Simple autoregressive loop using hybrid_forward.
#     """
#     generated = input_ids.clone()
#     attn_mask = attention_mask.clone()

#     for _ in range(max_new_tokens):
#         logits = hybrid_forward(generated, attn_mask)

#         # logits for last time step
#         next_token_logits = logits[:, -1, :]

#         # sample next token
#         probs = torch.softmax(next_token_logits / temperature, dim=-1)
#         next_token = torch.multinomial(probs, num_samples=1)  # [B, 1]

#         # append token and extend attention mask
#         generated = torch.cat([generated, next_token], dim=1)
#         new_mask = torch.ones((attn_mask.shape[0], 1), dtype=attn_mask.dtype, device=attn_mask.device)
#         attn_mask = torch.cat([attn_mask, new_mask], dim=1)

#     return generated


In [6]:
# Cell 4b — Hybrid forward + KV-cached generation (UPDATED; redefines and supersedes the Cell 4 definitions above)

import torch.nn as nn
from tqdm import trange

# Hidden widths read from the loaded model configs; the bridge layers below
# use them as input/output feature sizes.
ORP_HIDDEN = model.config.hidden_size
QWEN_HIDDEN = qwen_model.config.hidden_size


class BridgeIn(nn.Module):
    """Down-projection adapter: ORP_HIDDEN input features to QWEN_HIDDEN output features."""

    def __init__(self):
        super().__init__()
        # One learnable affine layer sized by the module-level hidden-size constants.
        self.proj = nn.Linear(in_features=ORP_HIDDEN, out_features=QWEN_HIDDEN)

    def forward(self, x):
        """Return `x` passed through the projection layer."""
        out = self.proj(x)
        return out


class BridgeOut(nn.Module):
    """Up-projection adapter: QWEN_HIDDEN input features to ORP_HIDDEN output features."""

    def __init__(self):
        super().__init__()
        # One learnable affine layer sized by the module-level hidden-size constants.
        self.proj = nn.Linear(in_features=QWEN_HIDDEN, out_features=ORP_HIDDEN)

    def forward(self, x):
        """Return `x` passed through the projection layer."""
        out = self.proj(x)
        return out


# Re-create both adapters on the compute device in bfloat16.
# NOTE(review): running this cell replaces any previously created
# bridge_in / bridge_out with freshly initialised modules — confirm that is intended.
bridge_in  = BridgeIn().to(device=device, dtype=torch.bfloat16)
bridge_out = BridgeOut().to(device=device, dtype=torch.bfloat16)


@torch.no_grad()
def hybrid_forward(input_ids, attention_mask=None, past_key_values=None, use_cache=True):
    """
    One pass through the hybrid stack.

    Orpheus embeddings are projected by bridge_in, processed by the Qwen
    model, projected back by bridge_out, then passed through the Orpheus
    final norm and LM head.

    Returns:
        Tuple (logits, past_key_values) where past_key_values is the cache
        object returned by the Qwen model.
    """
    # Embed with Orpheus and move into the Qwen hidden width.
    qwen_inputs = bridge_in(model.model.embed_tokens(input_ids))

    # Qwen consumes embeddings directly; hidden states are requested so the
    # final one can be read back below.
    decoder_out = qwen_model(
        inputs_embeds=qwen_inputs,
        attention_mask=attention_mask,
        past_key_values=past_key_values,
        use_cache=use_cache,
        output_hidden_states=True,
    )

    final_hidden = decoder_out.hidden_states[-1]

    # Back to the Orpheus width, then Orpheus norm + LM head.
    orpheus_hidden = model.model.norm(bridge_out(final_hidden))
    return model.lm_head(orpheus_hidden), decoder_out.past_key_values


@torch.no_grad()
def hybrid_generate(
    input_ids,
    attention_mask,
    max_new_tokens=200,
    temperature=0.7,
    top_p=0.95,
    eos_token_id=128258
):
    """
    Sample tokens autoregressively from hybrid_forward using its KV cache.

    Step 0 is a prefill: the whole prompt and its attention mask are fed, so
    the cache actually contains the prompt. Each later step feeds only the
    newly sampled token together with the mask extended by one column of
    ones (mask width == cached positions + new position).

    Args:
        input_ids: prompt token ids, [batch, prompt_len].
        attention_mask: mask matching input_ids; None means no padding.
        max_new_tokens: upper bound on the number of sampled tokens.
        temperature: softmax temperature; None or <= 0 selects greedy argmax.
        top_p: nucleus-sampling threshold; values outside (0, 1) disable filtering.
        eos_token_id: id that marks a row as finished (None disables early stop).

    Returns:
        Tensor [batch, prompt_len + n_generated]. Rows that finished early are
        filled with eos_token_id until every row has finished.

    NOTE(review): hybrid_forward does not pass position ids, so how positions
    are assigned to left-padded rows is left to the Qwen model — TODO confirm.
    """
    if attention_mask is None:
        attention_mask = torch.ones_like(input_ids)

    batch_size = input_ids.shape[0]
    generated = input_ids
    running_mask = attention_mask
    step_input = input_ids  # prefill: first forward pass sees the whole prompt
    past_kv = None
    finished = torch.zeros(batch_size, dtype=torch.bool, device=input_ids.device)

    for _ in trange(max_new_tokens, desc="Hybrid generation"):
        logits, past_kv = hybrid_forward(
            step_input,
            running_mask,
            past_key_values=past_kv,
            use_cache=True
        )

        # Sample in float32: bfloat16 probabilities are too coarse for a large vocabulary.
        next_token_logits = logits[:, -1, :].float()

        if temperature is None or temperature <= 0:
            next_token = next_token_logits.argmax(dim=-1, keepdim=True)
        else:
            probs = torch.softmax(next_token_logits / temperature, dim=-1)

            if top_p is not None and 0.0 < top_p < 1.0:
                # Nucleus filter: keep the smallest set of most-likely tokens whose
                # mass reaches top_p. The top token has zero mass before it, so it
                # is always kept. multinomial accepts unnormalised weights.
                sorted_probs, sorted_idx = torch.sort(probs, dim=-1, descending=True)
                mass_before = torch.cumsum(sorted_probs, dim=-1) - sorted_probs
                sorted_probs = sorted_probs.masked_fill(mass_before >= top_p, 0.0)
                probs = torch.zeros_like(probs).scatter_(-1, sorted_idx, sorted_probs)

            next_token = torch.multinomial(probs, num_samples=1)

        if eos_token_id is not None:
            # Rows that already ended keep emitting EOS instead of new samples.
            eos_fill = torch.full_like(next_token, eos_token_id)
            next_token = torch.where(finished.unsqueeze(1), eos_fill, next_token)
            finished = finished | (next_token.squeeze(1) == eos_token_id)

        generated = torch.cat([generated, next_token], dim=1)

        # Grow the mask by one attended position and feed only the new token next time.
        new_column = running_mask.new_ones((batch_size, 1))
        running_mask = torch.cat([running_mask, new_column], dim=1)
        step_input = next_token

        if eos_token_id is not None and bool(finished.all()):
            break

    return generated


In [7]:
# Cell 5 — Prepare Orpheus prompts and generate tokens via hybrid model

chosen_voice = "tara"

prompts = [
    "Hey there my name is Tara, <chuckle> and I'm a speech generation model that can sound like a person.",
    "I've also been taught to understand and produce paralinguistic things like sighing, or chuckling, or yawning!",
    "I live in San Francisco, and have, uhm let's see, 3 billion 7 hundred ... well, lets just say a lot of parameters.",
]

# Every prompt is prefixed with the speaker name.
prompts = [f"{chosen_voice}: " + text for text in prompts]

# Tokenize each prompt on its own; lengths differ, padding happens further down.
all_input_ids = [tokenizer(text, return_tensors="pt").input_ids for text in prompts]

start_token = torch.tensor([[128259]], dtype=torch.int64)            # SOH
end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)     # EOT, EOH

# Wrap each tokenized prompt as: SOH + prompt ids + EOT + EOH.
all_modified = [
    torch.cat([start_token, prompt_ids, end_tokens], dim=1)
    for prompt_ids in all_input_ids
]

# Left-pad every sequence to the longest one and build the matching masks
# (0 over padding, 1 over real tokens).
max_len = max(seq.shape[1] for seq in all_modified)

all_padded = []
all_masks = []

for seq in all_modified:
    seq_len = seq.shape[1]
    n_pad = max_len - seq_len

    left_pad = torch.full((1, n_pad), 128263, dtype=torch.int64)  # PAD
    all_padded.append(torch.cat([left_pad, seq], dim=1))

    all_masks.append(
        torch.cat(
            [
                torch.zeros((1, n_pad), dtype=torch.int64),
                torch.ones((1, seq_len), dtype=torch.int64),
            ],
            dim=1,
        )
    )

input_ids = torch.cat(all_padded, dim=0).to(device)
attention_mask = torch.cat(all_masks, dim=0).to(device)

print("Input IDs shape:", input_ids.shape)
print("Attention mask shape:", attention_mask.shape)

print("*** Generating with hybrid model (Orpheus + Qwen)...")

with torch.no_grad():
    generated_ids = hybrid_generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=200,
        temperature=0.7,
        top_p=0.95,
    )

print("Generated IDs shape:", generated_ids.shape)


Input IDs shape: torch.Size([3, 39])
Attention mask shape: torch.Size([3, 39])
*** Generating with hybrid model (Orpheus + Qwen)...


Hybrid generation: 100%|██████████| 200/200 [00:18<00:00, 10.98it/s]

Generated IDs shape: torch.Size([3, 239])





In [8]:
# Cell 6 — Map generated tokens to SNAC-like integer codes (gibberish, but valid)

@torch.no_grad()
def tokens_to_snac_multiframe(row_ids):
    """
    Turn a tensor of token IDs into a flat list of ints in [0, 4095].

    The input is flattened, cut down to the largest multiple of 7 (one
    7-token frame per group), and each ID is reduced modulo 4096.
    Returns None when fewer than 7 IDs are available.
    """
    flat_ids = row_ids.detach().cpu().flatten()
    total = flat_ids.shape[0]

    # Not even one complete 7-token frame.
    if total < 7:
        return None

    # Drop the trailing partial frame.
    usable = total - (total % 7)

    # Fold arbitrary token IDs into the 4096-entry code range.
    return [int(value) % 4096 for value in flat_ids[:usable].tolist()]

print("Converting generated IDs to SNAC-like code streams...")

# One code list per batch row; rows too short to form a frame (None) are dropped.
all_multiframes = [
    frames
    for frames in (tokens_to_snac_multiframe(row) for row in generated_ids)
    if frames is not None
]

print(f"Built {len(all_multiframes)} SNAC-style multiframe sequences.")


Converting generated IDs to SNAC-like code streams...
Built 3 SNAC-style multiframe sequences.


In [9]:
# Cell 7 — Rebuild SNAC code streams (codes_0, codes_1, codes_2) and decode to audio

@torch.no_grad()
def multiframe_to_audio(multiframe):
    """
    Split a flat list of codes into three SNAC code streams and decode them.

    The list is read as consecutive 7-value frames (a trailing partial frame
    is ignored). Within each frame:
        position 0          -> stream 0 (1 code per frame)
        positions 1, 4      -> stream 1 (2 codes per frame)
        positions 2, 3, 5, 6 -> stream 2 (4 codes per frame)

    Args:
        multiframe: sequence of integer codes.

    Returns:
        The value returned by snac_model.decode for the three [1, N] int32
        code tensors, or None when there is no complete frame or any used
        code lies outside [0, 4095].
    """
    num_frames = len(multiframe) // 7
    if num_frames == 0:
        return None

    used = multiframe[:num_frames * 7]

    # Validate on the Python side: avoids building tensors for invalid input
    # and avoids device round-trips just to check the range.
    if any(code < 0 or code > 4095 for code in used):
        return None

    # One tensor for all frames (row = frame, column = position in frame)
    # instead of concatenating one-element tensors per code.
    frames = torch.tensor(used, device=snac_device, dtype=torch.int32).view(num_frames, 7)

    codes = [
        frames[:, 0].contiguous().unsqueeze(0),   # layer 0: position 0
        frames[:, [1, 4]].reshape(1, -1),         # layer 1: positions 1 and 4
        frames[:, [2, 3, 5, 6]].reshape(1, -1),   # layer 2: positions 2, 3, 5, 6
    ]

    audio_hat = snac_model.decode(codes)  # [1, T] or [1, 1, T]
    return audio_hat


print("Decoding SNAC gibberish audio from hybrid tokens...")

# Decode every code sequence in order; sequences that fail to decode (None) are skipped.
my_samples = [
    clip
    for clip in (multiframe_to_audio(frames) for frames in all_multiframes)
    if clip is not None
]

print(f"Got {len(my_samples)} audio samples from {len(all_multiframes)} sequences.")


Decoding SNAC gibberish audio from hybrid tokens...
Got 3 audio samples from 3 sequences.


In [10]:
# Cell 8 — Play resulting audio (noise / gibberish, but valid waveform)

# Show an inline player for each decoded clip, labelled with the prompt at the same index.
for clip_idx in range(len(my_samples)):
    print(f"\nPrompt {clip_idx}: {prompts[clip_idx]}")
    samples_np = my_samples[clip_idx].squeeze().detach().cpu().numpy()
    display(Audio(samples_np, rate=24000))



Prompt 0: tara: Hey there my name is Tara, <chuckle> and I'm a speech generation model that can sound like a person.



Prompt 1: tara: I've also been taught to understand and produce paralinguistic things like sighing, or chuckling, or yawning!



Prompt 2: tara: I live in San Francisco, and have, uhm let's see, 3 billion 7 hundred ... well, lets just say a lot of parameters.


In [17]:
import time
import torch

def measure_inference(fn, warmup=5, runs=20):
    """
    Time repeated calls of `fn` and return (mean_seconds, per_run_seconds).

    Args:
        fn: zero-argument callable to benchmark; its return value is discarded.
        warmup: number of untimed calls made first (kernel and cache warm-up).
        runs: number of timed calls; must be at least 1.

    Returns:
        Tuple (average, times): the arithmetic mean of the timed runs and the
        list of individual durations in seconds.

    Raises:
        ValueError: if `runs` is smaller than 1 (there would be nothing to average).
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")

    # Only synchronise when CUDA exists, so the helper also works on the CPU
    # fallback; synchronising makes the timer cover queued GPU work.
    use_cuda = torch.cuda.is_available()

    def _sync():
        if use_cuda:
            torch.cuda.synchronize()

    # Warmup (GPU kernels + cache)
    for _ in range(warmup):
        fn()
        _sync()

    times = []

    for _ in range(runs):
        _sync()
        # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
        start = time.perf_counter()

        fn()

        _sync()
        times.append(time.perf_counter() - start)

    return sum(times) / len(times), times


In [14]:
!pip install tqdm




In [30]:
def run_hybrid():
    """Generate up to 200 new tokens with hybrid_generate on the notebook-level prompt batch."""
    generation_args = dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=200,
    )
    return hybrid_generate(**generation_args)

# Benchmark token generation only (run_hybrid does no SNAC decoding), using
# measure_inference's default warm-up and run counts.
avg_time, logs = measure_inference(run_hybrid)

print(f"Hybrid model avg inference time: {avg_time:.4f} seconds")


Hybrid generation: 100%|██████████| 200/200 [00:19<00:00, 10.14it/s]
Hybrid generation: 100%|██████████| 200/200 [00:20<00:00,  9.59it/s]
Hybrid generation: 100%|██████████| 200/200 [00:20<00:00,  9.98it/s]
Hybrid generation: 100%|██████████| 200/200 [00:18<00:00, 10.58it/s]
Hybrid generation: 100%|██████████| 200/200 [00:18<00:00, 10.89it/s]
Hybrid generation: 100%|██████████| 200/200 [00:20<00:00,  9.95it/s]
Hybrid generation: 100%|██████████| 200/200 [00:18<00:00, 10.73it/s]
Hybrid generation: 100%|██████████| 200/200 [00:21<00:00,  9.48it/s]
Hybrid generation: 100%|██████████| 200/200 [00:18<00:00, 10.87it/s]
Hybrid generation: 100%|██████████| 200/200 [00:19<00:00, 10.36it/s]
Hybrid generation: 100%|██████████| 200/200 [00:19<00:00, 10.51it/s]
Hybrid generation: 100%|██████████| 200/200 [00:24<00:00,  8.14it/s]
Hybrid generation: 100%|██████████| 200/200 [00:20<00:00,  9.96it/s]
Hybrid generation: 100%|██████████| 200/200 [00:20<00:00,  9.86it/s]
Hybrid generation: 100%|██████████

Hybrid model avg inference time: 21.1933 seconds





In [15]:
def run_orpheus():
    """Baseline: sample up to 200 new tokens with the unmodified Orpheus model's generate()."""
    generation_args = dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
    )
    return model.generate(**generation_args)

# Benchmark the baseline Orpheus generation (tokens only, no SNAC decoding).
# NOTE(review): the recorded execution of this cell ended in KeyboardInterrupt,
# so no baseline figure was produced by this run.
avg_time_orpheus, _ = measure_inference(run_orpheus)

print(f"Original Orpheus avg time: {avg_time_orpheus:.4f} seconds")


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


KeyboardInterrupt: 

In [18]:
import time
import torch

def run_full_pipeline():
    """
    End-to-end hybrid run: generate tokens, frame them as SNAC codes, decode to audio.

    Returns the list of decoded audio objects; rows that cannot be framed or
    decoded (None results) are left out.
    """
    with torch.no_grad():
        # Step 1 — token generation with the hybrid model
        gen_ids = hybrid_generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=200,
        )

        # Step 2 — per-row conversion to flat SNAC code lists
        framed = (tokens_to_snac_multiframe(gen_ids[row]) for row in range(gen_ids.shape[0]))
        all_multiframes = [codes for codes in framed if codes is not None]

        # Step 3 — decode each code list to audio
        decoded = (multiframe_to_audio(codes) for codes in all_multiframes)
        audios = [clip for clip in decoded if clip is not None]

    return audios


In [19]:
# Benchmark the hybrid path end to end (token generation + SNAC framing + decode).
# Note: this rebinds avg_time / logs, which an earlier benchmark cell also assigns.
avg_time, logs = measure_inference(run_full_pipeline)

print(f"Full pipeline (LLM + SNAC) avg time: {avg_time:.4f} seconds")


Hybrid generation: 100%|██████████| 200/200 [00:19<00:00, 10.24it/s]
Hybrid generation: 100%|██████████| 200/200 [00:27<00:00,  7.20it/s]
Hybrid generation: 100%|██████████| 200/200 [00:24<00:00,  8.25it/s]
Hybrid generation: 100%|██████████| 200/200 [00:17<00:00, 11.25it/s]
Hybrid generation: 100%|██████████| 200/200 [00:17<00:00, 11.38it/s]
Hybrid generation: 100%|██████████| 200/200 [00:17<00:00, 11.62it/s]
Hybrid generation: 100%|██████████| 200/200 [00:17<00:00, 11.45it/s]
Hybrid generation: 100%|██████████| 200/200 [00:19<00:00, 10.04it/s]
Hybrid generation: 100%|██████████| 200/200 [00:17<00:00, 11.37it/s]
Hybrid generation: 100%|██████████| 200/200 [00:21<00:00,  9.12it/s]
Hybrid generation: 100%|██████████| 200/200 [00:17<00:00, 11.45it/s]
Hybrid generation: 100%|██████████| 200/200 [00:17<00:00, 11.43it/s]
Hybrid generation: 100%|██████████| 200/200 [00:17<00:00, 11.15it/s]
Hybrid generation: 100%|██████████| 200/200 [00:17<00:00, 11.52it/s]
Hybrid generation: 100%|██████████

Full pipeline (LLM + SNAC) avg time: 17.9203 seconds


In [20]:
def run_full_orpheus():
    """
    End-to-end baseline run: Orpheus generate(), SNAC framing, then audio decode.

    Returns the list of decoded audio objects; rows that cannot be framed or
    decoded (None results) are left out.
    """
    with torch.no_grad():
        # Token generation with the unmodified Orpheus model
        gen_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
        )

        # Per-row conversion to flat SNAC code lists
        framed = (tokens_to_snac_multiframe(gen_ids[row]) for row in range(gen_ids.shape[0]))
        all_multiframes = [codes for codes in framed if codes is not None]

        # Decode each code list to audio
        decoded = (multiframe_to_audio(codes) for codes in all_multiframes)
        audios = [clip for clip in decoded if clip is not None]

    return audios


In [22]:
# Benchmark the baseline Orpheus path end to end (generate + SNAC framing + decode),
# for comparison with the hybrid full-pipeline timing.
avg_time_orp, _ = measure_inference(run_full_orpheus)

print(f"Original Orpheus full pipeline avg time: {avg_time_orp:.4f} seconds")


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

Original Orpheus full pipeline avg time: 41.4967 seconds
