In [None]:
# --- Install ---
!pip install "transformers==4.50.1" librosa
# (optional, GPU speedup)
!pip install flash-attn --no-build-isolation

# --- Imports ---
import torch, librosa
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

Collecting flash-attn
  Downloading flash_attn-2.8.3.tar.gz (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m85.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.8.3-cp312-cp312-linux_x86_64.whl size=256040057 sha256=f25da18657a87fc83dc1bfb8b7751b82246e9db355510226b674fd437c34b5fb
  Stored in directory: /root/.cache/pip/wheels/3d/59/46/f282c12c73dd4bb3c2e3fe199f1a0d0f8cec06df0cccfeee27
Successfully built flash-attn
Installing collected packages: flash-attn
Successfully installed flash-attn-2.8.3


In [None]:
# --- Configuration ---
REPO_ID = "MERaLiON/MERaLiON-2-3B"  # Hugging Face repo id of the checkpoint
SAMPLE_RATE = 16000                 # sample rate handed to librosa.load in the inference cell

# --- Pick device ---
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# --- Load processor, then model ---
processor = AutoProcessor.from_pretrained(REPO_ID, trust_remote_code=True)

# Keyword arguments for the model load, gathered in one place for readability.
load_kwargs = dict(
    use_safetensors=True,
    trust_remote_code=True,
    # Force the eager attention implementation.
    attn_implementation="eager",
    # bfloat16 on GPU, float32 on CPU.
    torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
)
model = AutoModelForSpeechSeq2Seq.from_pretrained(REPO_ID, **load_kwargs)
model = model.to(device)

# --- Build the chat prompt (as per model card) ---
prompt_template = (
    "Instruction: Please transcribe this speech. \n"
    "Follow the text instruction based on the following audio: <SpeechHere>"
)
# A batch containing one conversation, which itself holds a single user turn.
conversation = [
    [
        {"role": "user", "content": prompt_template},
    ],
]
# Render the conversation to text (no tokenization) with the generation prompt appended.
chat_prompt = processor.tokenizer.apply_chat_template(
    conversation=conversation,
    tokenize=False,
    add_generation_prompt=True,
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

TRANSCRIPT:
 <Speaker1>: let me tell you something. Recently people try to cancel me for not showering in the morning. <Speaker2>: (oh) ya. <Speaker1>: What do you think the world has come to? That's why everybody gets cancelled everyday. That's why I don't really care anymore. No shower in the morning also can get cancelled. Might as well just cancel me. <Speaker2>: ya, but to be fair, if you get cancelled tomorrow, <Speaker1>: you know, <Speaker2>: was it really a cancellation? <Speaker1>: true. <Speaker2>: No, her reply. <Speaker1>: That's why I feel like all my cancellations are stupid things. You know, like you look at other influencers. (wah) they get cancelled for embarrassment going to jail. <Speaker2>: Ya. <Speaker1>: Let me cancel because I never shower in the morning. I'm really a joke. Whatever that's why I just tell myself. You know what? This is my fate. I accept it and I move on. <Speaker2>: But each time this type of thing happen, you actually grow a lot, right, like fo

The code below generates a transcript using MERaLiON-2-3B as a baseline.
Re-listen to the audio yourself and correct the transcript as needed before updating the evaluation suite.

In [None]:
AUDIO_PATH = "/content/test2.mp3"  # <-- change your file here
# Upper bound on the number of newly generated tokens. Generation stops silently
# once this is reached, so a long recording can end mid-sentence; raise the value
# if the truncation warning at the end of this cell appears.
MAX_NEW_TOKENS = 256

# Load the audio as mono, resampled to SAMPLE_RATE.
waveform, sr = librosa.load(AUDIO_PATH, sr=SAMPLE_RATE, mono=True)

# --- Prepare inputs ---
inputs = processor(text=chat_prompt, audios=[waveform])
for k, v in list(inputs.items()):
    if isinstance(v, torch.Tensor):
        v = v.to(device)
        # On GPU the model is loaded in bfloat16, so float32 inputs are cast to match;
        # tensors of any other dtype are only moved to the device.
        if device == "cuda" and v.dtype == torch.float32:
            v = v.to(torch.bfloat16)
        inputs[k] = v

# --- Generate transcription ---
with torch.inference_mode():
    outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

# Drop the first input_ids-length positions so only the continuation is decoded.
generated_ids = outputs[:, inputs["input_ids"].size(1):]
transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("TRANSCRIPT:\n", transcript)

# Surface truncation instead of silently returning a cut-off transcript.
if generated_ids.size(1) >= MAX_NEW_TOKENS:
    print(
        f"\nWARNING: generation reached the {MAX_NEW_TOKENS}-token limit; "
        "the transcript may be truncated. Increase MAX_NEW_TOKENS and re-run."
    )

TRANSCRIPT:
 <Speaker1>: (hm), are you here alone? <Speaker2>: (um), yes. <Speaker1>: Hi. I'm I'm a ray. Can you can you sit down? <Speaker2>: No. <Speaker1>: Don't offer me a seat. That's very rude. <Speaker2>: (huh), <Speaker1>: Okay, I'm I've been thinking you're quite cute. <Speaker2>: Oh, is it? <Speaker1>: Yeah, you want to go? <Speaker2>: Go where? <Speaker1>: Blk. Pasar. <Speaker2>: Huh? For the first date? <Speaker1>: Yes. <Speaker2>: Are you paying? <Speaker1>: Of course, fifty fifty. <Speaker2>: I thought you had money. <Speaker1>: No, no money. No money. Economy is bad. We cannot cannot expect stuff like that, but actually I'm doing my own business, right?

