In [None]:
# Setup (run once): pip3 install torch transformers librosa accelerate pandas openpyxl jiwer

In [1]:
# Import torch first and, when a CUDA device is available, release any cached
# GPU memory before the larger libraries are imported.
import torch
if torch.cuda.is_available():
    torch.cuda.empty_cache()


In [2]:
import librosa
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
import os
import pandas as pd

# Report the PyTorch build and whether CUDA is usable; the GPU name is only
# queried when a CUDA device is actually available.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name()}")

PyTorch version: 2.6.0+cu124
CUDA available: True
CUDA device: NVIDIA GeForce RTX 3060


In [4]:
repo_id = "MERaLiON/MERaLiON-2-3B"

# Run on the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Release unused cached GPU memory before the weights are loaded.
if device == "cuda":
    torch.cuda.empty_cache()

print("Loading processor...")
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)

print("Loading model...")
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    repo_id,
    # bfloat16 on the GPU, float32 on the CPU.
    torch_dtype=torch.float32 if device != "cuda" else torch.bfloat16,
    attn_implementation="eager",  # eager attention is requested explicitly
    trust_remote_code=True,
    use_safetensors=True,
)
# Move the loaded weights onto the selected device.
model = model.to(device)

print("Model loaded successfully!")

Using device: cuda
Loading processor...
Loading model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully!


In [5]:
# Transcription instruction sent with every audio clip. "<SpeechHere>" looks
# like the placeholder for the audio -- presumably filled in by the processor;
# confirm against the model card.
prompt_template = (
    "Instruction: Please transcribe this speech. \n"
    "Follow the text instruction based on the following audio: <SpeechHere>"
)
# A batch containing one single-turn conversation with a single user message.
conversation = [[{"role": "user", "content": prompt_template}]]
# Render the conversation through the tokenizer's chat template without
# tokenizing it, with the generation prompt appended.
chat_prompt = processor.tokenizer.apply_chat_template(
    conversation=conversation, tokenize=False, add_generation_prompt=True
)


In [6]:
def process_audio_file(audio_path, queries=None, max_seconds=30):
    """
    Transcribe the leading segment of one audio file with the loaded model.

    The file is loaded at 16 kHz, cut to its first ``max_seconds`` seconds,
    paired with the module-level ``chat_prompt`` and passed to
    ``model.generate``. Progress is printed along the way.

    Args:
        audio_path: Path to your audio file.
        queries: Currently unused. Kept only so existing calls stay valid;
            the prompt always comes from the module-level ``chat_prompt``.
        max_seconds: Number of leading seconds to transcribe. Defaults to 30,
            the value that used to be hard-coded. Pass ``None`` to keep the
            whole clip.

    Returns:
        The decoded text of the first generated sequence, with the prompt
        tokens removed and special tokens skipped.

    Raises:
        ValueError: If ``max_seconds`` is given but is not positive.
        FileNotFoundError: If ``audio_path`` does not exist.
    """
    if max_seconds is not None and max_seconds <= 0:
        raise ValueError(f"max_seconds must be positive, got {max_seconds}")

    # Check if file exists
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"Audio file not found: {audio_path}")

    print(f"Loading audio: {audio_path}")

    # Load audio at 16kHz
    audio_array, sample_rate = librosa.load(audio_path, sr=16000)
    print(f"Audio loaded: {len(audio_array)/sample_rate:.2f} seconds")

    # Only process the leading segment of the clip
    if max_seconds is not None:
        audio_array = audio_array[:int(max_seconds * sample_rate)]
    print(f"Processing first {len(audio_array)/sample_rate:.2f} seconds of audio")

    try:
        inputs = processor(text=chat_prompt, audios=[audio_array])
        # Move every tensor to the model's device; on CUDA the float32 tensors
        # are also cast to bfloat16 to match the dtype the model was loaded in.
        for k, v in list(inputs.items()):
            if isinstance(v, torch.Tensor):
                v = v.to(device)
                if device == "cuda" and v.dtype == torch.float32:
                    v = v.to(torch.bfloat16)
                inputs[k] = v

        # --- Generate transcription ---
        with torch.inference_mode():
            outputs = model.generate(**inputs, max_new_tokens=256)

        # Drop the prompt tokens so only the newly generated ids are decoded.
        generated_ids = outputs[:, inputs["input_ids"].size(1):]
        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    finally:
        # Clear the GPU cache even when processing fails, so one bad file does
        # not leave cached memory behind for the next call.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    print("--------------------------------------------")
    return transcript

# Getting transcription

In [7]:
AUDIO_FILE_PATH = "data/fairness/audio" # path to fairness audio files
OUTPUT_XLSX = "results/fairness/transcription_results.xlsx"
AUDIO_EXTENSIONS = (".wav", ".mp3")

# Parallel lists: audios[i] is the file whose transcript is results[i].
results = []
audios = []
# Files that raised during processing; they are reported and left out.
failed = []

try:
    # Sort the listing so the row order is the same on every run and platform.
    for audio_file in sorted(os.listdir(AUDIO_FILE_PATH)):
        # Case-insensitive match so e.g. ".MP3" files are not silently ignored.
        if not audio_file.lower().endswith(AUDIO_EXTENSIONS):
            continue
        full_path = os.path.join(AUDIO_FILE_PATH, audio_file)
        print(f"Processing file: {full_path}")

        # Handle failures per file: one bad clip must not throw away the
        # transcripts that were already produced.
        try:
            transcript = process_audio_file(full_path)
        except Exception as e:
            failed.append(audio_file)
            print(f"Skipping {audio_file}: {e}")
            continue

        # Append both lists only after success so they stay aligned.
        audios.append(audio_file)
        results.append(transcript)

    # save results in an excel file with columns filename, transcript
    os.makedirs(os.path.dirname(OUTPUT_XLSX), exist_ok=True)
    df = pd.DataFrame({"Filename": audios, "Transcription": results})
    df.to_excel(OUTPUT_XLSX, index=False)

    print(f"Transcription results saved to {OUTPUT_XLSX}")
    if failed:
        print(f"{len(failed)} file(s) failed and were skipped: {failed}")

except Exception as e:
    # Failures outside a single file (listing the folder, writing the sheet).
    print(f"An error occurred: {e}")


Processing file: data/fairness/audio\test1.mp3
Loading audio: data/fairness/audio\test1.mp3
Audio loaded: 96.67 seconds
Processing first 30.00 seconds of audio


  normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
  ret = ret.dtype.type(ret / rcount)
  normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


--------------------------------------------
Processing file: data/fairness/audio\test10.mp3
Loading audio: data/fairness/audio\test10.mp3
Audio loaded: 102.47 seconds
Processing first 30.00 seconds of audio
--------------------------------------------
Processing file: data/fairness/audio\test2.mp3
Loading audio: data/fairness/audio\test2.mp3
Audio loaded: 103.86 seconds
Processing first 30.00 seconds of audio
--------------------------------------------
Processing file: data/fairness/audio\test21.mp3
Loading audio: data/fairness/audio\test21.mp3
Audio loaded: 53.01 seconds
Processing first 30.00 seconds of audio
--------------------------------------------
Processing file: data/fairness/audio\test22.mp3
Loading audio: data/fairness/audio\test22.mp3
Audio loaded: 22.99 seconds
Processing first 22.99 seconds of audio
--------------------------------------------
Processing file: data/fairness/audio\test23.mp3
Loading audio: data/fairness/audio\test23.mp3
Audio loaded: 37.72 seconds
Proce

In [None]:
process_audio_file(f'{AUDIO_FILE_PATH}/test28.mp3') # spot-check one file; the returned transcript is shown as the cell output

Loading audio: audio/test28.mp3
Audio loaded: 1561.24 seconds
Processing first 30.00 seconds of audio
--------------------------------------------


"<Speaker1>: Singapore president Halimah Yacob is just about to address the country's parliament as it reopens after a recess of a little more than two weeks. The recess, which happens when parliament is prorogued, typically marks the midpoint of the government's current term. Madam Halimah is expected to outline the government's priorities, policies and programs ahead of the remainder of its term. Her speech comes as the nation rebounds from the impact of Covid Nineteen, but with the spectre of geopolitical tensions.\n"

# Result Evaluation

In [13]:
import pandas as pd

# Read the saved transcription results back from disk (this rebinds `df`)
# and preview the first rows.
df = pd.read_excel("results/fairness/transcription_results.xlsx")
df.head()

Unnamed: 0,Filename,Transcription
0,test1.mp3,<Speaker1>: It's my final day here in Switzerl...
1,test10.mp3,<Speaker1>: I'm in Paris and I want to do some...
2,test2.mp3,<Speaker1>: I did it. I got the iPhone Air. Lo...
3,test21.mp3,"<Speaker1>: Brian, look at me. This is not you..."
4,test22.mp3,<Speaker1>: I designed this yoga bag and it wa...


In [34]:
# Load the per-file metadata sheet. The saved preview of this table shows a
# "Filename" column and a ground-truth "GT" column, among others.
metadata = pd.read_excel("results/fairness/metadata.xlsx")

In [35]:
# Preview the first rows of the metadata table.
metadata.head()


Unnamed: 0,Filename,meralion,Added by,Video,Gender,Race,Age,GT,Accuracy
0,test1.mp3,<Speaker1>: It's my final day here in Switzerl...,yy,https://www.instagram.com/reel/DO7kMR-Exnm/?ut...,F,Chinese,31,<Speaker1>: It's my final day here in Switzerl...,
1,test10.mp3,<Speaker1>: I'm in Paris and I want to do some...,yy,https://www.tiktok.com/@zakiv4/video/754969856...,M,Malay,30,<Speaker1>: I'm in Paris and I want to do some...,
2,test2.mp3,<Speaker1>: I did it. I got the iPhone Air. Lo...,yy,https://www.tiktok.com/@thejianhaotan/video/75...,M,Chinese,32,<Speaker1>: I did it. I got the iPhone Air. Lo...,
3,test21.mp3,"<Speaker1>: Brian, look at me. This is not you...",yun si,https://www.tiktok.com/@syapls/video/752244596...,F,Malay,28,"<Speaker1>: Brian, look at me. This is not you...",
4,test22.mp3,<Speaker1>: I designed this yoga bag and it wa...,yun si,https://www.instagram.com/p/DA8ZO6Wy-Kx/,F,Chinese,34,<Speaker1>: I designed this yoga bag and it wa...,


In [None]:
# Compute WER and CER from the ground truth (GT) and the MERaLiON output
!pip install jiwer

from typing import Dict
import numpy as np
from jiwer import wer, cer

def compute_metrics(reference: str, hypothesis: str) -> Dict[str, float]:
    """Score one hypothesis transcript against its reference.

    Args:
        reference: The ground-truth text.
        hypothesis: The text to be scored.

    Returns:
        A dict holding the result of jiwer's ``wer`` under ``"wer"`` and of
        jiwer's ``cer`` under ``"cer"``.
    """
    word_error_rate = wer(reference, hypothesis)
    char_error_rate = cer(reference, hypothesis)
    return {"wer": word_error_rate, "cer": char_error_rate}
# Score every transcript against its ground-truth (GT) reference.
# Each record carries its own filename. Assigning df["Filename"] afterwards
# would pair filenames with the wrong scores whenever a file is skipped,
# because the metrics list is then shorter than df.
metrics = []
for index, row in df.iterrows():
    filename = row["Filename"]

    reference_row = metadata[metadata["Filename"] == filename]
    if reference_row.empty:
        print(f"Warning: No reference found for {filename}")
        continue

    # If a filename appears more than once in the metadata, use the first row.
    reference = reference_row.iloc[0]["GT"]
    # A blank GT cell is read back as NaN; it cannot be scored, so skip it.
    if pd.isna(reference) or not str(reference).strip():
        print(f"Warning: Empty reference (GT) for {filename}")
        continue

    # An empty transcription is read back from Excel as NaN. Score it as an
    # empty hypothesis instead of passing a float to jiwer.
    hypothesis = row["Transcription"]
    hypothesis = "" if pd.isna(hypothesis) else str(hypothesis)

    scores = compute_metrics(str(reference), hypothesis)
    metrics.append({**scores, "Filename": filename})

# Fix the columns explicitly so the frame is well-formed even when nothing
# was scored (an empty list would otherwise give a frame with no columns).
metrics_df = pd.DataFrame(metrics, columns=["wer", "cer", "Filename"])
# Upper-case aliases of the score columns, kept for backward compatibility.
metrics_df["WER"] = metrics_df["wer"]
metrics_df["CER"] = metrics_df["cer"]