## Converting HuggingFace Models to ONNX

In [1]:
import os
from pathlib import Path
from optimum.exporters.onnx import main_export

In [6]:
from transformers import WhisperConfig
from optimum.exporters.onnx import main_export
from optimum.exporters.onnx.model_configs import WhisperOnnxConfig

# Specify the model name or path
model_id = "openai/whisper-large-v3"

# Load the Whisper config from Hugging Face
whisper_config = WhisperConfig.from_pretrained(model_id, from_flax=True)

# Define the ONNX export configuration using WhisperOnnxConfig
onnx_config = WhisperOnnxConfig(whisper_config)

# Set the export directory and filename for the ONNX model
onnx_model_path = "onnx/whisper-large-v3.onnx"

# Export the model to ONNX using Optimum
main_export(model_id,   # Pass the correct argument here
            config=onnx_config, 
            output=onnx_model_path,
            task="automatic-speech-recognition")


  if input_features.shape[-1] != expected_seq_length:
  if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
  if sequence_length != 1:
Found different candidate ONNX initializers (likely duplicate) for the tied weights:
	model.decoder.embed_tokens.weight: {'model.decoder.embed_tokens.weight'}
	proj_out.weight: {'onnx::MatMul_8506'}
		-[x] values not close enough, max diff: 0.015755653381347656 (atol: 0.001)
- last_hidden_state: max diff = 0.015755653381347656.
 The exported model was saved at: onnx/whisper-large-v3.onnx


In [None]:
model_id = "openai/whisper-large-v3"

print("Exporting model as ONNX")

main_export(
    model_id,
    output="onnx/out",
    task="automatic-speech-recognition"
)

#config = WhisperConfig.from_pretrained(model_id)
#onnx_config = WhisperOnnxConfig(config, task="automatic-speech-recognition")

Exporting model as ONNX


  if input_features.shape[-1] != expected_seq_length:
  if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
  if sequence_length != 1:
Found different candidate ONNX initializers (likely duplicate) for the tied weights:
	model.decoder.embed_tokens.weight: {'model.decoder.embed_tokens.weight'}
	proj_out.weight: {'onnx::MatMul_8506'}
		-[x] values not close enough, max diff: 0.021198749542236328 (atol: 0.001)
- last_hidden_state: max diff = 0.021198749542236328.
 The exported model was saved at: onnx/out


## Quantize the model

In [5]:
import onnx
model = onnx.load("onnx/out/decoder_model.onnx")
for node in model.graph.node:
    print(node.name, node.op_type)

/model/decoder/Shape Shape
/model/decoder/Constant Constant
/model/decoder/Gather Gather
/model/decoder/Constant_1 Constant
/model/decoder/Shape_1 Shape
/model/decoder/Constant_2 Constant
/model/decoder/Gather_1 Gather
/model/decoder/Constant_3 Constant
Constant_1638 Constant
/model/decoder/Unsqueeze Unsqueeze
/model/decoder/Concat Concat
/model/decoder/Reshape Reshape
/model/decoder/embed_tokens/Gather Gather
/model/decoder/Constant_4 Constant
/model/decoder/Cast Cast
/model/decoder/Constant_5 Constant
/model/decoder/Range Range
/model/decoder/Constant_6 Constant
/model/decoder/Unsqueeze_1 Unsqueeze
Constant_1649 Constant
/model/decoder/Unsqueeze_2 Unsqueeze
/model/decoder/Constant_7 Constant
/model/decoder/Concat_1 Concat
Constant_1653 Constant
/model/decoder/Unsqueeze_3 Unsqueeze
/model/decoder/Constant_8 Constant
/model/decoder/Concat_2 Concat
/model/decoder/Shape_2 Shape
/model/decoder/ConstantOfShape ConstantOfShape
/model/decoder/Expand Expand
/model/decoder/Tile Tile
/model/dec

In [2]:
import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType

def quantize_onnx_model_to_int8(input_onnx_model_path, output_quantized_model_path):
    """
    Quantize an ONNX model to INT8 precision using dynamic quantization.
    Args:
        input_onnx_model_path: Path to the original ONNX model
        output_quantized_model_path: Path to save the quantized model
    """
    print(f"Quantizing model to INT8: {input_onnx_model_path}")

    # Perform dynamic quantization to INT8
    quantize_dynamic(
        model_input=input_onnx_model_path,
        model_output=output_quantized_model_path,
        weight_type=QuantType.QInt8,  # Quantize weights to 8-bit integers
        per_channel=False,            # Use per-tensor quantization (more compatible)
        reduce_range=True,            # Improves accuracy for some hardware
        op_types_to_quantize=['MatMul', 'Conv', 'Gemm'],  # Operations to quantize
        nodes_to_exclude=['/conv1/Conv']
    )

    print(f"Quantized model saved to: {output_quantized_model_path}")

    # Verify model was quantized successfully
    model = onnx.load(output_quantized_model_path)
    onnx.checker.check_model(model)  # Will raise an exception if model is invalid
    print(f"Quantized model validated successfully.")

    return output_quantized_model_path

# Example usage for both encoder and decoder models:
encoder_model_path = "onnx/out/encoder_model.onnx"  # Specify encoder model file
decoder_model_path = "onnx/out/decoder_model.onnx"  # Specify decoder model file

output_encoder_model_path = "onnx/out/encoder_model_quantized.onnx"  # Quantized encoder model path
output_decoder_model_path = "onnx/out/decoder_model_quantized.onnx"  # Quantized decoder model path

# Quantizing encoder and decoder models
quantized_encoder_model = quantize_onnx_model_to_int8(encoder_model_path, output_encoder_model_path)
quantized_decoder_model = quantize_onnx_model_to_int8(decoder_model_path, output_decoder_model_path)

print(f"Quantized encoder model is saved at: {quantized_encoder_model}")
print(f"Quantized decoder model is saved at: {quantized_decoder_model}")




Quantizing model to INT8: onnx/out/encoder_model.onnx




Quantized model saved to: onnx/out/encoder_model_quantized.onnx
Quantized model validated successfully.
Quantizing model to INT8: onnx/out/decoder_model.onnx




Quantized model saved to: onnx/out/decoder_model_quantized.onnx
Quantized model validated successfully.
Quantized encoder model is saved at: onnx/out/encoder_model_quantized.onnx
Quantized decoder model is saved at: onnx/out/decoder_model_quantized.onnx


## Inference

In [1]:
import onnx
import onnxruntime as ort
import numpy as np
import librosa
from transformers import WhisperTokenizer
from datasets import load_dataset

# Load ONNX models
encoder_model_path = "onnx/out/encoder_model.onnx"  # Unquantized encoder model path
decoder_model_path = "onnx/out/decoder_model.onnx"  # Unquantized decoder model path

# Load the models into ONNX Runtime
encoder_session = ort.InferenceSession(encoder_model_path)
decoder_session = ort.InferenceSession(decoder_model_path)

# Load WhisperTokenizer for converting speech to token ids
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large")

def preprocess_audio(audio_path, target_sr=16000, target_length=3000):
    """
    Preprocess the audio file to fit the input format expected by the Whisper model.
    Args:
        audio_path: Path to the audio file
        target_sr: Target sample rate for Whisper (default: 16000)
        target_length: Target length for the audio sequence (default: 3000)
    Returns:
        numpy array: Preprocessed audio
    """
    # Load the audio file
    audio, sr = librosa.load(audio_path, sr=None)
    
    # Resample to target sample rate
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
    
    # Normalize audio to range [-1, 1]
    audio = audio / np.max(np.abs(audio))

    # Ensure the audio length matches the model's expected length
    if len(audio) < target_length:
        # If the audio is shorter, pad with zeros
        audio = np.pad(audio, (0, target_length - len(audio)))
    else:
        # If the audio is longer, truncate it
        audio = audio[:target_length]

    return audio

def encode_audio(audio, encoder_session):
    """
    Encode the audio using the encoder model.
    Args:
        audio: Preprocessed audio
        encoder_session: ONNX Runtime session for the encoder model
    Returns:
        numpy array: Encoder output (feature embeddings)
    """
    # Prepare audio input (reshape as required by the model)
    input_audio = np.expand_dims(audio, axis=0)  # Add batch dimension
    input_audio = np.expand_dims(input_audio, axis=0)  # Add channel dimension
    
    # Run inference on the encoder
    encoder_input_name = encoder_session.get_inputs()[0].name
    encoder_output_name = encoder_session.get_outputs()[0].name
    encoder_output = encoder_session.run([encoder_output_name], {encoder_input_name: input_audio})
    
    return encoder_output[0]

def decode_output(encoder_output, decoder_session):
    """
    Decode the encoded features using the decoder model to get the transcription.
    Args:
        encoder_output: Output from the encoder model
        decoder_session: ONNX Runtime session for the decoder model
    Returns:
        str: Transcription or translated text
    """
    # Convert encoder output to float32 (if it's not already)
    encoder_hidden_states = encoder_output.astype(np.float32)

    # Tokenize the encoder output (this assumes you have a tokenizer available)
    token_ids = tokenizer.encode("decoded text from encoder", return_tensors="np").input_ids  # Example tokenization
    
    # Get decoder input names and output names
    decoder_input_name = decoder_session.get_inputs()[0].name
    decoder_hidden_state_name = decoder_session.get_inputs()[1].name  # This may vary depending on your model
    decoder_output_name = decoder_session.get_outputs()[0].name

    # Run inference on the decoder, passing token_ids (int64) and hidden states (float32)
    decoder_output = decoder_session.run([decoder_output_name], {
        decoder_input_name: token_ids,  # Token IDs (int64)
        decoder_hidden_state_name: encoder_hidden_states  # Hidden states (float32)
    })
    
    # Post-process decoder output (e.g., convert logits to tokens)
    transcription = np.argmax(decoder_output[0], axis=-1)  # Assuming logits
    
    return transcription

def audio_to_text(audio_path):
    """
    Complete flow for converting audio to text using Whisper (encoder + decoder).
    Args:
        audio_path: Path to the input audio file
    Returns:
        str: Transcription or translated text
    """
    # Step 1: Preprocess the audio
    audio = preprocess_audio(audio_path)
    
    # Step 2: Encode the audio using the encoder model
    encoder_output = encode_audio(audio, encoder_session)
    
    # Step 3: Decode the encoder output to get the transcription
    transcription = decode_output(encoder_output, decoder_session)
    
    return transcription

# Example usage:
#audio_path = "carlin_who_gives_x.wav"
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]
transcription = audio_to_text(sample)
print("Transcription:", transcription)




TypeError: Invalid file: {'path': '0d38672e0bbdbdc460af55b8bb84a15b2730db2819f2af64f9c777d4d586f2de', 'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00024414, 0.00048828,
       0.0005188 ]), 'sampling_rate': 16000}

In [None]:

# Ensure you're using a compatible version of ONNX Runtime
print(f"ONNX Runtime Version: {ort.__version__}")

ONNX Runtime Version: 1.21.0
