##### Copyright 2024 The IREE Authors

In [1]:
#@title Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

# <img src="https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.png" height="20px"> Hugging Face to <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/1/10/PyTorch_logo_icon.svg/640px-PyTorch_logo_icon.svg.png" height="20px"> PyTorch to <img src="https://raw.githubusercontent.com/iree-org/iree/main/docs/website/docs/assets/images/IREE_Logo_Icon_Color.svg" height="20px"> IREE

This notebook uses [iree-turbine](https://github.com/iree-org/iree-turbine) to export a pretrained [Hugging Face Transformers](https://huggingface.co/docs/transformers/) model to [IREE](https://github.com/iree-org/iree), leveraging [torch-mlir](https://github.com/llvm/torch-mlir) under the covers.

* The pretrained [whisper-small](https://huggingface.co/openai/whisper-small)
  model is showcased here as it is small enough to fit comfortably into a Colab
  notebook. Other pretrained models can be found at
  https://huggingface.co/docs/transformers/index.

## Setup

In [2]:
%%capture
#@title Uninstall existing packages
#   This avoids some warnings when installing specific PyTorch packages below.
#   `%%capture` silences this cell's output, and `-y` answers pip's
#   confirmation prompt so the cell can run unattended.
!python -m pip uninstall -y fastai torchaudio torchdata torchtext torchvision

In [3]:
# Install PyTorch from the CPU wheel index (download.pytorch.org/whl/cpu),
# pinned to an exact version.
!python -m pip install --pre --index-url https://download.pytorch.org/whl/cpu --upgrade torch==2.5.0

Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch==2.5.0
  Downloading https://download.pytorch.org/whl/cpu/torch-2.5.0%2Bcpu-cp310-cp310-linux_x86_64.whl (174.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.7/174.7 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 2.5.1+cu121
    Uninstalling torch-2.5.1+cu121:
      Successfully uninstalled torch-2.5.1+cu121
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
timm 1.0.12 requires torchvision, which is not installed.[0m[31m
[0mSuccessfully installed torch-2.5.0+cpu


In [4]:
# Install iree-turbine; pip also pulls in the iree-base-compiler and
# iree-base-runtime packages as its dependencies.
# NOTE(review): the version is not pinned here, unlike the torch install
# above. Consider pinning if this notebook needs to stay reproducible.
!python -m pip install iree-turbine

Collecting iree-turbine
  Downloading iree_turbine-3.1.0-py3-none-any.whl.metadata (6.7 kB)
Collecting iree-base-compiler (from iree-turbine)
  Downloading iree_base_compiler-3.1.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting iree-base-runtime (from iree-turbine)
  Downloading iree_base_runtime-3.1.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting ml_dtypes>=0.5.0 (from iree-turbine)
  Downloading ml_dtypes-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (21 kB)
Downloading iree_turbine-3.1.0-py3-none-any.whl (301 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.7/301.7 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ml_dtypes-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading iree_base_compiler-3.1.0-cp310

In [5]:
#@title Report version information
# Ask pip for the installed iree-turbine package version.
!echo "Installed iree-turbine, $(python -m pip show iree_turbine | grep Version)"

# Print the version/build information reported by the IREE compiler tool.
!echo -e "\nInstalled IREE, compiler version information:"
!iree-compile --version

# Report the PyTorch version visible to this kernel.
import torch
print("\nInstalled PyTorch, version:", torch.__version__)

Installed iree-turbine, Version: 3.1.0

Installed IREE, compiler version information:
IREE (https://iree.dev):
  IREE compiler version 3.1.0rc20250107 @ d2242207764230ad398585a5771f9d54ce91b4c8
  LLVM version 20.0.0git
  Optimized build

Installed PyTorch, version: 2.5.0+cpu


## Load and run whisper-small

Load the pretrained model from https://huggingface.co/openai/whisper-small.

See also:

* Model documentation: https://huggingface.co/docs/transformers/model_doc/whisper
* Test case in [SHARK-TestSuite](https://github.com/nod-ai/SHARK-TestSuite/): [`pytorch/models/whisper-small/model.py`](https://github.com/nod-ai/SHARK-TestSuite/blob/main/e2eshark/pytorch/models/whisper-small/model.py)

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# The Auto* classes (https://huggingface.co/docs/transformers/model_doc/auto)
# resolve to these concrete classes for Whisper:
#   AutoModelForCausalLM -> WhisperForCausalLM
#   AutoTokenizer        -> WhisperTokenizerFast
modelname = "openai/whisper-small"

tokenizer = AutoTokenizer.from_pretrained(modelname)

# Several of the options below affect how the model is exported. The test cases
# at https://github.com/nod-ai/SHARK-TestSuite/tree/main/e2eshark/pytorch/models
# show other options that may be useful to set.
model = AutoModelForCausalLM.from_pretrained(
    modelname,
    torchscript=True,
    attn_implementation="eager",
    output_hidden_states=False,
    output_attentions=False,
)

# A simple demo input, just to get some data flowing through the model: a short
# prompt is tokenized and the resulting token ids are used as the only argument.
# Depending on the model and the input it expects (text, image, audio, etc.),
# a specific Processor class might be used instead. For Whisper,
# WhisperProcessor runs audio input pre-processing and output post-processing.
example_prompt = "Hello world!"
example_encoding = tokenizer(example_prompt, return_tensors="pt")
example_input = example_encoding["input_ids"].cpu()
example_args = (example_input,)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

Some weights of WhisperForCausalLM were not initialized from the model checkpoint at openai/whisper-small and are newly initialized: ['proj_out.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

Test exporting using [`torch.export.export()`](https://pytorch.org/docs/stable/export.html#torch.export.export). If `torch.export.export()` works, `aot.export()` from Turbine should work as well.

In [7]:
import torch

# Trial export with plain PyTorch. The example arguments are passed as one
# tuple; the resulting program is kept only as evidence that export succeeds.
exported_program = torch.export.export(model, args=example_args)

Export using the simple [`aot.export()`](https://iree.dev/guides/ml-frameworks/pytorch/#simple-api) API from Turbine.

In [8]:
import iree.turbine.aot as aot
# Note: aot.export() wants the example args to be unpacked.
# (Contrast with torch.export.export(), which is given them as a single tuple.)
whisper_compiled_module = aot.export(model, *example_args)

Compile using Turbine/IREE then run the program.

In [9]:
import iree.runtime as ireert

# Compile the exported module, keeping the result in memory (save_to=None).
binary = whisper_compiled_module.compile(save_to=None)

# Wrap the compiled bytes and load them as a runtime module using the
# "local-task" configuration.
config = ireert.Config("local-task")
wrapped_module = ireert.VmModule.wrap_buffer(
    config.vm_instance, binary.map_memory()
)
vm_module = ireert.load_vm_module(wrapped_module, config)

# Call the module's `main` function on the example input and show the first
# result after copying it back to the host.
iree_outputs = vm_module.main(example_args[0])
print(iree_outputs[0].to_host())

[[[  5.8126216   3.9667568   4.5749426 ...   2.7658575   2.6436937
     1.5479789]
  [  7.5634375   6.029962    5.1000347 ...   6.432704    6.101554
     6.4348   ]
  [  0.9380306  -4.4696145  -4.012748  ...  -6.2486286  -7.7917867
    -6.8453736]
  [  0.7450936  -3.7631674  -7.4870253 ...  -6.734828   -6.966235
   -10.022404 ]
  [ -0.9628601  -3.510199   -6.015854  ...  -7.116391   -6.7086434
   -10.225704 ]
  [  3.347097    2.4927166  -3.3042672 ...  -1.5709717  -1.8455461
    -2.9991992]]]


Run the program using native PyTorch to compare outputs.

In [10]:
import numpy as np

# Inference only, so run the model without autograd tracking.
with torch.no_grad():
    torch_outputs = model(example_args[0])
torch_result = torch_outputs[0].detach().numpy()
print(torch_result)

# Check numerically that the IREE result (`iree_outputs`, computed in the
# previous cell) matches native PyTorch, instead of relying on a visual
# comparison of the two printouts. The tolerances leave room for small
# floating-point differences between the two backends.
np.testing.assert_allclose(
    iree_outputs[0].to_host(), torch_result, rtol=1e-3, atol=1e-3
)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


[[[  5.8126183    3.9667587    4.5749483  ...   2.7658575    2.643694
     1.5479784 ]
  [  7.563436     6.029952     5.100036   ...   6.4327083    6.101557
     6.4348083 ]
  [  0.93802685  -4.469646    -4.012787   ...  -6.2486415   -7.7918167
    -6.8453975 ]
  [  0.74507916  -3.763197    -7.487034   ...  -6.734877    -6.966276
   -10.022424  ]
  [ -0.96288276  -3.510221    -6.0158725  ...  -7.1164136   -6.708687
   -10.225745  ]
  [  3.3470666    2.492654    -3.304323   ...  -1.5709934   -1.8455791
    -2.9992423 ]]]
