# 2025 EAI Lab 5

## Topic 1 : From PyTorch To ONNX

### Steps:
1.   Define Model Architecture
2.   Load Weight
3.   Export ONNX File
4.   Quantize To INT8
5.   Building Session



In [None]:
# !pip install -U \
#     torch torchvision torchaudio \
#     onnx onnxscript onnxruntime onnxruntime-tools onnxruntime-gpu \
#     gradio

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# TODO
# Design Your ResNet18 Model
# 參考 resnet18.py 來建立模型架構

class BasicBlock(nn.Module):
    """ResNet 的基本區塊"""
    def __init__(self, inchannel, outchannel, stride=1):
        super(BasicBlock, self).__init__()
        # 左邊的主要路徑：兩個 conv + bn，中間有 relu
        self.left = nn.Sequential(
            nn.Conv2d(inchannel, outchannel, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(outchannel),
            nn.ReLU(inplace=True),
            nn.Conv2d(outchannel, outchannel, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(outchannel)
        )
        
        # shortcut：如果維度不一樣就要用 1x1 conv 調整
        self.shortcut = nn.Sequential()
        if stride != 1 or inchannel != outchannel:
            self.shortcut = nn.Sequential(
                nn.Conv2d(inchannel, outchannel, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(outchannel)
            )

    def forward(self, x):
        # 殘差連接：把輸入加回來
        out = self.left(x)
        out = out + self.shortcut(x)
        out = F.relu(out)
        return out


class ResNet18(nn.Module):
    """ResNet18 模型，用於 CIFAR-10 分類"""
    def __init__(self, ResBlock, num_classes=10):
        super(ResNet18, self).__init__()
        self.inchannel = 64
        
        # 第一層 conv
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU()
        )
        
        # 四個 layer，通道數分別是 64, 128, 256, 512
        self.layer1 = self.make_layer(ResBlock, 64, 2, stride=1)
        self.layer2 = self.make_layer(ResBlock, 128, 2, stride=2)
        self.layer3 = self.make_layer(ResBlock, 256, 2, stride=2)
        self.layer4 = self.make_layer(ResBlock, 512, 2, stride=2)
        
        # 最後的全連接層，輸出 10 個類別
        self.fc = nn.Linear(512, num_classes)

    def make_layer(self, block, channels, num_blocks, stride):
        """建立一個 layer，裡面有 num_blocks 個 block"""
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for s in strides:
            layers.append(block(self.inchannel, channels, s))
            self.inchannel = channels
        return nn.Sequential(*layers)

    def forward(self, x):
        # 依序通過各層
        out = self.conv1(x)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)  # 平均池化
        out = out.view(out.size(0), -1)  # 攤平成一維
        out = self.fc(out)
        return out


In [2]:

torch_model = ResNet18(ResBlock=BasicBlock, num_classes=10)
dummy_input = (torch.randn(1, 3, 32, 32),)  # 假的輸入，用來讓 ONNX 知道輸入大小

def export_onnx(model, dummy, path):
  """載入權重然後匯出成 ONNX"""
  # 讀取老師給的預訓練權重
  state = torch.load(path, map_location=torch.device("cpu"))

  # TODO : load state dict
  # 這邊要過濾掉一些不需要的 key（像是 total_ops 這種分析用的）
  model_state = {k: v for k, v in state.items() if not k.endswith(('total_ops', 'total_params'))}
  model.load_state_dict(model_state, strict=False)

  # 切換成 eval 模式，這樣 BN 層才會正常
  torch_model.eval()

  # Todo : Export ONNX FILE
  import os
  output_path = "N26135011_FP32.onnx"
  
  # 先把舊的檔案刪掉，避免出問題
  if os.path.exists(output_path):
      os.remove(output_path)
  if os.path.exists(output_path + ".data"):
      os.remove(output_path + ".data")
  
  # 匯出 ONNX 模型
  torch.onnx.export(
      model,                    # 我們的模型
      dummy,                    # 假輸入 (1, 3, 32, 32)
      output_path,              # 輸出檔名
      export_params=True,       # 要匯出參數
      opset_version=13,         # ONNX 版本
      do_constant_folding=True, # 優化用的
      input_names=["input"],    # 輸入名稱
      output_names=["output"],  # 輸出名稱
      dynamic_axes={            # 讓 batch size 可以變動
          'input': {0: 'batch_size'},
          'output': {0: 'batch_size'}
      }
  )
  
  # 如果模型太大被分割成兩個檔案，要合併回來
  if os.path.exists(output_path + ".data"):
      print(f"⚠️ 模型被分割了，正在合併...")
      import onnx
      
      # 載入然後重新存成單一檔案
      onnx_model = onnx.load(output_path, load_external_data=True)
      os.remove(output_path)
      os.remove(output_path + ".data")
      onnx.save(onnx_model, output_path)
      print(f"✓ 合併完成: {output_path}")
  else:
      print(f"✓ FP32 模型匯出成功: {output_path}")

if __name__ == "__main__":
  # 提醒 : 記得先把 best_model.pth 上傳到 Content 資料夾
  export_onnx(model=torch_model, dummy=dummy_input, path="best_model.pth")


  torch.onnx.export(
W1205 15:46:41.452000 28492 site-packages/torch/onnx/_internal/exporter/_compat.py:114] Setting ONNX exporter to use operator set version 18 because the requested opset_version 13 is a lower version than we have implementations for. Automatic version conversion will be performed, which may not be successful at converting to the requested version. If version conversion is unsuccessful, the opset version of the exported model will be kept at 18. Please consider setting opset_version >=18 to leverage latest ONNX features


[torch.onnx] Obtain model graph for `ResNet18([...]` with `torch.export.export(..., strict=False)`...
[torch.onnx] Obtain model graph for `ResNet18([...]` with `torch.export.export(..., strict=False)`... ✅
[torch.onnx] Run decomposition...


The model version conversion is not supported by the onnxscript version converter and fallback is enabled. The model will be converted using the onnx C API (target version: 13).
Failed to convert the model to the target version 13 using the ONNX C API. The model was not modified
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniconda/base/envs/eai_lab5/lib/python3.10/site-packages/onnxscript/version_converter/__init__.py", line 127, in call
    converted_proto = _c_api_utils.call_onnx_api(
  File "/opt/homebrew/Caskroom/miniconda/base/envs/eai_lab5/lib/python3.10/site-packages/onnxscript/version_converter/_c_api_utils.py", line 65, in call_onnx_api
    result = func(proto)
  File "/opt/homebrew/Caskroom/miniconda/base/envs/eai_lab5/lib/python3.10/site-packages/onnxscript/version_converter/__init__.py", line 122, in _partial_convert_version
    return onnx.version_converter.convert_version(
  File "/opt/homebrew/Caskroom/miniconda/base/envs/eai_lab5/lib/python3.10/si

[torch.onnx] Run decomposition... ✅
[torch.onnx] Translate the graph into ONNX...
[torch.onnx] Translate the graph into ONNX... ✅
Applied 41 of general pattern rewrite rules.
⚠️ 模型被分割了，正在合併...
✓ 合併完成: N26135011_FP32.onnx


In [3]:
import os, numpy as np
from PIL import Image
import onnxruntime as ort
from onnxruntime.quantization import CalibrationDataReader

CIFAR10_MEAN = np.array([0.4914, 0.4822, 0.4465], dtype=np.float32)
CIFAR10_STD  = np.array([0.2470, 0.2435, 0.2616], dtype=np.float32)

def preprocess_32x32(pil_img: Image.Image) -> np.ndarray:
    arr = np.asarray(pil_img.convert("RGB").resize((32, 32)), dtype=np.float32) / 255.0
    arr = (arr - CIFAR10_MEAN) / CIFAR10_STD
    return arr.transpose(2, 0, 1)[None, ...]  # (1,3,32,32)

class CIFARLikeCalibReader(CalibrationDataReader):
    def __init__(self, image_dir: str = None, input_name: str = "input",
                 batch_size: int = 32, num_batches: int = 10):
        self.input_name  = input_name
        self.batch_size  = batch_size
        self.num_batches = num_batches
        self.paths = []
        if image_dir and os.path.isdir(image_dir):
            for f in os.listdir(image_dir):
                if f.lower().endswith((".jpg", ".jpeg", ".png", ".bmp")):
                    self.paths.append(os.path.join(image_dir, f))
        self._mode_random = len(self.paths) == 0
        self._pos = 0
        self._emitted = 0

    def get_next(self):
        if self._emitted >= self.num_batches:
            return None
        if self._mode_random:
            batch = np.random.randn(self.batch_size, 3, 32, 32).astype(np.float32)
        else:
            items = []
            for _ in range(self.batch_size):
                if self._pos >= len(self.paths):
                    break
                img = Image.open(self.paths[self._pos])
                self._pos += 1
                items.append(preprocess_32x32(img))
            if not items:
                return None
            batch = np.concatenate(items, axis=0).astype(np.float32)
        self._emitted += 1
        return {self.input_name: batch}

    def rewind(self):
        self._pos = 0
        self._emitted = 0

FP32_MODEL = "N26135011_FP32.onnx"
INT8_MODEL = "N26135011_INT8.onnx"


_tmp = ort.InferenceSession(FP32_MODEL, providers=["CPUExecutionProvider"])
INPUT_NAME = _tmp.get_inputs()[0].name
print("Calib will use input name:", INPUT_NAME)


Calib will use input name: input


In [4]:
from onnxruntime.quantization import quantize_static, QuantType, CalibrationMethod


# 建立校準用的 reader（這邊用隨機資料來校準）
reader = CIFARLikeCalibReader(
    image_dir=None,          # 沒有圖片就用隨機資料
    input_name=INPUT_NAME,   # 輸入的名稱
    batch_size=1,            # 一次處理 1 張
    num_batches=50           # 總共跑 50 次
)


def quantize_to_int8(fp32_path, int8_path, reader, method="MinMax"):
    """把 FP32 模型量化成 INT8"""
    # Todo : quantize_static
    # 用 onnxruntime 的靜態量化功能
    quantize_static(
        model_input=fp32_path,                      # 輸入的 FP32 模型
        model_output=int8_path,                     # 輸出的 INT8 模型
        calibration_data_reader=reader,             # 校準用的 reader
        quant_format="QOperator",                   # 量化格式
        per_channel=True,                           # per-channel 量化比較準
        weight_type=QuantType.QInt8,                # 權重用 INT8
        activation_type=QuantType.QInt8,            # activation 也用 INT8
        calibrate_method=CalibrationMethod.MinMax   # 用 MinMax 方法校準
    )
    print("✓ INT8 模型量化完成:", int8_path)

quantize_to_int8(FP32_MODEL, INT8_MODEL, reader)



✓ INT8 模型量化完成: N26135011_INT8.onnx


In [5]:
import time
import numpy as np
import onnxruntime as ort

def run(sess, x):
    """跑一次推論"""
    return sess.run(None, {sess.get_inputs()[0].name: x})[0]

x_demo = np.random.randn(1,3,32,32).astype(np.float32)  # 測試用的假資料

# Todo : build session function
def build_session(model_path, providers):
  """建立 ONNX Runtime 的 session 來跑推論"""
  return ort.InferenceSession(model_path, providers=providers)



sess_fp32 = build_session(model_path=FP32_MODEL, providers=["CPUExecutionProvider"])
sess_int8 = build_session(model_path=INT8_MODEL, providers=["CPUExecutionProvider"])

y_fp32 = run(sess_fp32, x_demo)
y_int8 = run(sess_int8, x_demo)

l2_rel = np.linalg.norm(y_fp32 - y_int8) / (np.linalg.norm(y_fp32) + 1e-12)
print(f"[Check] relative L2 diff FP32 vs INT8: {l2_rel:.6f}")

def bench(sess, x, n=50):
    t0 = time.time()
    for _ in range(n):
        sess.run(None, {sess.get_inputs()[0].name: x})
    return (time.time() - t0) / n

print("FP32 avg sec:", bench(sess_fp32, x_demo))
print("INT8 avg sec:", bench(sess_int8, x_demo))

so = ort.SessionOptions()
so.enable_profiling = True



[Check] relative L2 diff FP32 vs INT8: 0.005177
FP32 avg sec: 0.0069497632980346676
INT8 avg sec: 0.0011230850219726563


## Topic 2 : Gradio


In [6]:
# ! pip install gradio

In [7]:
import onnxruntime as ort
import numpy as np
from PIL import Image
import gradio as gr
import time
import os

# ====== 檢查環境 ======
print(f"當前工作目錄: {os.getcwd()}")

# ====== 設定 ======
MODEL_PATH_INT8 = "N26135011_INT8.onnx"   # INT8 模型
MODEL_PATH_FP32 = "N26135011_FP32.onnx"   # FP32 模型
LABELS = ['plane','car','bird','cat','deer','dog','frog','horse','ship','truck']  # CIFAR-10 的 10 個類別

# 檢查模型檔案有沒有存在
print(f"FP32 模型: {os.path.exists(MODEL_PATH_FP32)} - {MODEL_PATH_FP32}")
print(f"INT8 模型: {os.path.exists(MODEL_PATH_INT8)} - {MODEL_PATH_INT8}")
if os.path.exists(MODEL_PATH_FP32 + '.data'):
    print(f"⚠️ 有 .data 檔案，請重跑 Cell 4")

# CIFAR-10 的正規化參數
CIFAR10_MEAN = np.array([0.4914, 0.4822, 0.4465], dtype=np.float32)
CIFAR10_STD  = np.array([0.2470, 0.2435, 0.2616], dtype=np.float32)

# ====== 工具函數 ======
def softmax_np(x: np.ndarray) -> np.ndarray:
    """把 logits 轉成機率"""
    x = x - np.max(x)
    ex = np.exp(x)
    return ex / np.sum(ex)

# TODO : preprocess input image function
def preprocess(image: Image.Image) -> np.ndarray:
    """
    圖片前處理，把上傳的圖片轉成模型要的格式
    輸入: PIL Image
    輸出: (1, 3, 32, 32) 的 numpy array
    """
    if not isinstance(image, Image.Image):
        raise ValueError("請上傳圖片")
    
    # 1. 轉成 RGB 然後縮放到 32x32
    img = image.convert("RGB").resize((32, 32))
    
    # 2. 轉成 numpy 並除以 255 變成 0~1
    arr = np.asarray(img, dtype=np.float32) / 255.0
    
    # 3. 用 CIFAR-10 的 mean 和 std 做標準化
    arr = (arr - CIFAR10_MEAN) / CIFAR10_STD
    
    # 4. 把 HWC 轉成 CHW（PyTorch 格式）
    arr = arr.transpose(2, 0, 1)
    
    # 5. 加上 batch 維度變成 (1, 3, 32, 32)
    arr = arr[None, ...]
    
    return arr

# ====== 建立 Session ======
def build_session(model_path, providers):
    """載入 ONNX 模型"""
    return ort.InferenceSession(model_path, providers=providers)

# ====== 載入模型 ======
providers = ort.get_available_providers()
print(f"可用的 providers: {providers}")

sess_int8 = build_session(MODEL_PATH_INT8, providers=providers)
in_int8  = sess_int8.get_inputs()[0].name
out_int8 = sess_int8.get_outputs()[0].name

# 載入 FP32 模型（用 try 包起來以防出錯）
try:
    sess_fp32 = build_session(MODEL_PATH_FP32, providers=providers)
    in_fp32  = sess_fp32.get_inputs()[0].name
    out_fp32 = sess_fp32.get_outputs()[0].name
    _fp32_err = ""
    print("✓ 兩個模型都載入成功了")
except Exception as e:
    sess_fp32, in_fp32, out_fp32 = None, None, None
    _fp32_err = f"FP32 載入失敗: {e}"
    print(f"✗ {_fp32_err}")

# ====== 比較 FP32 和 INT8 ======
# TODO : Compare FP32 and INT8
def compare_fp32_int8(image: Image.Image):
    """比較兩個模型的預測結果和速度"""
    if image is None:
        return {}, {}, "請上傳圖片"
    if sess_fp32 is None:
        return {}, {}, _fp32_err or "FP32 模型沒載入成功"

    # 先做前處理
    x = preprocess(image)

    # Your progarm
    # 跑 FP32 推論，順便計時
    t0 = time.time()
    result_fp32 = sess_fp32.run([out_fp32], {in_fp32: x})[0]
    fp32_ms = (time.time() - t0) * 1000
    
    # 跑 INT8 推論，順便計時
    t0 = time.time()
    result_int8 = sess_int8.run([out_int8], {in_int8: x})[0]
    int8_ms = (time.time() - t0) * 1000

    # 用 softmax 轉成機率
    p_fp32 = softmax_np(result_fp32[0])
    p_int8 = softmax_np(result_int8[0])

    def top3_map(p):
        """拿出前 3 名的預測"""
        idx = np.argpartition(p, -3)[-3:]
        idx = idx[np.argsort(p[idx])[::-1]]
        return {LABELS[i]: float(p[i]) for i in idx}

    top3_fp32 = top3_map(p_fp32)
    top3_int8 = top3_map(p_int8)

    # 整理結果
    summary = (
        f"FP32 推論時間: {fp32_ms:.2f} ms\n"
        f"INT8 推論時間: {int8_ms:.2f} ms\n"
        f"加速比 (FP32/INT8): {(fp32_ms / max(int8_ms, 1e-9)):.2f}×"
    )
    return top3_fp32, top3_int8, summary

# ====== Gradio 介面 ======
# TODO : Building GUI Interface
demo = gr.Interface(
    fn=compare_fp32_int8,           # 要跑的函數
    inputs=gr.Image(type="pil"),    # 輸入是圖片
    outputs=[                       # 輸出有三個
        gr.Label(label="FP32 Top-3"),       # FP32 的前 3 名
        gr.Label(label="INT8 Top-3"),       # INT8 的前 3 名
        gr.Textbox(label="效能比較")         # 時間比較
    ],
    title="CIFAR-10 分類器 - FP32 vs INT8 比較",
    description="上傳圖片來比較 FP32 和 INT8 模型\n支援：飛機、汽車、鳥、貓、鹿、狗、青蛙、馬、船、卡車"
)

if __name__ == "__main__":
  # TODO : building a public web
  # 啟動網頁，share=True 會產生公開網址
  demo.launch(share=True)



  from .autonotebook import tqdm as notebook_tqdm


當前工作目錄: /Users/jiaquan/Development/2025EAI_Project/EAI_Lab5
FP32 模型: True - N26135011_FP32.onnx
INT8 模型: True - N26135011_INT8.onnx
可用的 providers: ['CoreMLExecutionProvider', 'AzureExecutionProvider', 'CPUExecutionProvider']


2025-12-05 15:46:45.786 python[28492:2897037] 2025-12-05 15:46:45.785717 [W:onnxruntime:, coreml_execution_provider.cc:113 GetCapability] CoreMLExecutionProvider::GetCapability, number of partitions supported by CoreML: 9 number of nodes in the graph: 148 number of nodes supported by CoreML: 9
2025-12-05 15:46:46.218 python[28492:2897037] 2025-12-05 15:46:46.218599 [W:onnxruntime:, coreml_execution_provider.cc:113 GetCapability] CoreMLExecutionProvider::GetCapability, number of partitions supported by CoreML: 2 number of nodes in the graph: 50 number of nodes supported by CoreML: 47


✓ 兩個模型都載入成功了
* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://a25cf08bab330a699c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
