In [1]:
#動態qat
from onnxruntime.quantization import quantize_dynamic, QuantType
 
 
model_fp32 ="/home/11126110/FPGA/CLAHE_yolov5n_experiment3/weights/yolov8_LP30_tooth.onnx"
model_int8 = 'test3_dynamic_quantized.onnx'
 
 
# Quantize 
quantize_dynamic(model_fp32, model_int8, weight_type=QuantType.QUInt8)




In [2]:
#靜態qat
import numpy as np
import cv2
import os
from onnxruntime.quantization import CalibrationDataReader, quantize_static, QuantType, QuantFormat
from glob import glob

class ImageCalibrationDataReader(CalibrationDataReader):
    def __init__(self, image_folder):
        self.image_paths = glob(os.path.join(image_folder, "*"))
        self.idx = 0
        self.input_name = "images"
    
    def preprocess(self, frame):
        # 圖像預處理
        frame = cv2.imread(frame)
        if frame is None:
            raise ValueError(f"無法讀取圖片: {frame}")
            
        X = cv2.resize(frame, (640, 640))
        image_data = np.array(X).astype(np.float32) / 255.0  # 正規化到 [0, 1]
        image_data = np.transpose(image_data, (2, 0, 1))  # (H, W, C) -> (C, H, W)
        image_data = np.expand_dims(image_data, axis=0)  # 添加批次維度
        return image_data
    
    def get_next(self):
        if self.idx >= len(self.image_paths):
            return None
        
        try:
            image_path = self.image_paths[self.idx]
            input_data = self.preprocess(image_path)
            self.idx += 1
            return {self.input_name: input_data}
        except Exception as e:
            print(f"處理圖片時發生錯誤 {image_path}: {str(e)}")
            self.idx += 1
            return self.get_next()

def main():
    # 設定路徑
    model_path = "/home/11126110/FPGA/CLAHE_yolov5n_experiment3/weights/yolov8_LP30_tooth.onnx"
    calibration_data_path = "/home/11126110/FPGA/CLAHE-image-crop/test/images"
    output_model_path = "static_quantized_model.onnx"
    
    # 確認文件是否存在
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"找不到模型文件: {model_path}")
    if not os.path.exists(calibration_data_path):
        raise FileNotFoundError(f"找不到校準數據文件夾: {calibration_data_path}")
    
    # 創建校準數據讀取器
    calibration_data_reader = ImageCalibrationDataReader(calibration_data_path)
    
    # 設定需要排除的節點
    #Concat、Split、Slice 操作：用於處理預測張量
    #Add、Sub、Div、Mul 操作：用於邊界框計算和調整
    #Sigmoid：非線性激活函數，量化可能導致精度損失
    #Softmax：用於類別預測的概率計算，需要保持精確度
    #Reshape、Transpose 操作：純粹的數據重組操作
    
    nodes_to_exclude = [
        '/model.22/Concat_3', '/model.22/Split', '/model.22/Sigmoid',
        '/model.22/dfl/Reshape', '/model.22/dfl/Transpose', '/model.22/dfl/Softmax', 
        '/model.22/dfl/conv/Conv', '/model.22/dfl/Reshape_1', '/model.22/Slice_1',
        '/model.22/Slice', '/model.22/Add_1', '/model.22/Sub', '/model.22/Div_1',
        '/model.22/Concat_4', '/model.22/Mul_2', '/model.22/Concat_5'
    ]
    
    try:
        print("開始進行靜態量化...")
        # 執行靜態量化
        quantize_static(
            model_path,
            output_model_path,
            weight_type=QuantType.QInt8,
            activation_type=QuantType.QUInt8,
            calibration_data_reader=calibration_data_reader,
            quant_format=QuantFormat.QDQ,
            nodes_to_exclude=nodes_to_exclude,
            per_channel=False,
            reduce_range=True
        )
        print(f"靜態量化完成! 輸出模型保存在: {output_model_path}")
        
    except Exception as e:
        print(f"量化過程中發生錯誤: {str(e)}")

if __name__ == "__main__":
    main()



開始進行靜態量化...




靜態量化完成! 輸出模型保存在: static_quantized_model.onnx


In [2]:
import torch
import torch.backends.cudnn as cudnn
if torch.cuda.is_available():
    cudnn.benchmark = True
    device = "cuda"
    print(torch.cuda.get_device_name())
else:
    device = "cpu"
    print("Use CPU")

Quadro RTX 3000 with Max-Q Design


In [6]:
#gpu驗證
import os
import cv2
import numpy as np
import onnxruntime as ort
from pathlib import Path
from tqdm import tqdm
import time
import torch

from ultralytics import YOLO
from ultralytics.utils import yaml_load
from ultralytics.utils.checks import check_yaml

class YOLOValidator:
    def __init__(self, model_path, image_size=640):
        self.image_size = image_size
        self.conf_threshold = 0.25
        self.iou_threshold = 0.45
        
        # 配置 GPU 推理選項
        providers = [
            ('CUDAExecutionProvider', {
                'device_id': torch.cuda.current_device(),
                'arena_extend_strategy': 'kNextPowerOfTwo',
                'gpu_mem_limit': 1 * 1024 * 1024 * 1024,   # 20GB GPU memory limit
                'cudnn_conv_algo_search': 'EXHAUSTIVE',     
                'do_copy_in_default_stream': True,
            }),
            # 'CPUExecutionProvider'
        ]
        
        # 創建啟用 GPU 的 session
        sess_options = ort.SessionOptions()
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        sess_options.intra_op_num_threads = 1  # 設置內部操作的執行線程數
        sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
        
        self.session = ort.InferenceSession(
            model_path, 
            sess_options=sess_options,
            providers=providers
        )
        print("use: ", self.session.get_providers())
        
        self.input_name = self.session.get_inputs()[0].name
        
        # mAP計算用的IoU閾值
        self.iou_thresholds = np.linspace(0.5, 0.95, 10)  # [0.5, 0.55, ..., 0.95]

    def preprocess_image(self, image_path):
        img = cv2.imread(image_path)
        original_shape = img.shape[:2]
        
        img = cv2.resize(img, (self.image_size, self.image_size))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = img.astype(np.float32) / 255.0
        img = np.transpose(img, (2, 0, 1))
        img = np.expand_dims(img, axis=0)
        
        return img, original_shape

    
    def parse_label_line(self, line):
            values = list(map(float, line.strip().split()))
            class_id = int(values[0])
            points = []
            
            for i in range(1, len(values), 2):
                if i + 1 < len(values):
                    x, y = values[i], values[i + 1]
                    points.append((x, y))
                    
            return class_id, points

    def calculate_polygon_bbox(self, points):
        if not points:
            return None
            
        x_coords = [p[0] for p in points]
        y_coords = [p[1] for p in points]
        
        x_min, x_max = min(x_coords), max(x_coords)
        y_min, y_max = min(y_coords), max(y_coords)
        
        x_center = (x_min + x_max) / 2
        y_center = (y_min + y_max) / 2
        width = x_max - x_min
        height = y_max - y_min
        
        return [x_center, y_center, width, height]

    def post_process(self, outputs, original_shape):
        output = outputs[0][0]  # [5, 8400]
        predictions = output.transpose()  # [8400, 5]
        
        scores = predictions[:, 4]
        mask = scores > self.conf_threshold
        predictions = predictions[mask]
        
        if len(predictions) == 0:
            return []
        
        boxes = np.zeros_like(predictions[:, :4])
        boxes[:, 0] = predictions[:, 0] - predictions[:, 2] / 2  # x1
        boxes[:, 1] = predictions[:, 1] - predictions[:, 3] / 2  # y1
        boxes[:, 2] = predictions[:, 0] + predictions[:, 2] / 2  # x2
        boxes[:, 3] = predictions[:, 1] + predictions[:, 3] / 2  # y2
        
        scores = predictions[:, 4]
        indices = cv2.dnn.NMSBoxes(boxes.tolist(), scores.tolist(), 
                                 self.conf_threshold, 
                                 self.iou_threshold)
        
        scale_x = original_shape[1] / self.image_size
        scale_y = original_shape[0] / self.image_size
        
        results = []
        if len(indices) > 0:
            for idx in indices:
                box = boxes[idx]
                x1, y1, x2, y2 = box
                
                x1 *= scale_x
                x2 *= scale_x
                y1 *= scale_y
                y2 *= scale_y
                
                confidence = float(predictions[idx, 4])
                
                width = (x2 - x1)
                height = (y2 - y1)
                x_center = x1 + width / 2
                y_center = y1 + height / 2
                
                x_center /= original_shape[1]
                y_center /= original_shape[0]
                width /= original_shape[1]
                height /= original_shape[0]
                
                class_id = 0
                
                results.append([class_id, x_center, y_center, width, height, confidence])
        
        return results

    def calculate_iou(self, box1, box2):
        b1_x1 = box1[1] - box1[3] / 2
        b1_y1 = box1[2] - box1[4] / 2
        b1_x2 = box1[1] + box1[3] / 2
        b1_y2 = box1[2] + box1[4] / 2
        
        b2_x1 = box2[1] - box2[3] / 2
        b2_y1 = box2[2] - box2[4] / 2
        b2_x2 = box2[1] + box2[3] / 2
        b2_y2 = box2[2] + box2[4] / 2
        
        inter_x1 = max(b1_x1, b2_x1)
        inter_y1 = max(b1_y1, b2_y1)
        inter_x2 = min(b1_x2, b2_x2)
        inter_y2 = min(b1_y2, b2_y2)
        
        inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)
        
        b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
        b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)
        
        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)
        
        return iou

    def calculate_map(self, all_predictions, all_ground_truths, iou_threshold):
        """計算單一IoU閾值下的mAP"""
        if not all_predictions or not all_ground_truths:
            return 0.0
        
        # 按置信度降序排序所有預測
        all_predictions.sort(key=lambda x: x[5], reverse=True)
        
        total_gts = len(all_ground_truths)
        detected = [False] * total_gts
        true_positives = []
        false_positives = []
        
        for pred in all_predictions:
            max_iou = 0
            max_idx = -1
            
            # 找到與當前預測框IoU最大的真實框
            for gt_idx, gt in enumerate(all_ground_truths):
                if not detected[gt_idx]:  # 只考慮未被檢測到的真實框
                    iou = self.calculate_iou(pred, gt)
                    if iou > max_iou:
                        max_iou = iou
                        max_idx = gt_idx
            
            # 根據IoU閾值判定是否為真陽性
            if max_iou >= iou_threshold and max_idx != -1:
                detected[max_idx] = True
                true_positives.append(1)
                false_positives.append(0)
            else:
                true_positives.append(0)
                false_positives.append(1)
        
        # 計算累積值
        cumsum_tp = np.cumsum(true_positives)
        cumsum_fp = np.cumsum(false_positives)
        
        # 計算查全率和查準率
        recalls = cumsum_tp / total_gts if total_gts > 0 else np.zeros_like(cumsum_tp)
        precisions = cumsum_tp / (cumsum_tp + cumsum_fp)
        
        # 計算AP
        ap = 0
        for t in np.arange(0, 1.1, 0.1):  # [0, 0.1, ..., 1.0]
            if np.sum(recalls >= t) == 0:
                p = 0
            else:
                p = np.max(precisions[recalls >= t])
            ap += p / 11
        
        return ap

    def validate(self, image_dir, label_dir):
        total_gt = 0
        total_pred = 0
        total_correct = 0
        total_iou = 0
        total_inference_time = 0
        
        all_predictions = []
        all_ground_truths = []
        
        image_paths = sorted(Path(image_dir).glob("*.*"))
        total_images = len(image_paths)
        
        for image_path in tqdm(image_paths, desc="驗證中"):
            label_path = Path(label_dir) / f"{image_path.stem}.txt"
            if not label_path.exists():
                continue
                
            gt_boxes = []
            with open(label_path, 'r') as f:
                for line in f:
                    class_id, points = self.parse_label_line(line)
                    bbox = self.calculate_polygon_bbox(points)
                    if bbox:
                        gt_boxes.append([class_id] + bbox)
            
            img, original_shape = self.preprocess_image(str(image_path))
            
            # 使用毫秒計算推理時間
            start_time = time.perf_counter()
            outputs = self.session.run(None, {self.input_name: img})
            inference_time = (time.perf_counter() - start_time) * 1000  # 轉換為毫秒
            total_inference_time += inference_time
            
            pred_boxes = self.post_process(outputs, original_shape)
            
            all_predictions.extend(pred_boxes)
            all_ground_truths.extend(gt_boxes)
            
            total_gt += len(gt_boxes)
            total_pred += len(pred_boxes)
            
            for gt_box in gt_boxes:
                max_iou = 0
                for pred_box in pred_boxes:
                    if gt_box[0] == pred_box[0]:
                        iou = self.calculate_iou(gt_box, pred_box)
                        max_iou = max(max_iou, iou)
                
                if max_iou > 0.5:
                    total_correct += 1
                    total_iou += max_iou
        
        precision = total_correct / total_pred if total_pred > 0 else 0
        recall = total_correct / total_gt if total_gt > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        avg_iou = total_iou / total_correct if total_correct > 0 else 0
        
        map50 = self.calculate_map(all_predictions, all_ground_truths, 0.5)
        
        map_list = []
        for iou_threshold in self.iou_thresholds:
            map_value = self.calculate_map(all_predictions, all_ground_truths, iou_threshold)
            map_list.append(map_value)
        map50_95 = np.mean(map_list) # change 
        
        # 計算平均推理時間（毫秒）
        avg_inference_time = total_inference_time / total_images
        
        return {
            'Precision': precision,
            'Recall': recall,
            'F1-Score': f1_score,
            'Average IOU': avg_iou,
            'mAP@50': map50,
            'mAP@50-95': map50_95,
            'Total GT': total_gt,
            'Total Predictions': total_pred,
            'Correct Detections': total_correct,
            'Total Inference Time (ms)': total_inference_time,
            'Average Inference Time (ms)': avg_inference_time
        }
    #目前最好/home/11126110/FPGA/aggoptimized_yolov8_quantized_model_int8.onnx
def main():
    model_path = r'runs\iter5_LPr30\step_4_finetune\weights\yolov8_LP30_tooth.onnx'
    quantized_model_path = r'C:\Users\user\Desktop\AI_npu\code\my_weights\static_quantized_model.onnx'
    image_dir = r"C:\Users\user\Desktop\AI_npu\code\tooth_dataset\valid3\images"
    label_dir = r"C:\Users\user\Desktop\AI_npu\code\tooth_dataset\valid3\labels"
    
    print("\nAvailable Providers:", ort.get_available_providers())
    print("Device: ", ort.get_device())
    
    print("\n=== 驗證原始模型 (GPU) ===")
    validator = YOLOValidator(model_path)
    results = validator.validate(image_dir, label_dir)
    # pruning_cfg = yaml_load(check_yaml(r"C:\Users\user\Desktop\AI_npu\code\tooth_dataset\default.yaml"))
    # onnx_model = YOLO(model_path)
    # onnx_model.val(**pruning_cfg)
    
    print("\n原始模型結果:")
    for metric, value in results.items():
        if 'Time' in metric:
            print(f"{metric}: {value:.2f} ms")
        else:
            print(f"{metric}: {value:.4f}")
    
    print("\n=== 驗證量化後模型 (GPU) ===")
    validator_quantized = YOLOValidator(quantized_model_path)
    results_quantized = validator_quantized.validate(image_dir, label_dir)
    # onnx_model = YOLO(quantized_model_path)
    # onnx_model.val(**pruning_cfg)
    
    print("\n量化後模型結果:")
    for metric, value in results_quantized.items():
        if 'Time' in metric:
            print(f"{metric}: {value:.2f} ms")
        else:
            print(f"{metric}: {value:.4f}")

if __name__ == "__main__":
    main()


Available Providers: ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
Device:  GPU

=== 驗證原始模型 (GPU) ===
use:  ['CUDAExecutionProvider', 'CPUExecutionProvider']


驗證中: 100%|██████████| 6/6 [00:01<00:00,  3.95it/s]



原始模型結果:
Precision: 0.9600
Recall: 1.0000
F1-Score: 0.9796
Average IOU: 0.8930
mAP@50: 1.0000
mAP@50-95: 0.8294
Total GT: 24.0000
Total Predictions: 25.0000
Correct Detections: 24.0000
Total Inference Time (ms): 1428.88 ms
Average Inference Time (ms): 238.15 ms

=== 驗證量化後模型 (GPU) ===
use:  ['CUDAExecutionProvider', 'CPUExecutionProvider']


驗證中: 100%|██████████| 6/6 [00:01<00:00,  5.26it/s]


量化後模型結果:
Precision: 0.9600
Recall: 1.0000
F1-Score: 0.9796
Average IOU: 0.8743
mAP@50: 1.0000
mAP@50-95: 0.7820
Total GT: 24.0000
Total Predictions: 25.0000
Correct Detections: 24.0000
Total Inference Time (ms): 1044.89 ms
Average Inference Time (ms): 174.15 ms





In [5]:
!nvidia-smi

Tue Jan 21 23:04:40 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.97                 Driver Version: 555.97         CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Quadro RTX 3000 with Max...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   45C    P5              9W /   60W |      91MiB /   6144MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                