In [None]:
import os
import numpy as np
import tensorflow as tf
from keras.datasets import mnist
from keras.utils import to_categorical
from sklearn.metrics import accuracy_score

# ============================
# 1. 数据预处理与保存
# ============================
def prepare_and_save_data():
    """Load MNIST, preprocess it, and cache the arrays as ``.npy`` files.

    Pixels are cast to float32 and divided by 255, a trailing channel axis is
    appended, and labels are one-hot encoded with 10 classes. The training
    arrays are written to the working directory and the test arrays to
    ``pynq-z2/``, which is created if it does not exist yet.

    Returns:
        None. The arrays are only written to disk.
    """
    # Load MNIST (already split into a training set and a test set).
    (X_train, y_train), (X_test, y_test) = mnist.load_data()

    # Convert pixel values to float32 and scale them into [0, 1].
    X_train = X_train.astype("float32") / 255.0
    X_test = X_test.astype("float32") / 255.0

    # Raw MNIST samples are (28, 28); append a channel axis so each sample
    # matches the (28, 28, 1) CNN input format.
    X_train = np.expand_dims(X_train, axis=-1)
    X_test = np.expand_dims(X_test, axis=-1)

    # One-hot encode the labels over the 10 digit classes.
    y_train = to_categorical(y_train, num_classes=10)
    y_test = to_categorical(y_test, num_classes=10)

    # np.save does not create missing parent directories, so make sure the
    # board folder exists before writing the test arrays into it.
    os.makedirs("pynq-z2", exist_ok=True)

    # Persist the preprocessed arrays.
    np.save("X_train.npy", X_train)
    np.save("y_train.npy", y_train)
    np.save("pynq-z2/X_test.npy", X_test)
    np.save("pynq-z2/y_test.npy", y_test)
    print("数据已保存到 .npy 文件中。")

# ============================
# 2. 加载 .npy 数据文件
# ============================
def load_data_from_npy():
    """Read the four cached ``.npy`` arrays back from disk.

    Prints the first row of the test labels, then returns the arrays.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``.
    """
    # Paths are listed in the same order as the returned tuple.
    npy_paths = (
        "X_train.npy",
        "y_train.npy",
        "pynq-z2/X_test.npy",
        "pynq-z2/y_test.npy",
    )
    X_train, y_train, X_test, y_test = [np.load(path) for path in npy_paths]
    print(y_test[0])
    return X_train, y_train, X_test, y_test

# ============================
# Main program: prepare data, build the model, train and test
# ============================
# Regenerate the preprocessed .npy files unless all four are already on disk.
required_files = (
    "X_train.npy",
    "y_train.npy",
    "pynq-z2/X_test.npy",
    "pynq-z2/y_test.npy",
)
if not all(os.path.exists(path) for path in required_files):
    prepare_and_save_data()

# Load the preprocessed arrays and show the per-sample input shape.
X_train, y_train, X_test, y_test = load_data_from_npy()
print(X_train.shape[1:])

2025-04-10 09:46:42.258367: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-10 09:46:42.282408: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-10 09:46:42.403172: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-10 09:46:42.403203: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-10 09:46:42.403928: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
(28, 28, 1)


In [None]:
from keras.layers import Activation, MaxPooling2D, Flatten, Dense
from qkeras import QDense, QConv2D, quantized_bits
from keras.models import Sequential

# Global quantization parameters, kept in one place so they are easy to change.
# QUANT_BITS is the total bit width; QUANT_INTEGER is the number of integer bits.
QUANT_BITS = 4
QUANT_INTEGER = 0
# Third positional argument of quantized_bits (`symmetric` in the QKeras API —
# confirm against the installed QKeras version).
QUANT_SYMMETRIC = 1

def get_quantizer():
    """
    Return a quantizer built from the global quantization parameters.

    Change QUANT_BITS, QUANT_INTEGER or QUANT_SYMMETRIC to alter the
    quantization strategy everywhere this helper is used.
    """
    return quantized_bits(QUANT_BITS, QUANT_INTEGER, QUANT_SYMMETRIC)

# ============================
# 3. 构建 QKeras 模型
# ============================
def build_model(input_shape):
    """Assemble and compile the quantized CNN classifier.

    The network is two QConv2D + ReLU + 2x2 max-pool stages, a Flatten, a
    16-unit QDense + ReLU, and a 10-unit softmax QDense output.

    Args:
        input_shape: Shape of a single input sample, passed to the first layer.

    Returns:
        The compiled ``Sequential`` model (adam, categorical cross-entropy,
        accuracy metric).
    """
    # One quantizer object is shared by every kernel and bias in the network.
    shared_q = get_quantizer()
    quant_kwargs = {"kernel_quantizer": shared_q, "bias_quantizer": shared_q}

    # Layers are instantiated in network order.
    layer_stack = [
        # First convolution stage.
        QConv2D(
            filters=2,
            kernel_size=(3, 3),
            input_shape=input_shape,
            padding="same",
            name="qconv1",
            **quant_kwargs
        ),
        Activation("relu"),
        MaxPooling2D(pool_size=(2, 2)),
        # Second convolution stage.
        QConv2D(
            filters=1,
            kernel_size=(3, 3),
            padding="same",
            name="qconv2",
            **quant_kwargs
        ),
        Activation("relu"),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
        # Quantized fully connected layer.
        QDense(units=16, name="qdense1", **quant_kwargs),
        Activation("relu"),
        # Output layer: 10 classes (digits 0-9) with softmax.
        QDense(10, activation="softmax", name="output", **quant_kwargs),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    # Compile with the adam optimizer and categorical cross-entropy loss.
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    return model
    
# Build the QKeras model from the per-sample shape of X_train
# (the data cell above printed (28, 28, 1) for it).
qmodel = build_model(input_shape=X_train.shape[1:])
qmodel.summary()  # print the model structure

# Train with the epochs, batch_size and validation_split given below.
qmodel.fit(X_train, y_train, epochs=15, batch_size=128, validation_split=0.1)
    
# Evaluate the trained model on the test set.
loss, accuracy = qmodel.evaluate(X_test, y_test)
print("测试集损失：", loss)
print("测试集准确率：", accuracy)

In [None]:
import tensorflow_model_optimization as tfmot

# Pruning fine-tune settings. They feed both the sparsity schedule and fit(),
# so the schedule length cannot drift away from the actual training length.
PRUNE_EPOCHS = 15
PRUNE_BATCH_SIZE = 32
PRUNE_VALIDATION_SPLIT = 0.1

# fit() sets the validation split aside before batching, so optimizer steps
# only cover the remaining samples; the last batch of an epoch may be partial,
# hence the ceil. Using len(X_train) directly would push end_step past the last
# training step and the schedule would stop short of final_sparsity.
num_prune_train = int(np.floor(len(X_train) * (1.0 - PRUNE_VALIDATION_SPLIT)))
prune_steps_per_epoch = int(np.ceil(num_prune_train / PRUNE_BATCH_SIZE))
prune_end_step = prune_steps_per_epoch * PRUNE_EPOCHS

# Pruning policy: ramp sparsity from 0% to 50% over the whole fine-tune run.
pruning_params = {
    'pruning_schedule': tfmot.sparsity.keras.PolynomialDecay(
        initial_sparsity=0.00,
        final_sparsity=0.50,
        begin_step=0,
        end_step=prune_end_step)
}
# Wrap the trained quantized model for magnitude pruning.
model_for_pruning = tfmot.sparsity.keras.prune_low_magnitude(qmodel, **pruning_params)
model_for_pruning.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['accuracy'])

# Pruning callbacks: step the schedule and write pruning summaries.
callbacks = [
    tfmot.sparsity.keras.UpdatePruningStep(),
    tfmot.sparsity.keras.PruningSummaries(log_dir='/tmp/logs')
]
# Fine-tune the model while it is being pruned.
model_for_pruning.fit(
    X_train,
    y_train,
    epochs=PRUNE_EPOCHS,
    batch_size=PRUNE_BATCH_SIZE,
    validation_split=PRUNE_VALIDATION_SPLIT,
    callbacks=callbacks,
)

# Strip the pruning wrappers to export a plain model.
pmodel = tfmot.sparsity.keras.strip_pruning(model_for_pruning)
pmodel.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['accuracy'])

# Evaluate the pruned model on the test set.
loss, accuracy = pmodel.evaluate(X_test, y_test)
print("测试集损失：", loss)
print("测试集准确率：", accuracy)

# Save the pruned model.
pmodel.save('test.h5')

In [2]:
from keras.models import load_model
from qkeras.utils import _add_supported_quantized_objects
# Load the saved model.
# `co` is filled by the QKeras helper and handed to load_model as
# custom_objects so the quantized layers in test.h5 can be deserialized.
co = {}
_add_supported_quantized_objects(co)
model = load_model('test.h5', custom_objects=co)
# Predict on the test set and compare arg-max classes against the one-hot labels.
predictions = model.predict(X_test)
predicted_classes = np.argmax(predictions, axis=1)
true_classes = np.argmax(y_test, axis=1)
accuracy = accuracy_score(true_classes, predicted_classes)
print(f'模型在测试集上的准确率: {accuracy * 100:.2f}%')

模型在测试集上的准确率: 93.81%


In [3]:
# Measure Python (Keras) inference speed on the test set.
# NOTE(review): no timer is started here — presumably the timing is read from
# the predict() progress output or the notebook UI; confirm, or wrap the call
# in an explicit timer.
predictions = model.predict(X_test)



In [4]:
from hls4ml.converters import convert_from_keras_model
from hls4ml.utils import config_from_keras_model

# Convert the model to HLS form.
# A single model-level config is generated for the VivadoAccelerator backend
# with fixed<12,6> as the default precision.
config = config_from_keras_model(model, backend='VivadoAccelerator',
                                 default_precision='fixed<12,6>',
                                 #max_precision='fixed<32,16>',
                                 granularity='model')
config['Model']['ReuseFactor'] = 1
config['Model']['Strategy'] = 'Latency' # Latency/Resource/Unrolled

# The project is written to hls4ml_prj/test and targets the pynq-z2 board
# with a streaming I/O interface.
hls_model = convert_from_keras_model(model, hls_config=config,
                                     backend='VivadoAccelerator', io_type='io_stream',
                                     output_dir='hls4ml_prj/test', board='pynq-z2')
# Compile the HLS model.
hls_model.compile()

# Predict with the HLS model's .predict method (results are shaped like the Keras model's).
predictions = hls_model.predict(X_test)
# The output holds one score per class; take the arg-max class as the prediction.
predicted_classes = np.argmax(predictions, axis=1)
true_classes = np.argmax(y_test, axis=1)
# Compute accuracy with sklearn's accuracy_score.
accuracy = accuracy_score(true_classes, predicted_classes)
print(f'HLS模型的准确率: {accuracy * 100:.2f}%')



Interpreting Sequential
Topology:
Layer name: qconv1_input, layer type: InputLayer, input shapes: [[None, 28, 28, 1]], output shape: [None, 28, 28, 1]
Layer name: qconv1, layer type: QConv2D, input shapes: [[None, 28, 28, 1]], output shape: [None, 28, 28, 2]
Layer name: activation, layer type: Activation, input shapes: [[None, 28, 28, 2]], output shape: [None, 28, 28, 2]
Layer name: max_pooling2d, layer type: MaxPooling2D, input shapes: [[None, 28, 28, 2]], output shape: [None, 14, 14, 2]
Layer name: qconv2, layer type: QConv2D, input shapes: [[None, 14, 14, 2]], output shape: [None, 14, 14, 1]
Layer name: activation_1, layer type: Activation, input shapes: [[None, 14, 14, 1]], output shape: [None, 14, 14, 1]
Layer name: max_pooling2d_1, layer type: MaxPooling2D, input shapes: [[None, 14, 14, 1]], output shape: [None, 7, 7, 1]
Layer name: flatten, layer type: Reshape, input shapes: [[None, 7, 7, 1]], output shape: [None, 49]
Layer name: qdense1, layer type: QDense, input shapes: [[None

In [5]:
# Run the hls4ml build with C simulation disabled and with export and bitfile
# generation enabled. The cell output below shows the returned report dict.
hls_model.build(csim=False, export=True, bitfile=True)


****** Vivado(TM) HLS - High-Level Synthesis from C, C++ and SystemC v2020.1 (64-bit)
  **** SW Build 2902540 on Wed May 27 19:54:35 MDT 2020
  **** IP Build 2902112 on Wed May 27 22:43:36 MDT 2020
    ** Copyright 1986-2020 Xilinx, Inc. All Rights Reserved.

source /opt/Xilinx/Vivado/2020.1/scripts/vivado_hls/hls.tcl -notrace
INFO: [HLS 200-10] Running '/opt/Xilinx/Vivado/2020.1/bin/unwrapped/lnx64.o/vivado_hls'
INFO: [HLS 200-10] For user 'lxz' on host 'HonorX14' (Linux_x86_64 version 6.8.0-57-generic) on Thu Apr 10 09:48:33 CST 2025
INFO: [HLS 200-10] On os Ubuntu 22.04.5 LTS
INFO: [HLS 200-10] In directory '/media/lxz/KP200pro/FPGA/HLS4ML/minst/hls4ml_prj/test'
Sourcing Tcl script 'build_prj.tcl'
INFO: [HLS 200-10] Opening project '/media/lxz/KP200pro/FPGA/HLS4ML/minst/hls4ml_prj/test/myproject_prj'.
INFO: [HLS 200-10] Adding design file 'firmware/myproject_axi.cpp' to the project
INFO: [HLS 200-10] Adding design file 'firmware/myproject.cpp' to the project
INFO: [HLS 200-10] Addi

{'CSynthesisReport': {'TargetClockPeriod': '5.00',
  'EstimatedClockPeriod': '5.444',
  'BestLatency': '6273',
  'WorstLatency': '6273',
  'IntervalMin': '6274',
  'IntervalMax': '6274',
  'BRAM_18K': '18',
  'DSP': '10',
  'FF': '19193',
  'LUT': '25809',
  'URAM': '0',
  'AvailableBRAM_18K': '280',
  'AvailableDSP': '220',
  'AvailableFF': '106400',
  'AvailableLUT': '53200',
  'AvailableURAM': '0'},
 'TimingReport': {'WNS': 0.851,
  'TNS': 0.0,
  'WHS': 0.013,
  'THS': 0.0,
  'WPWS': 3.75,
  'TPWS': 0.0}}

In [6]:
def parse_resource_utilization(file_path):
    """Extract LUT/FF/BRAM/DSP utilization percentages from a Vivado report.

    The whole file is scanned for ``|``-delimited table rows whose first cell
    is one of ``Slice LUTs``, ``Slice Registers``, ``Block RAM Tile`` or
    ``DSPs``. The first matching row for each resource wins. The value is read
    from the last column of the row (the ``Util%`` column), so the result does
    not depend on fixed line numbers or on how many columns precede it.

    Args:
        file_path: Path to the utilization report text file.

    Returns:
        Dict with keys ``'LUT'``, ``'FF'``, ``'BRAM'``, ``'DSP'``. Each value
        is the utilization as a float, or ``None`` if the row was not found.
        A value written as ``<x`` is returned as ``x`` (an upper bound).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a matching row's last column is not numeric.
    """
    # Report row label -> key in the returned dict (insertion order is the
    # order of the returned keys).
    row_to_key = {
        'Slice LUTs': 'LUT',
        'Slice Registers': 'FF',
        'Block RAM Tile': 'BRAM',
        'DSPs': 'DSP',
    }
    result = {key: None for key in row_to_key.values()}

    with open(file_path, 'r') as f:
        for line in f:
            cells = [c.strip() for c in line.strip().split('|')]
            # "| a | b |" splits into ['', 'a', 'b', '']; drop the empty cell
            # produced by the closing pipe so cells[-1] is the last column.
            if cells and cells[-1] == '':
                cells.pop()
            # Leading empty cell + label + at least four value columns.
            if len(cells) < 6:
                continue
            # Some reports mark a label with a trailing footnote asterisk.
            key = row_to_key.get(cells[1].rstrip('*').rstrip())
            if key is None or result[key] is not None:
                continue
            # Strip a percent sign and the "<" of values such as "<0.01".
            util = cells[-1].replace('%', '').lstrip('<').strip()
            result[key] = float(util)

    return result

# Example usage: parse the placed-design utilization report from the hls4ml project.
file_path = 'hls4ml_prj/test/myproject_vivado_accelerator/project_1.runs/impl_1/design_1_wrapper_utilization_placed.rpt'
result = parse_resource_utilization(file_path)
print(result)

{'LUT': 27.47, 'FF': 19.21, 'BRAM': 9.64, 'DSP': 4.55}


In [7]:
def extract_power_values(file_path):
    """Extract total, dynamic and static power from a Vivado power report.

    Every ``|``-delimited row of the file is inspected; the row's first cell
    is matched against ``Total On-Chip Power (W)``, ``Dynamic (W)`` and
    ``Device Static (W)``, and the second cell is parsed as the value. The
    first matching row for each quantity wins, so the result does not depend
    on the table sitting at a fixed line number.

    Args:
        file_path: Path to the power report text file.

    Returns:
        Dict with keys ``'芯片总功耗(W)'``, ``'动态功耗(W)'`` and ``'静态功耗(W)'``.
        Each value is a float in watts as printed in the report, or ``None``
        if the row was not found.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a matching row's value cell is not numeric.
    """
    # Report row label -> key in the returned dict (insertion order is the
    # order of the returned keys).
    label_to_key = {
        'Total On-Chip Power (W)': '芯片总功耗(W)',
        'Dynamic (W)': '动态功耗(W)',
        'Device Static (W)': '静态功耗(W)',
    }
    values = {key: None for key in label_to_key.values()}

    # Iterate the file lazily instead of materializing every line.
    with open(file_path, 'r') as file:
        for line in file:
            cells = line.split('|')
            # "| label | value |" yields at least ['', label, value, ...].
            if len(cells) < 3:
                continue
            for label, key in label_to_key.items():
                if label in cells[1]:
                    if values[key] is None:
                        values[key] = float(cells[2].strip())
                    break

    return values

# Example call: parse the routed-design power report from the hls4ml project.
file_path = 'hls4ml_prj/test/myproject_vivado_accelerator/project_1.runs/impl_1/design_1_wrapper_power_routed.rpt'
power_values = extract_power_values(file_path)
print(power_values)

{'芯片总功耗(W)': 1.541, '动态功耗(W)': 1.402, '静态功耗(W)': 0.139}


In [8]:
import shutil

# Build artifacts and the names they get inside the board folder.
source_bit = "hls4ml_prj/test/myproject_vivado_accelerator/project_1.runs/impl_1/design_1_wrapper.bit"
dest_bit = "pynq-z2/hls4ml_nn.bit"

source_hwh = "hls4ml_prj/test/myproject_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hw_handoff/design_1.hwh"
dest_hwh = "pynq-z2/hls4ml_nn.hwh"

source_driver = "hls4ml_prj/test/axi_stream_driver.py"
dest_driver = "pynq-z2/axi_stream_driver.py"

# Create the destination folder if it is missing.
dest_dir = "pynq-z2"
if not os.path.exists(dest_dir):
    os.makedirs(dest_dir)
    print(f"已创建目标文件夹: {dest_dir}")

# Copy (and rename) the .bit file, the .hwh file and the AXI stream driver, in
# that order. A failed copy is reported and the remaining copies still run.
copy_jobs = (
    (source_bit, dest_bit),
    (source_hwh, dest_hwh),
    (source_driver, dest_driver),
)
for src, dst in copy_jobs:
    try:
        shutil.copy(src, dst)
        print(f"成功复制 '{src}' 到 '{dst}'")
    except Exception as e:
        print(f"复制 '{src}' 到 '{dst}' 时出错: {e}")

成功复制 'hls4ml_prj/test/myproject_vivado_accelerator/project_1.runs/impl_1/design_1_wrapper.bit' 到 'pynq-z2/hls4ml_nn.bit'
成功复制 'hls4ml_prj/test/myproject_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hw_handoff/design_1.hwh' 到 'pynq-z2/hls4ml_nn.hwh'
成功复制 'hls4ml_prj/test/axi_stream_driver.py' 到 'pynq-z2/axi_stream_driver.py'


In [9]:
# Load the predictions from y_hw.npy.
# NOTE(review): presumably produced by running the design on the board and
# copied back into the working directory — confirm where this file comes from.
predictions = np.load("y_hw.npy")
# Predicted class = arg-max over the class axis.
predicted_classes = np.argmax(predictions, axis=1)
# True class = arg-max of the one-hot test labels.
true_classes = np.argmax(y_test, axis=1)
# Compute accuracy with sklearn's accuracy_score.
accuracy = accuracy_score(true_classes, predicted_classes)
print(f"硬件推理准确率: {accuracy * 100:.2f}%")

硬件推理准确率: 93.61%
