In [32]:
# All imports grouped at the top of the cell (stdlib, then third-party) so a
# fresh-kernel run fails fast on a missing dependency instead of half-way
# through the cell.
import os
import time

import numpy as np
from pynq import Overlay, allocate

# The data files and the bitstream below are resolved relative to the current
# working directory, so show it for reference.
print(os.getcwd())

# Load test cases for FPGA validation
# SECURITY: allow_pickle=True lets np.load unpickle arbitrary Python objects,
# which can execute code. Only load files from a trusted source.
# NOTE(review): if these .npy files hold plain numeric arrays (not object
# arrays), allow_pickle can be dropped — confirm against the script that
# generated them.
loaded_test_cases = np.load("../test_cases.npy", allow_pickle=True)
loaded_golden_results = np.load("../golden_results.npy", allow_pickle=True)

# Validate on FPGA
# Load the FPGA bitstream and grab a handle to the matmul IP inside it.
# NOTE(review): the handle is named `baseline` although the bitstream and IP
# are both called "optimized" — the name is kept because later cells use it.
overlay_baseline = Overlay("matmult_optimized.bit")
baseline = overlay_baseline.matmul_optimized_0

/root/jupyter_notebooks/getting_started/matmul_optimized_64


In [33]:
# Run every loaded test case through the FPGA matmul IP and compare the result
# with the matching golden matrix.
#
# NOTE(review): the output recorded for this cell shows the trailing rows of
# the result as all zeros and a max difference of ~23.9, so the kernel does not
# appear to produce a full 64x64 product. Possibly the bitstream was built for
# a different matrix size — confirm against the HLS source.

MATRIX_DIM = 64                      # matrices are MATRIX_DIM x MATRIX_DIM
NUM_ELEMS = MATRIX_DIM * MATRIX_DIM  # length of each flattened float32 buffer
CTRL_OFFSET = 0x00                   # control register holding AP_START / AP_DONE
AP_START_BIT = 0x1
AP_DONE_BIT = 0x2
DONE_TIMEOUT_S = 5.0                 # give up instead of hanging the kernel forever
MAX_ABS_TOL = 1e-3                   # NOTE(review): pass/fail threshold is a guess — tune for the design

# Fail early (before touching the hardware) if a test case has no golden result.
if len(loaded_golden_results) < len(loaded_test_cases):
    raise ValueError(
        f"{len(loaded_test_cases)} test cases but only "
        f"{len(loaded_golden_results)} golden results"
    )

buffers = []
failed_cases = []
try:
    # Allocate the three buffers once and reuse them for every test case;
    # the finally-clause releases whatever was allocated even if a run fails.
    for _ in range(3):
        buffers.append(allocate(shape=(NUM_ELEMS,), dtype='f4'))
    input1_buffer, input2_buffer, output_buffer = buffers

    # The buffer addresses do not change between runs, so program them once.
    # Each 64-bit physical address is split across a low and a high 32-bit register.
    regs = baseline.register_map
    for lo_reg, hi_reg, buf in (
        (regs.A_1, regs.A_2, input1_buffer),
        (regs.B_1, regs.B_2, input2_buffer),
        (regs.Out_r_1, regs.Out_r_2, output_buffer),
    ):
        phys_addr = buf.physical_address
        baseline.write(lo_reg.address, phys_addr & 0xFFFFFFFF)
        baseline.write(hi_reg.address, (phys_addr >> 32) & 0xFFFFFFFF)

    for idx, (A, B) in enumerate(loaded_test_cases):
        # Flatten matrices and copy them into the device-visible buffers.
        np.copyto(input1_buffer, A.flatten())
        np.copyto(input2_buffer, B.flatten())
        # Clear the output so values from the previous test case cannot be
        # mistaken for results the kernel failed to write this time.
        output_buffer[:] = 0

        # Sync buffers to device
        input1_buffer.sync_to_device()
        input2_buffer.sync_to_device()
        output_buffer.sync_to_device()
        print("input1_buffer:", input1_buffer)
        print("input2_buffer:", input2_buffer)

        # Start the timer BEFORE kicking off the kernel so the measurement
        # covers the whole run, then start the FPGA computation.
        start_time = time.perf_counter()
        baseline.write(CTRL_OFFSET, AP_START_BIT)

        # Wait for the AP_DONE bit, with a timeout so a stuck kernel is reported.
        deadline = start_time + DONE_TIMEOUT_S
        while (baseline.read(CTRL_OFFSET) & AP_DONE_BIT) == 0:
            if time.perf_counter() > deadline:
                raise TimeoutError(
                    f"Test Case {idx + 1}: AP_DONE not set after {DONE_TIMEOUT_S}s"
                )

        end_time = time.perf_counter()
        # Includes the Python-side register access overhead of the polling loop.
        baseline_time = end_time - start_time
        print(f"HW mul (baseline) exe time: {baseline_time:.6f}s")

        # Sync result buffer from device
        output_buffer.sync_from_device()

        print("output_buffer:", output_buffer)

        # Reshape FPGA output to a square matrix (a view onto output_buffer).
        fpga_result = output_buffer.reshape((MATRIX_DIM, MATRIX_DIM))
        print("Output Matrix:", fpga_result)

        # Compare FPGA result with golden result
        golden_result = loaded_golden_results[idx]
        diff = np.abs(fpga_result - golden_result)
        max_diff = np.max(diff)
        print(f"Test Case {idx + 1}: Max Difference = {max_diff}")
        if not max_diff <= MAX_ABS_TOL:  # written this way so NaN also counts as a failure
            failed_cases.append(idx + 1)
finally:
    # Free buffers
    # NOTE(review): after a timeout the kernel may still be running and
    # accessing these buffers — consider resetting the overlay in that case.
    for buf in buffers:
        buf.freebuffer()

print("Validation complete.")
if failed_cases:
    print(f"FAILED test cases (max diff > {MAX_ABS_TOL}): {failed_cases}")
else:
    print(f"All test cases within tolerance {MAX_ABS_TOL}.")


input1_buffer: [0.81966764 0.1675325  0.04471349 ... 0.4347821  0.74261725 0.9171499 ]
input2_buffer: [0.8811151  0.4946358  0.4713695  ... 0.20226885 0.82889223 0.3881654 ]
HW mul (baseline) exe time: 0.000025s
output_buffer: [3.5277185 2.6486468 3.8880181 ... 0.        0.        0.       ]
Output Matrix: [[3.5277185 2.6486468 3.8880181 ... 4.1817803 4.6134257 4.15673  ]
 [4.687649  3.0642028 4.715063  ... 3.8831003 3.6834211 3.0707989]
 [5.1859684 4.358172  4.946903  ... 3.9902368 3.4120994 3.9436033]
 ...
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]]
Test Case 1: Max Difference = 23.85724639892578
input1_buffer: [0.14156799 0.97300345 0.8477219  ... 0.5446532  0.91756237 0.26658309]
input2_buffer: [0.58535784 0.39883885 0.91090226 ... 0.57346183 0.8548178  0.41129366]
HW mul (baseline) exe time: 0.000023s
output_buffer: [3.010264

In [34]:
# Display the register map of the matmul IP (control/interrupt registers plus
# the low/high address registers for A, B and Out_r) as the cell's output.
baseline.register_map

RegisterMap {
  CTRL = Register(AP_START=0, AP_DONE=0, AP_IDLE=1, AP_READY=0, RESERVED_1=0, AUTO_RESTART=0, RESERVED_2=0, INTERRUPT=0, RESERVED_3=0),
  GIER = Register(Enable=0, RESERVED=0),
  IP_IER = Register(CHAN0_INT_EN=0, CHAN1_INT_EN=0, RESERVED_0=0),
  IP_ISR = Register(CHAN0_INT_ST=0, CHAN1_INT_ST=0, RESERVED_0=0),
  A_1 = Register(A=write-only),
  A_2 = Register(A=write-only),
  B_1 = Register(B=write-only),
  B_2 = Register(B=write-only),
  Out_r_1 = Register(Out_r=write-only),
  Out_r_2 = Register(Out_r=write-only)
}