In [1]:
#!/usr/bin/env python3
"""
PYNQ Script to Run Marching Cubes HLS IP with DMA
FIXED VERSION - Works around DMA buffer size limitation

This version addresses the "Transfer size exceeds maximum DMA buffer size" error
by programming the AXI DMA registers directly (through MMIO) instead of calling
the PYNQ DMA transfer() method.
"""

import numpy as np
from pynq import Overlay, allocate, MMIO
import time

# =============================================================================
# Configuration
# =============================================================================
# Bitstream file handed to pynq.Overlay() in main().
BITSTREAM_PATH = "marching_cubes.bit"
# Input volume; read with np.load() by load_npy_volume().
NPY_FILE = "case_00000_x2.npy"
# Destination of the ASCII VTK mesh produced by write_vtk().
OUTPUT_VTK = "output_pynq.vtk"
# Iso-surface threshold; main() writes its float32 bit pattern to the IP.
ISOVALUE = 1.3  # Adjust this based on your data

# Maximum sizes - these are the array sizes, not DMA transfer sizes.
# main() allocates the output buffers as (MAX_VERTICES, 3) float32 and
# (MAX_TRIANGLES, 3) uint32, i.e. 12 bytes per vertex / per triangle.
MAX_VERTICES = 1000000
MAX_TRIANGLES = 2000000

# DMA hardware limitation (default 14-bit buffer length register = 16383 bytes)
# 16383 == 2**14 - 1.  main() prints this value for reference.
DMA_MAX_TRANSFER = 16383

# =============================================================================
# Helper Functions
# =============================================================================

def load_npy_volume(filename):
    """Load a scalar volume from an ``.npy`` file.

    The stored array must be 3-D ``(nz, ny, nx)`` or 4-D
    ``(batch, nz, ny, nx)``.  For 4-D input only the first batch entry is
    used; a warning is printed when that discards other entries.

    Args:
        filename: Path of the ``.npy`` file to read.

    Returns:
        Tuple ``(data, nx, ny, nz)`` where ``data`` is a float32 array of
        shape ``(nz, ny, nx)``.

    Raises:
        ValueError: If the array is neither 3-D nor 4-D.
    """
    data = np.load(filename)
    original_shape = data.shape

    print(f"Loaded NPY file: {filename}")
    print(f"  Original shape: {original_shape}")
    print(f"  Data type: {data.dtype}")
    print(f"  Min/Max: {data.min():.4f} / {data.max():.4f}")

    # Expect shape (1, nz, ny, nx) or (nz, ny, nx)
    if data.ndim == 4:
        if data.shape[0] != 1:
            # Make the truncation visible instead of dropping data silently.
            print(f"  Warning: batch of {data.shape[0]} volumes found; "
                  f"using only the first")
        data = data[0]  # Remove batch dimension

    # Fail with an actionable message rather than a bare tuple-unpack error.
    if data.ndim != 3:
        raise ValueError(
            "Expected a 3-D (nz, ny, nx) or 4-D (batch, nz, ny, nx) volume, "
            f"got shape {original_shape}"
        )

    nz, ny, nx = data.shape

    # Convert to float32 if needed (copy=False leaves float32 input untouched)
    data = data.astype(np.float32, copy=False)

    print(f"  Volume dimensions: {nx} x {ny} x {nz}")

    return data, nx, ny, nz


def write_vtk(filename, vertices, triangles):
    """Serialize a triangle mesh as an ASCII legacy-VTK POLYDATA file.

    Args:
        filename: Path of the file to create (overwritten if it exists).
        vertices: Sequence of points; elements 0..2 of each are written as
            one ``x y z`` line.
        triangles: Sequence of triangles; elements 0..2 of each are written
            as the three point indices of one polygon.

    A short summary (file name and element counts) is printed afterwards.
    """
    n_points = len(vertices)
    n_cells = len(triangles)

    header = (
        "# vtk DataFile Version 3.0\n"
        "Marching Cubes Output from PYNQ\n"
        "ASCII\n"
        "DATASET POLYDATA\n"
    )

    with open(filename, 'w') as out:
        out.write(header)

        # Point section: one "x y z" line per vertex.
        out.write(f"POINTS {n_points} float\n")
        out.writelines(f"{p[0]} {p[1]} {p[2]}\n" for p in vertices)

        # Cell section: each triangle takes 4 integers (the count 3 + 3 ids).
        out.write(f"\nPOLYGONS {n_cells} {n_cells * 4}\n")
        out.writelines(f"3 {tri[0]} {tri[1]} {tri[2]}\n" for tri in triangles)

    for line in (
        f"VTK file saved: {filename}",
        f"  Vertices: {n_points}",
        f"  Triangles: {n_cells}",
    ):
        print(line)


def configure_dma_manual(dma, buffer_addr, buffer_size):
    """
    Manually arm the S2MM channel of an AXI DMA without using transfer().

    The channel is reset, started (Run + interrupt-on-complete enable), given
    the destination address, and finally given the byte count; writing the
    length register is what starts the transfer.  This bypasses the buffer
    size check performed by the PYNQ driver's transfer() method.

    Args:
        dma: Register accessor exposing ``read(offset)`` and
            ``write(offset, value)`` (main() passes a pynq ``MMIO``).
        buffer_addr: Physical address of the destination buffer.
        buffer_size: Maximum number of bytes the channel may receive.

    Raises:
        ValueError: If ``buffer_size`` is not positive or exceeds the widest
            length register the core supports (26 bits), or if
            ``buffer_addr`` does not fit in 64 bits.  Nothing is written to
            the hardware in that case.
        TimeoutError: If the core does not leave reset, or the channel does
            not leave the halted state, within one second.
    """
    # DMA Register offsets for S2MM
    S2MM_DMACR = 0x30    # Control register
    S2MM_DMASR = 0x34    # Status register
    S2MM_DA = 0x48       # Destination address (low 32 bits)
    S2MM_DA_MSB = 0x4C   # Destination address (high 32 bits)
    S2MM_LENGTH = 0x58   # Buffer length

    DMACR_RESET = 0x00000004
    DMACR_RUN_IOC_IRQ = 0x00001001   # Run + IOC_IrqEn
    DMASR_HALTED = 0x00000001

    # The length register is at most 26 bits wide; a larger value would be
    # truncated by the hardware.  NOTE(review): a bitstream built with a
    # narrower register (default 14 bits) truncates earlier -- this cannot be
    # detected from software here.
    max_length = (1 << 26) - 1
    poll_timeout = 1.0   # seconds
    poll_interval = 0.001

    if not 0 < buffer_size <= max_length:
        raise ValueError(
            f"buffer_size must be in 1..{max_length} bytes, got {buffer_size}"
        )
    if buffer_addr < 0 or buffer_addr >> 64:
        raise ValueError(f"buffer_addr out of range: {buffer_addr:#x}")

    def wait_for(offset, mask, expected, what):
        """Poll a register until ``value & mask == expected`` or time out."""
        deadline = time.monotonic() + poll_timeout
        while (dma.read(offset) & mask) != expected:
            if time.monotonic() > deadline:
                raise TimeoutError(f"AXI DMA S2MM: timed out waiting for {what}")
            time.sleep(poll_interval)

    # Reset DMA; the reset bit reads back as 1 while the reset is in progress,
    # so poll it instead of sleeping for a fixed time.
    dma.write(S2MM_DMACR, DMACR_RESET)
    wait_for(S2MM_DMACR, DMACR_RESET, 0, "soft reset to complete")

    # Configure: Run + IOC_IrqEn, then make sure the channel actually left
    # the halted state before programming the transfer.
    dma.write(S2MM_DMACR, DMACR_RUN_IOC_IRQ)
    wait_for(S2MM_DMASR, DMASR_HALTED, 0, "channel to start running")

    # Set destination address
    dma.write(S2MM_DA, buffer_addr & 0xFFFFFFFF)
    dma.write(S2MM_DA_MSB, (buffer_addr >> 32) & 0xFFFFFFFF)

    # Set length - this triggers the transfer, so it must be written last
    dma.write(S2MM_LENGTH, buffer_size)


# =============================================================================
# Main Script
# =============================================================================

def main():
    """Run the marching-cubes HLS IP end to end and save the mesh as VTK.

    Steps: load the overlay, load the input volume, allocate contiguous
    buffers, program the IP's AXI-Lite registers, arm both S2MM DMA channels
    through direct register writes, start the IP, poll for ap_done, read the
    vertex/triangle counts and write the mesh to ``OUTPUT_VTK``.

    Every failure is reported on stdout and aborts the run early; the
    function always returns ``None``.
    """
    print("=" * 60)
    print("  Marching Cubes HLS on PYNQ (FIXED VERSION)")
    print("=" * 60)
    print()

    print("NOTE: This version works around DMA buffer size limitations")
    print()

    # -------------------------------------------------------------------------
    # Step 1: Load Overlay
    # -------------------------------------------------------------------------
    print("Step 1: Loading bitstream...")
    try:
        overlay = Overlay(BITSTREAM_PATH)
    except Exception as e:
        print(f"Error loading bitstream: {e}")
        return
    print(f"  Bitstream loaded: {BITSTREAM_PATH}")

    # Print IP dictionary to see available components
    print("\nAvailable IPs in overlay:")
    for ip_name in overlay.ip_dict:
        print(f"  - {ip_name}")
    print()

    # Access the IP core and DMAs
    try:
        mc_ip = overlay.marching_cubes_hls_0  # Main IP core
        dma_vertex = overlay.axi_dma_0        # DMA for vertices
        dma_triangle = overlay.axi_dma_1      # DMA for triangles
    except AttributeError:
        print("Error: Cannot find IP cores. Available IPs:")
        for ip_name in overlay.ip_dict:
            print(f"  - {ip_name}")
        print("\nPlease update IP names in the script")
        return
    print("IP cores accessed successfully")

    # -------------------------------------------------------------------------
    # Step 2: Load Input Data
    # -------------------------------------------------------------------------
    print("\nStep 2: Loading input data...")
    try:
        volume_data, nx, ny, nz = load_npy_volume(NPY_FILE)
    except Exception as e:
        print(f"Error loading NPY file: {e}")
        return

    # Check dimension limits
    if nx > 128 or ny > 128 or nz > 128:
        print("Warning: Dimensions exceed design limits (128x128x128)")
        print(f"Current: {nx}x{ny}x{nz}")

    # -------------------------------------------------------------------------
    # Step 3: Allocate Buffers
    # -------------------------------------------------------------------------
    print("\nStep 3: Allocating memory buffers...")

    # Input buffer: volume data (flatten to 1D array)
    volume_buffer = allocate(shape=(nz * ny * nx,), dtype=np.float32)
    volume_buffer[:] = volume_data.ravel()
    # The IP fetches the volume itself through the address programmed below,
    # so the CPU-side writes must be pushed out to memory first (mirrors the
    # sync_from_device() calls done on the output buffers).
    volume_buffer.sync_to_device()
    print(f"  Input buffer allocated: {volume_buffer.shape}")

    # Output buffers - Allocate full size but we'll handle DMA limitation
    vertex_buffer = allocate(shape=(MAX_VERTICES, 3), dtype=np.float32)
    triangle_buffer = allocate(shape=(MAX_TRIANGLES, 3), dtype=np.uint32)

    print(f"  Vertex buffer allocated: {vertex_buffer.shape}")
    print(f"  Triangle buffer allocated: {triangle_buffer.shape}")
    print(f"  Note: DMA limitation = {DMA_MAX_TRANSFER} bytes per transfer")

    # -------------------------------------------------------------------------
    # Step 4: Configure IP Core
    # -------------------------------------------------------------------------
    print("\nStep 4: Configuring IP core...")

    # Set input parameters through AXI-Lite registers
    mc_ip.write(0x10, volume_buffer.physical_address & 0xFFFFFFFF)
    mc_ip.write(0x14, volume_buffer.physical_address >> 32)
    mc_ip.write(0x1c, nx)
    mc_ip.write(0x24, ny)
    mc_ip.write(0x2c, nz)

    # Set isovalue (float to int representation)
    isovalue_int = np.float32(ISOVALUE).view(np.uint32)
    mc_ip.write(0x34, int(isovalue_int))

    print("  Parameters set:")
    print(f"    Volume address: 0x{volume_buffer.physical_address:x}")
    print(f"    Dimensions: {nx} x {ny} x {nz}")
    print(f"    Isovalue: {ISOVALUE}")

    # -------------------------------------------------------------------------
    # Step 5: Configure DMAs Manually
    # -------------------------------------------------------------------------
    print("\nStep 5: Configuring DMAs (manual method)...")

    # Use manual DMA configuration to bypass buffer size check.
    # Each vertex / triangle is three 4-byte values = 12 bytes.  The length
    # is capped at 0x3FFFFFF (64MB - 1); the actual transfer is expected to
    # stop when the IP signals completion.
    bytes_per_element = 12
    max_vertex_bytes = min(MAX_VERTICES * bytes_per_element, 0x3FFFFFF)
    max_triangle_bytes = min(MAX_TRIANGLES * bytes_per_element, 0x3FFFFFF)

    # Number of whole elements the armed DMA transfers can deliver.
    vertex_capacity = max_vertex_bytes // bytes_per_element
    triangle_capacity = max_triangle_bytes // bytes_per_element

    print(f"  Configuring vertex DMA for up to {max_vertex_bytes} bytes")
    print(f"  Configuring triangle DMA for up to {max_triangle_bytes} bytes")

    try:
        # Access DMA MMIO directly
        vertex_dma_mmio = MMIO(dma_vertex.mmio.base_addr, dma_vertex.mmio.length)
        triangle_dma_mmio = MMIO(dma_triangle.mmio.base_addr, dma_triangle.mmio.length)

        # Configure DMAs manually
        configure_dma_manual(vertex_dma_mmio, vertex_buffer.physical_address, max_vertex_bytes)
        configure_dma_manual(triangle_dma_mmio, triangle_buffer.physical_address, max_triangle_bytes)
    except Exception as e:
        # There is no working fallback: without armed DMAs nothing can
        # capture the IP's output, so starting the IP would be pointless.
        print(f"  Error: Manual DMA config failed: {e}")
        print("  Aborting: output DMAs are not armed, results cannot be captured")
        return
    print("  DMAs configured successfully")

    # -------------------------------------------------------------------------
    # Step 6: Start IP Core
    # -------------------------------------------------------------------------
    print("\nStep 6: Starting IP core...")
    start_time = time.perf_counter()

    # Start the IP (write 1 to ap_start bit)
    mc_ip.write(0x00, 0x01)
    print("  IP core started")

    # -------------------------------------------------------------------------
    # Step 7: Wait for Completion
    # -------------------------------------------------------------------------
    print("\nStep 7: Waiting for completion...")

    # Poll the ap_done bit (bit 1 of control register).  The deadline is
    # wall-clock based and the interval short, so the reported execution
    # time is not rounded up to a coarse polling step.
    timeout = 30.0  # 30 seconds timeout
    poll_interval = 0.001
    deadline = start_time + timeout

    while True:
        ctrl_reg = mc_ip.read(0x00)
        if ctrl_reg & 0x02:  # ap_done bit
            end_time = time.perf_counter()
            print("  IP core completed!")
            break
        if time.perf_counter() >= deadline:
            print("  Timeout waiting for IP core!")
            return
        time.sleep(poll_interval)

    execution_time = (end_time - start_time) * 1000  # Convert to ms

    # -------------------------------------------------------------------------
    # Step 8: Read Results
    # -------------------------------------------------------------------------
    print("\nStep 8: Reading results...")

    # Read output counts from IP registers
    num_vertices = mc_ip.read(0x3c)
    num_triangles = mc_ip.read(0x4c)

    print(f"  Execution time: {execution_time:.2f} ms")
    print(f"  Generated vertices: {num_vertices}")
    print(f"  Generated triangles: {num_triangles}")

    # The buffers cannot hold more than their capacity; report truncation
    # instead of letting the slices below clip silently.
    valid_vertices = min(num_vertices, vertex_capacity)
    valid_triangles = min(num_triangles, triangle_capacity)
    if valid_vertices < num_vertices:
        print(f"  WARNING: vertex count exceeds buffer capacity "
              f"({vertex_capacity}); output truncated")
    if valid_triangles < num_triangles:
        print(f"  WARNING: triangle count exceeds buffer capacity "
              f"({triangle_capacity}); output truncated")

    # Give DMAs a moment to complete
    time.sleep(0.5)

    # Sync from device (coherency)
    vertex_buffer.sync_from_device()
    triangle_buffer.sync_from_device()

    print("  Data synchronized from device")

    # -------------------------------------------------------------------------
    # Step 9: Extract and Save Results
    # -------------------------------------------------------------------------
    print("\nStep 9: Saving results...")

    if valid_vertices > 0 and valid_triangles > 0:
        # Extract valid data (copies, so they outlive the DMA buffers)
        vertices = vertex_buffer[:valid_vertices].copy()
        triangles = triangle_buffer[:valid_triangles].copy()

        # Verify data integrity
        if np.any(np.isnan(vertices)) or np.any(np.isinf(vertices)):
            print("  WARNING: Invalid vertex data detected")

        # Indices must refer to vertices that were actually stored.
        if triangles.max() >= valid_vertices:
            print("  WARNING: Invalid triangle indices detected")

        # Calculate bounding box
        min_coords = vertices.min(axis=0)
        max_coords = vertices.max(axis=0)
        print("  Bounding box:")
        print(f"    X: [{min_coords[0]:.2f}, {max_coords[0]:.2f}]")
        print(f"    Y: [{min_coords[1]:.2f}, {max_coords[1]:.2f}]")
        print(f"    Z: [{min_coords[2]:.2f}, {max_coords[2]:.2f}]")

        # Save to VTK
        write_vtk(OUTPUT_VTK, vertices, triangles)
    else:
        print("  Warning: No geometry generated!")
        print("  Possible reasons:")
        print("    - Isovalue outside data range")
        print("    - DMA did not capture data")
        print("    - Check DMA connections in hardware")

    # -------------------------------------------------------------------------
    # Cleanup
    # -------------------------------------------------------------------------
    print("\nCleaning up...")
    del volume_buffer
    del vertex_buffer
    del triangle_buffer

    print("\n" + "=" * 60)
    print("  Completed!")
    print("=" * 60)
    print()

    if num_vertices == 0:
        print("TROUBLESHOOTING:")
        print("  If you got 0 vertices, try:")
        print("  1. Check isovalue is within data range")
        print("  2. Verify DMA connections in Vivado block design")
        print("  3. Check if DMAs are properly connected to HP ports")
        print("  4. Increase DMA buffer length register width in Vivado:")
        print("     - Double-click DMA IP")
        print("     - Set 'Width of Buffer Length Register' to 23 or 26")
        print("     - Regenerate bitstream")


# Script entry point: run main() and turn any failure into a readable report
# instead of an unhandled exception.
if __name__ == "__main__":
    import traceback

    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C is an ordinary way to stop a run; no traceback needed.
        print("\n\nInterrupted by user")
    except Exception as err:
        print(f"\nError: {err}")
        traceback.print_exc()

  Marching Cubes HLS on PYNQ (FIXED VERSION)

NOTE: This version works around DMA buffer size limitations

Step 1: Loading bitstream...


  Bitstream loaded: marching_cubes.bit

Available IPs in overlay:
  - axi_dma_0
  - axi_dma_1
  - marching_cubes_hls_0
  - zynq_ultra_ps_e_0

IP cores accessed successfully

Step 2: Loading input data...
Loaded NPY file: case_00000_x2.npy
  Original shape: (1, 128, 128, 128)
  Data type: float32
  Min/Max: -2.0000 / 3.3333
  Volume dimensions: 128 x 128 x 128

Step 3: Allocating memory buffers...
  Input buffer allocated: (2097152,)
  Vertex buffer allocated: (1000000, 3)
  Triangle buffer allocated: (2000000, 3)
  Note: DMA limitation = 16383 bytes per transfer

Step 4: Configuring IP core...
  Parameters set:
    Volume address: 0x37900000
    Dimensions: 128 x 128 x 128
    Isovalue: 1.3

Step 5: Configuring DMAs (manual method)...
  Configuring vertex DMA for up to 12000000 bytes
  Configuring triangle DMA for up to 24000000 bytes
  DMAs configured successfully

Step 6: Starting IP core...
  IP core started

Step 7: Waiting for completion...
  IP core completed!

Step 8: Reading re