In [1]:
import numpy as np
import pyopencl as cl

  warn("Unable to import recommended hash 'siphash24.siphash13', "


In [2]:
# OpenCL Kernel
kernel_code = '''
__kernel void convolutionGPU(
    __global float* output, __global float* input, __global float* filter, int filter_radius, int input_width, int input_height){
        int col = get_global_id(1);
        int row = get_global_id(0);
        int idx = row + input_width * col;

        int filter_width = 2 * filter_radius + 1;
        if (row < input_width && col < input_height) { // 边界检查
            float sum = 0;
            float value = 0;
            for (int i = -filter_radius; i  <= filter_radius; i++){
                for (int j = -filter_radius; j <= filter_radius; j++) {
                    int r = row + j;
                    int c = col + j;
                    if ( r < 0 || c < 0 || r >= input_width || c >= input_height) {
                        value = 0;
                    } else {
                        int input_idx = r + c * input_width;
                        value = input[input_idx];
                    }
                    int filter_idx = (i + filter_radius) + (j + filter_radius) * filter_width;
                    sum += value * filter[filter_idx];
                }
            }
            output[idx] = sum;
        }
    }
'''

In [8]:
def convolution_opencl(input, filter):
    # OpenCL setup
    platform = cl.get_platforms()[0]
    device = platform.get_devices()[0]
    context = cl.Context([device])
    queue = cl.CommandQueue(context)

    # Prepare data
    filter_radius = np.int32(filter.shape[0] // 2)
    input = np.float32(input)
    filter = np.float32(filter)
    output = np.empty_like(input)

    input_width = np.int32(input.shape[1])
    input_height = np.int32(input.shape[0])
    # filter_width = np.int32(filter.shape[0])

    # Allocate device memory
    mf = cl.mem_flags
    input_buf = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=input)
    filter_buf = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=filter)
    output_buf = cl.Buffer(context, mf.WRITE_ONLY, output.nbytes)

    # Compile kernel
    program = cl.Program(context, kernel_code).build()

    # Execute kernel
    global_size = (input_width, input_height)
    local_size = None # Let OpenCL determine the best local size

    program.convolutionGPU(queue, global_size, local_size, output_buf, input_buf, filter_buf, filter_radius, input_width, input_height)

    # Copy result back to host
    cl.enqueue_copy(queue, output, output_buf).wait()

    return output

In [9]:
def test_convolution_opencl():
    input = np.array([[1,1,1,0,0], [0,1,1,1,0], [0,0,1,1,1], [0,0,1,1,0], [0,1,1,0,0]], dtype=np.float32)

    print('Input:')
    print(input)

    filter = np.array([[1,0,1], [0,1,0], [1,0,1]], dtype=np.float32)

    output = convolution_opencl(input, filter)
    print('Done running the convolution kernel!')
    print('Output:')
    print(output)


In [10]:
test_convolution_opencl()

Input:
[[1. 1. 1. 0. 0.]
 [0. 1. 1. 1. 0.]
 [0. 0. 1. 1. 1.]
 [0. 0. 1. 1. 0.]
 [0. 1. 1. 0. 0.]]
Done running the convolution kernel!
Output:
[[3. 3. 3. 0. 0.]
 [0. 5. 5. 5. 0.]
 [0. 2. 5. 3. 3.]
 [2. 2. 1. 3. 2.]
 [0. 1. 1. 2. 2.]]
