In [2]:
import pyopencl as cl
import numpy as np

# Pick the platform and the device: simply the first one of each that the
# OpenCL runtime reports.
platform = cl.get_platforms()[0]
device = platform.get_devices()[0]

# Create a context containing that single device and a command queue on it.
context = cl.Context(devices=[device])
queue = cl.CommandQueue(context)

# OpenCL C kernel source: each work-item prints its global id along x and its
# local id along x, y and z.
kernel_code = """
__kernel void print_id()
{
    int global_id_x = get_global_id(0);
    int local_id_x = get_local_id(0);
    int local_id_y = get_local_id(1);
    int local_id_z = get_local_id(2);
    printf("global_id_x = %d; local_id_x = %d; local_id_y = %d; local_id_z = %d;\\n", 
           global_id_x, local_id_x, local_id_y, local_id_z);
}
"""

# Work sizes. local_size is the per-work-group shape, the counterpart of a
# CUDA block=(6,3,1). global_size is the total number of work-items in each
# dimension, i.e. what CUDA would express as grid=(2,1,1) of such blocks.
local_size = (6, 3, 1)
global_size = (12, 3, 1)

# Compile the kernel source for the context (build() works on the program
# object in place, so the name bound here is the built program).
program = cl.Program(context, kernel_code)
program.build()

# Launch the kernel over the chosen global/local work sizes.
program.print_id(queue, global_size, local_size)

# Wait until every command submitted to the queue has completed.
queue.finish()

global_id_x = 0; local_id_x = 0; local_id_y = 0; local_id_z = 0;
global_id_x = 1; local_id_x = 1; local_id_y = 0; local_id_z = 0;
global_id_x = 2; local_id_x = 2; local_id_y = 0; local_id_z = 0;
global_id_x = 3; local_id_x = 3; local_id_y = 0; local_id_z = 0;
global_id_x = 4; local_id_x = 4; local_id_y = 0; local_id_z = 0;
global_id_x = 5; local_id_x = 5; local_id_y = 0; local_id_z = 0;
global_id_x = 0; local_id_x = 0; local_id_y = 1; local_id_z = 0;
global_id_x = 1; local_id_x = 1; local_id_y = 1; local_id_z = 0;
global_id_x = 2; local_id_x = 2; local_id_y = 1; local_id_z = 0;
global_id_x = 3; local_id_x = 3; local_id_y = 1; local_id_z = 0;
global_id_x = 4; local_id_x = 4; local_id_y = 1; local_id_z = 0;
global_id_x = 5; local_id_x = 5; local_id_y = 1; local_id_z = 0;
global_id_x = 0; local_id_x = 0; local_id_y = 2; local_id_z = 0;
global_id_x = 1; local_id_x = 1; local_id_y = 2; local_id_z = 0;
global_id_x = 2; local_id_x = 2; local_id_y = 2; local_id_z = 0;
global_id_x = 3; local_id