In [1]:
import cupy as cp

In [2]:
# CUDA kernel: for every flattened element i in [0, nvars * nx), take the
# minimum of the `ninterps` consecutive values b[i * ninterps : (i + 1) * ninterps]
# and store a[i] + that minimum in out[i].
mykernel = cp.RawKernel(
    """
    extern "C" __global__
    void mykernel(
        const double* a,
        const double* b,
        double* out,
        const int nvars,
        const int nx,
        const int ninterps
    ) {
        // a, out : read/written at flattened index i, 0 <= i < nvars * nx.
        // b      : element i owns the ninterps consecutive doubles that start
        //          at b[i * ninterps].
        // ninterps must be >= 1: row[0] is read unconditionally below.
        int tid = (int)(blockIdx.x * blockDim.x + threadIdx.x);
        int stride = (int)(blockDim.x * gridDim.x);

        int n = nvars * nx;

        // Each thread starts at its global index and advances by the total
        // number of launched threads, so every i < n is visited regardless of
        // how many blocks were launched.
        for (int i = tid; i < n; i += stride) {
            // size_t arithmetic keeps the row offset from overflowing int.
            const double* row = b + ((size_t)i) * (size_t)ninterps;

            // Running minimum over the ninterps values of element i.
            double m = row[0];
            for (int j = 1; j < ninterps; ++j) {
                double vj = row[j];
                m = (vj < m) ? vj : m;
            }

            out[i] = a[i] + m;
        }
    }
    """,
    "mykernel",
)

In [3]:
# Problem dimensions for the demo launch of `mykernel`.
nvars = 5
nx = 16
ninterps = 3

# The kernel reads/writes `double`, so keep every array float64 (stated
# explicitly because CuPy does not validate raw-kernel argument dtypes).
a = cp.ones((nvars, nx), dtype=cp.float64)
b = cp.ones((nvars, nx, ninterps), dtype=cp.float64)
out = cp.empty((nvars, nx), dtype=cp.float64)

# The kernel assigns one thread per (var, x) element, so the grid is sized by
# n = nvars * nx (ceil-divided by the block size), not by ninterps.
n = nvars * nx
threads = 256
blocks = (n + threads - 1) // threads
# Cap the grid size (the kernel's stride loop picks up any remaining
# elements) and keep at least one block so the launch stays valid when n == 0.
blocks = max(1, min(blocks, 65535))

In [4]:
# The kernel declares nvars / nx / ninterps as 32-bit `int`. CuPy passes plain
# Python ints to raw kernels as 64-bit `long long` and performs no argument
# validation, so cast the scalars to int32 to match the kernel signature.
mykernel(
    (blocks,),
    (threads,),
    (a, b, out, cp.int32(nvars), cp.int32(nx), cp.int32(ninterps)),
)
out

array([[2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
       [2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
       [2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
       [2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
       [2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.]])

In [27]:
# CUDA kernel: for every flattened element i in [0, nvars * nx), with
#   mj = min(wj[i * ninterps : (i + 1) * ninterps])
#   Mj = max(wj[i * ninterps : (i + 1) * ninterps])
# it stores
#   out[i] = min(1,
#                |M[i] - w[i]| / (|Mj - w[i]| + eps),
#                |m[i] - w[i]| / (|mj - w[i]| + eps))
compute_theta_kernel = cp.RawKernel(
    """
    extern "C" __global__
    void compute_theta_kernel(
        const double* w,
        const double* wj,
        const double* M,
        const double* m,
        double* out,
        const int nvars,
        const int nx,
        const int ninterps,
        const double eps
    ) {
        // w, M, m, out : read/written at flattened index i, 0 <= i < nvars * nx.
        // wj           : element i owns the ninterps consecutive doubles that
        //                start at wj[i * ninterps].
        // eps          : added to each denominator.
        // ninterps must be >= 1: row[0] is read unconditionally below.
        int tid = (int)(blockIdx.x * blockDim.x + threadIdx.x);
        int stride = (int)(blockDim.x * gridDim.x);

        int n = nvars * nx;

        // Each thread starts at its global index and advances by the total
        // number of launched threads, so every i < n is visited regardless of
        // how many blocks were launched.
        for (int i = tid; i < n; i += stride) {
            // size_t arithmetic keeps the row offset from overflowing int.
            const double* row = wj + ((size_t)i) * (size_t)ninterps;

            // Running min / max over the ninterps values of element i.
            double mj = row[0];
            double Mj = row[0];
            for (int j = 1; j < ninterps; ++j) {
                double vj = row[j];
                mj = (vj < mj) ? vj : mj;
                Mj = (vj > Mj) ? vj : Mj;
            }

            // Accumulate in a local and store once: out[i] is no longer used
            // as scratch space, so the result does not change if `out`
            // happens to alias one of the input arrays.
            const double wi = w[i];
            double theta = 1.0;
            theta = fmin(fabs(M[i] - wi) / (fabs(Mj - wi) + eps), theta);
            theta = fmin(fabs(m[i] - wi) / (fabs(mj - wi) + eps), theta);
            out[i] = theta;
        }
    }
    """,
    "compute_theta_kernel",
)

In [30]:
# Problem dimensions for the demo launch of `compute_theta_kernel`.
nvars = 5
nx = 16
ninterps = 3

# The kernel reads/writes `double`, so keep every array float64 (stated
# explicitly because CuPy does not validate raw-kernel argument dtypes).
w = cp.ones((nvars, nx), dtype=cp.float64)
wj = cp.ones((nvars, nx, ninterps), dtype=cp.float64)
M = cp.full((nvars, nx), 1.01, dtype=cp.float64)
m = cp.full((nvars, nx), 0.99, dtype=cp.float64)
out = cp.empty((nvars, nx), dtype=cp.float64)

# The kernel assigns one thread per (var, x) element, so the grid is sized by
# n = nvars * nx (ceil-divided by the block size), not by ninterps.
n = nvars * nx
threads = 256
blocks = (n + threads - 1) // threads
# Cap the grid size (the kernel's stride loop picks up any remaining
# elements) and keep at least one block so the launch stays valid when n == 0.
blocks = max(1, min(blocks, 65535))

In [31]:
# The kernel declares nvars / nx / ninterps as 32-bit `int`. CuPy passes plain
# Python ints to raw kernels as 64-bit `long long` and performs no argument
# validation, so cast the scalars to int32 to match the kernel signature.
# The Python float 1e-15 is passed as a `double`, matching `const double eps`.
compute_theta_kernel(
    (blocks,),
    (threads,),
    (w, wj, M, m, out, cp.int32(nvars), cp.int32(nx), cp.int32(ninterps), 1e-15),
)
out

array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])