In [7]:
import cupy as cp

def increment_by_one(a):
    """Return ``a + 1`` without modifying ``a``.

    The result is built with the binary ``+`` operator instead of ``+=``.
    For objects whose ``+=`` updates their own storage (arrays, for
    example), ``+=`` would change the caller's object and hand that same
    object back; ``+`` always produces a new value and leaves the
    argument as it was.

    Parameters
    ----------
    a : any object supporting ``a + 1``
        Scalar or array-like value to increment.

    Returns
    -------
    A new object equal to ``a + 1``.
    """
    return a + 1

def increment_by_many(a, N):
    """Apply ``increment_by_one`` to ``a`` repeatedly, ``N`` times in a row.

    Parameters
    ----------
    a : value accepted by ``increment_by_one``
        Starting value.
    N : int
        Number of applications. ``range(N)`` is empty for ``N <= 0``, in
        which case ``a`` is returned as given.

    Returns
    -------
    The value produced by the last ``increment_by_one`` call (or ``a``
    itself when no call is made).
    """
    result = a
    for _step in range(N):
        # Feed each result back in as the next input.
        result = increment_by_one(result)
    return result

# Scalar starting value passed to both helpers; the tuple on the last line is
# the value this cell displays.
base = cp.float64(1.0)
(increment_by_one(base), increment_by_many(base, 10))

(2.0, 11.0)

In [2]:
# Re-display the starting value to check whether the helper calls modified it.
# NOTE(review): this cell's execution count (In [2]) is lower than that of the
# cell defining the helpers (In [7]), so the recorded output may be stale --
# restart the kernel and run top to bottom to confirm.
base

1.0

In [19]:
import cupy as cp

# CUDA C++ source passed to cp.RawModule below. Inside a single extern "C"
# block it declares:
#   * increment_by_one_core / increment_by_many_core -- __device__ helpers that
#     return a + 1.0 and apply that step N times, respectively;
#   * increment_by_one_kernel / increment_by_many_kernel -- __global__ entry
#     points that read a[0], run the matching helper, and store back to a[0].
# Only element 0 of the buffer is read or written, and the kernels contain no
# thread-index guard; the Python wrappers in this cell launch them with a
# (1,) grid and a (1,) block.
cuda_src = r"""
extern "C" {

__device__ __forceinline__
double increment_by_one_core(double a) {
    return a + 1.0;
}

__device__ __forceinline__
double increment_by_many_core(double a, int N) {
    for (int i = 0; i < N; ++i) {
        a = increment_by_one_core(a);
    }
    return a;
}

__global__ void increment_by_one_kernel(double* a) {
    a[0] = increment_by_one_core(a[0]);
}

__global__ void increment_by_many_kernel(double* a, int N) {
    a[0] = increment_by_many_core(a[0], N);
}

} // extern "C"
"""

# Create one RawModule for the CUDA source so both kernels share it.
# NOTE(review): depending on the CuPy version, the actual compilation may be
# deferred until the first get_function() call -- confirm if timing matters.
module = cp.RawModule(
    code=cuda_src,
    options=("-std=c++11",),
    name_expressions=["increment_by_one_kernel", "increment_by_many_kernel"],
)

# Launchable kernel handles, looked up by the names registered above.
increment_by_one_kernel = module.get_function("increment_by_one_kernel")
increment_by_many_kernel = module.get_function("increment_by_many_kernel")


# ---- Python helper wrappers ----

def increment_by_one_gpu(x):
    """Return a new float64 device array equal to ``x`` with element 0 incremented by 1.

    The input is always copied before the kernel runs. ``cp.asarray`` would
    hand back the caller's own buffer when ``x`` is already a float64 device
    array, and the kernel would then overwrite the caller's data, while any
    other input type would get a fresh array. Copying makes the behaviour
    the same for every input: ``x`` is never modified.

    Parameters
    ----------
    x : scalar or array-like accepted by ``cp.array``
        Value to increment. The kernel only reads and writes element 0, so
        any further elements are returned unchanged.

    Returns
    -------
    cupy.ndarray
        Newly allocated float64 array holding the result.

    Raises
    ------
    ValueError
        If ``x`` has no elements (the kernel would otherwise access ``a[0]``
        outside the buffer).
    """
    # cp.array copies by default, so the kernel never writes into x's storage.
    a = cp.array(x, dtype=cp.float64)
    if a.size == 0:
        raise ValueError("increment_by_one_gpu requires at least one element")
    # Single block, single thread: the kernel touches a[0] only.
    increment_by_one_kernel((1,), (1,), (a,))
    return a

def increment_by_many_gpu(x, N):
    """Return a new float64 device array equal to ``x`` with element 0 incremented ``N`` times.

    The input is always copied before the kernel runs. ``cp.asarray`` would
    hand back the caller's own buffer when ``x`` is already a float64 device
    array, and the kernel would then overwrite the caller's data, while any
    other input type would get a fresh array. Copying makes the behaviour
    the same for every input: ``x`` is never modified.

    Parameters
    ----------
    x : scalar or array-like accepted by ``cp.array``
        Starting value. The kernel only reads and writes element 0, so any
        further elements are returned unchanged.
    N : int
        Number of increments; passed to the kernel as a 32-bit integer. The
        kernel's loop does not execute for ``N <= 0``.

    Returns
    -------
    cupy.ndarray
        Newly allocated float64 array holding the result.

    Raises
    ------
    ValueError
        If ``x`` has no elements (the kernel would otherwise access ``a[0]``
        outside the buffer).
    """
    # cp.array copies by default, so the kernel never writes into x's storage.
    a = cp.array(x, dtype=cp.float64)
    if a.size == 0:
        raise ValueError("increment_by_many_gpu requires at least one element")
    # Single block, single thread; N is narrowed to match the kernel's `int N`.
    increment_by_many_kernel((1,), (1,), (a, cp.int32(N)))
    return a


# Test
if __name__ == "__main__":
    # Scalar starting value; both wrapper calls below receive this same object.
    base = cp.float64(1.0)

    # .item() pulls the single result back to the host as a Python float.
    print(f"one: {increment_by_one_gpu(base).item()}")        # 2.0
    print(f"many: {increment_by_many_gpu(base, 10).item()}")  # 11.0

one: 2.0
many: 11.0
