In [2]:
import cupy as cp

from superfv.fv import interpolate_cell_centers
from superfv.fv_cuda import interpolate_central_quantity
from superfv.slope_limiting import compute_dmp_kernel_helper
from superfv.slope_limiting.shock_detection import (
    compute_shock_detector,
    compute_shocks_kernel_helper,
)
from superfv.slope_limiting.zhang_and_shu import (
    compute_theta,
    compute_theta_kernel_helper,
    ZhangShuConfig,
)

In [4]:
# COMPUTE THETA FOR ZHANG-SHU LIMITER
# Benchmark inputs. Every array shares the leading (5, N, N, 1) shape; some carry
# one extra trailing axis of length 1, 3 or 10.
N = 1024
cell_shape = (5, N, N, 1)

u = cp.ones(cell_shape)
ucc = cp.ones(cell_shape + (1,))
ux = cp.ones(cell_shape + (3,))
uy = cp.ones(cell_shape + (3,))
theta = cp.empty(cell_shape + (1,))  # uninitialised; passed as `out=` below
M, m, Mj, mj = (cp.ones(cell_shape) for _ in range(4))
buffer = cp.empty(cell_shape + (10,))  # uninitialised scratch array

# All optional switches are off except corner inclusion.
zs_flags = {
    "shock_detection": False,
    "smooth_extrema_detection": False,
    "check_uniformity": False,
    "physical_admissibility_detection": False,
    "include_corners": True,
}
zs_config = ZhangShuConfig(**zs_flags)

In [5]:
# uses cp.ElementwiseKernel
# Single untimed call ahead of the %%timeit cell (presumably so that one-off
# kernel compilation is not part of the measurement). The result is discarded.
theta_arrays = dict(out=theta, M=M, m=m, Mj=Mj, mj=mj, buffer=buffer)
_ = compute_theta(cp, u, ucc, ux, uy, None, config=zs_config, **theta_arrays)

In [6]:
%%timeit
_ = compute_theta(
    cp,
    u,
    ucc,
    ux,
    uy,
    None,
    out=theta,
    M=M,
    m=m,
    Mj=Mj,
    mj=mj,
    buffer=buffer,
    config=zs_config,
)

49.3 ms ± 1.64 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [7]:
# Flattened counterparts of the arrays above: the two N-sized axes are merged
# into a single axis of length N * N.
flat_shape = (5, N * N)

u_flat = cp.ones(flat_shape)
uj_flat = cp.ones(flat_shape + (7,))
M_flat, m_flat, Mj_flat, mj_flat, theta_flat = (
    cp.ones(flat_shape) for _ in range(5)
)

In [8]:
# uses cp.RawKernel
# One untimed pass over both helpers before the %%timeit cell; results are
# discarded.
dmp_args = (u, M, m, True)
theta_args = (u_flat, uj_flat, M_flat, m_flat, Mj_flat, mj_flat, theta_flat, 1e-16)
_ = compute_dmp_kernel_helper(*dmp_args)
_ = compute_theta_kernel_helper(*theta_args)

In [9]:
%%timeit
_ = compute_dmp_kernel_helper(u, M, m, True)
_ = compute_theta_kernel_helper(
    u_flat, uj_flat, M_flat, m_flat, Mj_flat, mj_flat, theta_flat, 1e-16
)

739 μs ± 4.13 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [25]:
# Check whether the full `buffer` array is laid out C-contiguously.
buffer.flags.c_contiguous

True

In [26]:
# Same check on the view that selects index 0 of the last axis; the recorded
# output shows that this view is not C-contiguous.
buffer[..., 0].flags.c_contiguous

False

In [11]:
# COMPUTE SHOCK DETECTOR ETA
eta = cp.empty((5, N, N, 1, 3))
has_shock = cp.zeros((1, N, N, 1), dtype=cp.int32)

In [12]:
# uses cp.ElementwiseKernel
# Single untimed call ahead of the %%timeit cell; `u` is passed for both of
# the first two array arguments and only index 0 of eta's last axis is used.
shock_axes = ("x", "y")
eta_first = eta[..., 0]
_ = compute_shock_detector(
    cp,
    u,
    u,
    shock_axes,
    0.025,
    out=has_shock,
    eta=eta_first,
    buffer=buffer,
)

In [13]:
%%timeit
_ = compute_shock_detector(
    cp, u, u, ("x", "y"), 0.025, out=has_shock, eta=eta[..., 0], buffer=buffer
)

34.3 ms ± 6.16 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
# uses cp.RawKernel
# Single untimed call ahead of the %%timeit cell. Unlike the ElementwiseKernel
# call, the whole `eta` array is passed here rather than the eta[..., 0] view.
shock_outputs = {"eta": eta, "has_shock": has_shock}
_ = compute_shocks_kernel_helper(u, u, 0.025, 1e-16, **shock_outputs)

In [15]:
%%timeit
_ = compute_shocks_kernel_helper(u, u, 0.025, 1e-16, eta=eta, has_shock=has_shock)

304 μs ± 70.8 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [None]:
# INTERPOLATE CELL CENTERS
# `p` is forwarded to both interpolation routines below; `uu` is an extra
# ones-filled array handed to the RawKernel variant.
uu_shape = (5, N, N, 1)

p = 7
uu = cp.ones(uu_shape)

In [17]:
# uses cp.ElementwiseKernel
# Single untimed call ahead of the %%timeit cell; the result is written to `ucc`.
interp_axes = ("x", "y")
_ = interpolate_cell_centers(
    cp,
    u,
    interp_axes,
    p,
    out=ucc,
    buffer=buffer,
)

In [18]:
%%timeit
_ = interpolate_cell_centers(cp, u, ("x", "y"), p, out=ucc, buffer=buffer)

651 μs ± 175 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [22]:
# uses cp.RawKernel
# Single untimed call ahead of the %%timeit cell; the second argument is the
# ucc[..., 0] view and `uu` is supplied by keyword.
ucc_first = ucc[..., 0]
central_axes = ("x", "y")
_ = interpolate_central_quantity(u, ucc_first, 0, p, central_axes, uu=uu)

In [23]:
%%timeit
_ = interpolate_central_quantity(
    u, ucc[..., 0], 0, p, ("x", "y"), uu=uu
)  # Using cp.RawKernel

299 μs ± 400 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
