# speed test for `searchsorted`

In [None]:
import cupy as cp
import torch
from cupyx.profiler import benchmark
import torchsearchsorted

In [None]:
def torch_searchsorted(ref:cp.ndarray, # N-D cupy array whose innermost dimension holds a monotonically increasing sequence
                       sec:cp.ndarray, # N-D cupy array holding the value(s) to search for
                      ) -> cp.ndarray: # Insertion points, with the same shape as `sec`
    '''Thin wrapper that applies `torch.searchsorted` (with `side='right'`) to cupy arrays'''
    # Hand both cupy arrays to torch, search there, then wrap the result back as a cupy array.
    sorted_seq = torch.as_tensor(ref)
    values = torch.as_tensor(sec)
    return cp.asarray(torch.searchsorted(sorted_seq, values, side='right'))

In [None]:
def cupy_searchsorted(a,b):
    '''Row-wise `searchsorted` (`side='right'`) done with one flat `cp.searchsorted` call.

    Every row of `a` and the matching row of `b` are shifted by a
    row-dependent offset so that different rows occupy disjoint value
    ranges. The shifted arrays are flattened, searched once, and the
    per-row base index (`n` times the row number) is subtracted again.

    a : 2-D cupy array of shape (m, n); each row is assumed to be sorted
        in increasing order.
    b : 2-D cupy array of values to look up; it is added to an (m, 1)
        offset column, so it must broadcast against m rows.

    Returns an array with m rows holding, for each value of `b`, its
    insertion index within the corresponding row of `a`.
    '''
    m,n = a.shape
    # The per-row offset has to exceed the span of `a` and `b` taken
    # together. Using only each array's own spread is not enough: when the
    # two arrays cover different value ranges, a shifted query can land in
    # the band of a neighbouring row and produce a wrong index.
    max_num = cp.maximum(a.max(), b.max()) - cp.minimum(a.min(), b.min()) + 1
    rows = cp.arange(m)[:,None]
    r = max_num*rows
    p = cp.searchsorted( (a+r).ravel(), (b+r).ravel(), side='right' ).reshape(m,-1)
    # Remove the flat-index contribution of all preceding rows.
    return p - n*rows

In [None]:
def stream_searchsorted(a,b):
    '''Row-wise `searchsorted` (`side='right'`) that issues one `cp.searchsorted` per row,
    each inside the context of its own non-blocking CUDA stream.'''
    n_rows, _ = a.shape
    result = cp.empty_like(b, dtype=cp.int64)
    # One stream per row of `a`.
    row_streams = [cp.cuda.stream.Stream(non_blocking=True) for _ in range(n_rows)]
    device = cp.cuda.Device()
    for row, row_stream in enumerate(row_streams):
        with row_stream:
            result[row, :] = cp.searchsorted(a[row, :], b[row, :], side='right')
    # Wait for the device before handing the result back.
    device.synchronize()
    return result

In [None]:
# Small 4x5 float32 inputs for a sanity check: `ref` holds 0..19 and
# `sec` holds -1..18, i.e. every query is one less than the matching `ref` entry.
ref = cp.arange(20, dtype=cp.float32).reshape(4,5)
sec = cp.arange(-1,19, dtype=cp.float32).reshape(4,5)

In [None]:
# Run all three implementations on the same inputs so their results can be compared.
torch_out = torch_searchsorted(ref,sec)
cupy_out = cupy_searchsorted(ref,sec)
stream_out = stream_searchsorted(ref,sec)

In [None]:
# Show the result of `torch_searchsorted`.
torch_out

array([[0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4]])

In [None]:
# Show the result of `cupy_searchsorted`.
cupy_out

array([[0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4]])

In [None]:
# Show the result of `stream_searchsorted`.
stream_out

array([[0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4]])

In [None]:
# Benchmark inputs: `nlines` rows of `width` consecutive float32 values each;
# `ref` starts at -1.1 and `sec` starts at -1.5.
width = 100
nlines = 10000
ref = cp.arange(-1.1,-1.1+width*nlines, dtype=cp.float32).reshape(nlines,width)
sec = cp.arange(-1.5,-1.5+width*nlines, dtype=cp.float32).reshape(nlines,width)
# Time the functions that take cupy arrays, 100 repetitions each.
print(benchmark(torch_searchsorted,(ref, sec), n_repeat=100))
print(benchmark(cupy_searchsorted,(ref, sec), n_repeat=100))
# The one-stream-per-row variant is left disabled here.
#print(benchmark(stream_searchsorted,(ref, sec), n_repeat=100))

torch_searchsorted  :    CPU: 8254.008 us   +/-373.205 (min: 8081.082 / max:10619.253) us     GPU-0: 8885.001 us   +/-372.529 (min: 8713.888 / max:11242.240) us
cupy_searchsorted   :    CPU:  359.469 us   +/- 6.071 (min:  348.583 / max:  375.733) us     GPU-0:  380.876 us   +/- 5.246 (min:  372.736 / max:  397.312) us


In [None]:
# Convert the inputs to torch tensors once, outside the timed region, then time
# `torch.searchsorted` and `torchsearchsorted.searchsorted` directly on them.
# NOTE: no `side` argument is passed in these two calls.
_ref = torch.as_tensor(ref)
_sec = torch.as_tensor(sec)
print(benchmark(torch.searchsorted,(_ref, _sec), n_repeat=100))
print(benchmark(torchsearchsorted.searchsorted,(_ref, _sec), n_repeat=100))

searchsorted        :    CPU: 5718.757 us   +/-322.460 (min: 5644.038 / max: 8415.207) us     GPU-0: 5726.684 us   +/-322.698 (min: 5651.424 / max: 8422.784) us
searchsorted        :    CPU:13999.385 us   +/-88.953 (min:13972.137 / max:14832.605) us     GPU-0:14007.932 us   +/-89.373 (min:13980.384 / max:14843.904) us


The best way to do a row-wise `searchsorted` is to write a new CUDA kernel for it.