# SHP

> Spatially Homogeneous Pixels Selection

In [None]:
#| default_exp shp

In [None]:
#| hide
from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#| hide
# import for test
from cupyx.profiler import benchmark
from scipy import stats
import numpy as np

In [None]:
#| export
import cupy as cp
#from cupy._sorting.search import _searchsorted_kernel as searchsorted

## Kolmogorov-Smirnov (KS) two-sample test

In [None]:
#| export
def searchsorted(ref:cp.ndarray, # multi dimensional array, sorted ascending along the innermost dimension
                 sec:cp.ndarray, # multi dimensional array whose outer dimensions match `ref`
                )->cp.ndarray: # multi dimensional index array with the shape of `sec`, dtype: cp.int
    '''Batched `cp.searchsorted(..., side='right')` along the innermost dimension.

    Every innermost vector of `sec` is searched in the matching innermost
    vector of `ref`. All vectors are shifted by a per-vector offset so that the
    flattened `ref` is globally sorted, a single flat `cp.searchsorted` is run,
    and the flat positions are converted back to per-vector indices.
    '''
    innerest_dim = ref.shape[-1]
    outer_dim = ref.shape[:-1]
    # The stride between consecutive vectors must be larger than the span of
    # *all* values in both arrays; otherwise a shifted value of `sec` could fall
    # into a neighbouring vector of `ref` (e.g. when `ref` and `sec` cover
    # disjoint value ranges) and the returned index would leave [0, innerest_dim].
    lowest = cp.minimum(ref.min(), sec.min())
    highest = cp.maximum(ref.max(), sec.max())
    max_num = highest - lowest + 1
    # running index of every innermost vector, broadcastable against ref and sec
    multiple = cp.expand_dims(cp.arange(ref.size//innerest_dim,dtype=int).reshape(outer_dim),axis=-1)
    added = max_num*multiple
    # NOTE(review): the offsets are added in the promoted dtype of the data, so very
    # large values or very many vectors may exceed its exact range - confirm for your data.
    p = cp.searchsorted((ref+added).ravel(),(sec+added).ravel(),side='right').reshape(sec.shape)
    # remove the number of `ref` elements that belong to the preceding vectors
    return p - innerest_dim*multiple

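Find the indices into the innermost dimension of `ref` such that, if the corresponding values in `sec` were inserted before the indices, the order of `ref` would be preserved (ties are inserted after equal elements, i.e. `side='right'`). `ref` must be sorted along its innermost dimension, and `ref` and `sec` must have the same shape except for the innermost dimension (`ref.shape[:-1]` must equal `sec.shape[:-1]`). For example: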

In [None]:
# 1-D case: a single sorted vector searched with the `searchsorted` defined above
ref = cp.arange(20, dtype=cp.float32)
sec = cp.arange(5.5,25.5, dtype=cp.float32)
out = searchsorted(ref,sec)

In [None]:
# reference stack: four sorted vectors of length 5, shape (2,2,5)
ref = cp.arange(20, dtype=cp.float32).reshape(2,2,5)
# query stack: each (2,1,5) vector is repeated along the last two axes -> shape (2,2,10)
sec_base = cp.arange(5.5,15.5, dtype=cp.float32).reshape(2,1,5)
sec = cp.tile(sec_base,(1,2,2))

In [None]:
ref

array([[[ 0.,  1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.,  9.]],

       [[10., 11., 12., 13., 14.],
        [15., 16., 17., 18., 19.]]], dtype=float32)

In [None]:
sec

array([[[ 5.5,  6.5,  7.5,  8.5,  9.5,  5.5,  6.5,  7.5,  8.5,  9.5],
        [ 5.5,  6.5,  7.5,  8.5,  9.5,  5.5,  6.5,  7.5,  8.5,  9.5]],

       [[10.5, 11.5, 12.5, 13.5, 14.5, 10.5, 11.5, 12.5, 13.5, 14.5],
        [10.5, 11.5, 12.5, 13.5, 14.5, 10.5, 11.5, 12.5, 13.5, 14.5]]],
      dtype=float32)

In [None]:
# batched search: each innermost vector of `sec` is looked up in the matching vector of `ref`
out = searchsorted(ref,sec)
out

array([[[5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
        [1, 2, 3, 4, 5, 1, 2, 3, 4, 5]],

       [[1, 2, 3, 4, 5, 1, 2, 3, 4, 5],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]])

In [None]:
#| hidden
# test
# the first five queries of vector (0,1) fall after 1..5 elements of the reference vector
expected = cp.array([1, 2, 3, 4, 5],dtype=cp.int64)
assert (out[0,1,:5] == expected).all()

In [None]:
#| hidden
# test the speed of cp.max()
ref = cp.arange(20000, dtype=cp.float32).reshape(200,100)
# time the reduction over the whole array, then over the last column only
for candidate in (ref, ref[:,-1]):
    print(benchmark(cp.max,(candidate,), n_repeat=1000))

amax                :    CPU:   18.950 us   +/- 1.750 (min:   18.280 / max:   71.205) us     GPU-0:   23.514 us   +/- 1.772 (min:   22.528 / max:   74.752) us
amax                :    CPU:   17.984 us   +/- 0.606 (min:   17.390 / max:   26.476) us     GPU-0:   21.877 us   +/- 0.895 (min:   20.480 / max:   36.864) us


In [None]:
#| hidden
# test the speed between cupy and torch
# def torch_searchsorted(ref,sec):
#     _ref = torch.as_tensor(ref)
#     _sec = torch.as_tensor(sec)
#     indices = torch.searchsorted(_ref,_sec,side='right')
#     indices = cp.asarray(indices)
#     return indices
# width = 100
# nlines = 10000
# ref = cp.arange(-1.1,-1.1+width*nlines, dtype=cp.float32).reshape(nlines,width)
# sec = cp.arange(-1.5,-1.5+width*nlines, dtype=cp.float32).reshape(nlines,width)
# print(benchmark(torch_searchsorted,(ref, sec), n_repeat=100))
# print(benchmark(searchsorted,(ref, sec), n_repeat=100))

In [None]:
#| export
def ecdf_distance(data1:cp.ndarray, # data set 1, sorted ascending along the innermost dimension
                  data2:cp.ndarray, # data set 2, sorted ascending along the innermost dimension
                  n:int=0, # ignored, the sample counts are always taken from the data; kept for backward compatibility
                 )->cp.ndarray: # maximum ECDF difference, one value per innermost vector pair
    '''Maximum absolute difference between the empirical CDFs of `data1` and `data2`.

    Both ECDFs are evaluated at every sample of both data sets along the
    innermost dimension, and the largest absolute difference is returned
    (the two-sample Kolmogorov-Smirnov statistic).
    '''
    n1 = data1.shape[-1]
    n2 = data2.shape[-1]
    # points at which both ECDFs are evaluated
    data_all = cp.concatenate((data1,data2),axis=-1)
    # each ECDF is normalised by its own sample count so that it ends at 1
    cdf1 = searchsorted(data1,data_all)/n1
    cdf2 = searchsorted(data2,data_all)/n2
    return cp.max(cp.abs(cdf1 - cdf2),axis=-1)

In [None]:
# two stacks of sorted samples; `sec` is `ref` shifted down by one
ref = cp.arange(20, dtype=cp.float32).reshape(2,2,5)
sec = ref - 1

In [None]:
ref

array([[[ 0.,  1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.,  9.]],

       [[10., 11., 12., 13., 14.],
        [15., 16., 17., 18., 19.]]], dtype=float32)

In [None]:
sec

array([[[-1.,  0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.,  8.]],

       [[ 9., 10., 11., 12., 13.],
        [14., 15., 16., 17., 18.]]], dtype=float32)

In [None]:
ecdf_distance(ref,sec)

array([[0.2, 0.2],
       [0.2, 0.2]])

In [None]:
#| hide
# test
# compare the GPU statistic with scipy's two-sample KS statistic on random sorted samples
rng = np.random.default_rng()
sample1 = np.sort(stats.uniform.rvs(size=5, random_state=rng))
sample2 = np.sort(stats.norm.rvs(size=5, random_state=rng))
dist_scipy = stats.ks_2samp(sample1, sample2).statistic
dist_cupy = ecdf_distance(cp.asarray(sample1),cp.asarray(sample2))
assert abs(dist_scipy-float(dist_cupy))<1e-7

In [None]:
#| export
def ks_2sam(data1:cp.ndarray, # samples to be tested, each innermost vector is samples from one distribution
            data2:cp.ndarray, # the second stack of samples
            alpha:float=0.9, # significance value, the bigger the stricter in selecting SHP, between 0 and 1
           ) -> cp.ndarray: # stack of bool, if SHP or not
    '''GPU version of ks 2 sample test

    The null hypothesis is that both samples come from the same distribution.
    A pair is flagged as SHP when its KS distance is below the `1-alpha`
    quantile of the null distribution of the statistic.
    '''
    from scipy.stats import kstwo
    distance = ecdf_distance(data1,data2)
    n1 = data1.shape[-1]
    n2 = data2.shape[-1]
    # The two-sample statistic asymptotically follows the one-sample two-sided
    # distribution with effective sample size n1*n2/(n1+n2), not n1.
    # `kstwo` only accepts integer n >= 1, hence the rounding and the lower bound.
    effective_n = max(1, round(n1*n2/(n1+n2)))
    critical_distance = kstwo.ppf(1-alpha,effective_n)
    return distance < critical_distance

In [None]:
# TODO: add a test for `ks_2sam` here

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()