# Generate Landmarks on Histopathology dataset LBP features

In [1]:
from helper import *
from random import randrange
from skimage.feature import local_binary_pattern

In [2]:
SUBSET = '20k'
DATA_DIR = 'data/'
NUM_LANDMARKS = 64

In [3]:
DATA_FILENAME = 'lbp_subset'
(X_train, y_train), (_, _) = \
    load_npz(DATA_DIR + DATA_FILENAME + SUBSET + '.npz')

((20000, 10), dtype('int16')) ((20000,), dtype('uint8'))
((10000, 10), dtype('int16')) ((10000,), dtype('uint8'))


In [4]:
# Avoid Divide-by-Zero in KLD
X_train += 1

In [5]:
# Return smaller of either Kullback–Leibler Divergences
def kld(p, q):
    p_q = np.sum(p * np.log2(p/q))
    q_p = np.sum(q * np.log2(q/p))
    if p_q < 0 or q_p < 0:
        print(p); print(q); print(sum)
        raise Exception
    return min(p_q, q_p)

In [6]:
# Reduce <X> to <target_size> by eliminating similar features,
# using <distance> as a metric, via an iterative, stochastic
# sampling of <num_subsamples> and eliminating <num_eliminate>
# features at each iteration
def reduce(X, target_size, num_subsample, num_eliminate, distance):
    modulo = 200
    while (len_X := len(X)) > target_size:
        x1, x2 = (randrange(len_X), randrange(len_X))
        distances = [(0, distance(X[x1], X[x2]))]
        for _ in range(num_subsample):
            x1, x2 = (randrange(len_X), randrange(len_X))
            if x1 == x2: continue
            dist = distance(X[x1], X[x2])
            if dist < distances[-1][1]:
                distances.append((x1, dist))
        eliminate = [tup[0] for tup in distances[-num_eliminate:]]
        X = np.delete(X, eliminate, axis=0)
        modulo -= 1
        if (modulo == 0):
            print(len_X, end=' ')
            modulo = 200
    print()
    return X

### Positive-and-Negative Landmarks

In [7]:
landmarks = reduce(X_train, 512, 2000, 5, kld)
landmarks = reduce(landmarks, NUM_LANDMARKS, 2000, 1, kld)
landmarks.shape, landmarks.dtype

19026 18051 17074 16098 15115 14127 13154 12174 11183 10200 9216 8235 7243 6271 5293 4307 3329 2341 1351 
309 109 


((64, 10), dtype('int16'))

In [8]:
landmarks.min(), landmarks.max(), landmarks.mean()

(1, 7977, 922.600)

In [9]:
landmarks[:5]

array([[  96,  283,  192,  964, 1884, 2028,  561,  548, 2269,  401],
       [ 670,  584,  209,  342,  400,  941,  336,  653, 4010, 1081],
       [  14,  100,   26,  324,  957, 3219,  173,  173, 4035,  205],
       [  23,  218,   44,  616, 1075, 3698,  331,  401, 2423,  397],
       [  45,  219,  133, 1058, 2830, 2767,  664,  467,  743,  300]],
      dtype=int16)

### Positive/Negative-Only Landmarks

In [10]:
TRAIN_SAMPLES = len(X_train)
train_pos = []
train_neg = []
for x, y in zip(X_train, y_train):
    if y:
        train_pos.append(x)
    else:
        train_neg.append(x)
num_pos = len(train_pos)
num_neg = len(train_neg)
num_pos, num_neg

(8055, 11945)

In [11]:
X_pos = np.array(train_pos)
X_neg = np.array(train_neg)
(X_pos.shape, X_pos.dtype), (X_neg.shape, X_neg.dtype)

(((8055, 10), dtype('int16')), ((11945, 10), dtype('int16')))

In [12]:
landmarks_pos = reduce(X_pos, 512, 2000, 5, kld)
landmarks_pos = reduce(landmarks_pos, NUM_LANDMARKS, 2000, 1, kld)
landmarks_pos.shape, landmarks_pos.dtype

7084 6115 5135 4151 3173 2197 1207 
309 109 


((64, 10), dtype('int16'))

In [13]:
landmarks_pos.min(), landmarks_pos.max(), landmarks_pos.mean()

(1, 7217, 922.600)

In [14]:
landmarks_neg = reduce(X_neg, 512, 2000, 5, kld)
landmarks_neg = reduce(landmarks_neg, NUM_LANDMARKS, 2000, 1, kld)
landmarks_neg.shape, landmarks_neg.dtype

10968 9991 9013 8027 7041 6053 5072 4087 3101 2119 1143 
309 109 


((64, 10), dtype('int16'))

In [15]:
landmarks_neg.min(), landmarks_neg.max(), landmarks_neg.mean()

(1, 7923, 922.600)

In [16]:
LAND_FILENAME = 'landmarks_subset' + SUBSET
np.savez(DATA_DIR + LAND_FILENAME,
         both=landmarks, pos=landmarks_pos, neg=landmarks_neg)