In [1]:
import random
from random import randrange

import matplotlib.pyplot as plt
import numpy as np
from skimage.feature import local_binary_pattern

In [2]:
DATA_DIR = 'data/'
DATA_FILENAME = 'data_naive1d_subset20k'

# Read the training matrix out of the .npz archive. The context manager
# closes the archive even when the 'X' entry is missing or the read fails,
# which the previous explicit close() call did not guarantee.
with np.load(DATA_DIR + DATA_FILENAME + '_train' + '.npz') as data_train:
    X_train = data_train['X']
X_train.shape, X_train.dtype

((20000, 9216), dtype('uint8'))

In [3]:
# Extract (<i>, <j>)th cell as 1-D vector
def extract_cell(v, i, j, img_dim=96, cell_dim=32):
    """Return cell (i, j) of a flattened square image as a 1-D vector.

    The image is viewed as an img_dim x img_dim grid of pixels that is
    tiled by cell_dim x cell_dim cells; cell (i, j) is the tile in cell-row
    i and cell-column j.

    Parameters
    ----------
    v : array-like with img_dim * img_dim elements
        The flattened (row-major) image.
    i, j : int
        Cell row and cell column, counted from 0.
    img_dim : int, optional
        Side length of the square image in pixels (default 96).
    cell_dim : int, optional
        Side length of one square cell in pixels (default 32).

    Returns
    -------
    numpy.ndarray
        The cell's pixels, flattened row by row (cell_dim * cell_dim values
        when the cell lies fully inside the image).

    Raises
    ------
    ValueError
        If v does not contain exactly img_dim * img_dim elements.
    """
    img = np.asarray(v).reshape(img_dim, img_dim)
    rows = slice(cell_dim * i, cell_dim * (i + 1))
    cols = slice(cell_dim * j, cell_dim * (j + 1))
    return img[rows, cols].ravel()

# Split a flattened 96x96 image <v> (9216 values) into its 3x3 grid of
# 32x32 cells. Each cell is un-rolled into a vector of 1024 values and
# the nine vectors are stacked in row-major cell order.
def vec_to_cells(v):
    """Return the nine un-rolled cells of <v> as one 2-D array (one row per cell)."""
    grid = [(row, col) for row in range(3) for col in range(3)]
    return np.array([extract_cell(v, row, col) for row, col in grid])

In [4]:
# Stack the per-image cell matrices on top of each other, so every image
# contributes nine consecutive rows (one per cell)
X_train9 = np.concatenate(
    [vec_to_cells(image) for image in X_train],
    axis=0,
)
X_train9.shape, X_train9.dtype

((180000, 1024), dtype('uint8'))

In [5]:
# Return smaller of either Kullback–Leibler Divergences
def kld(p, q):
    """Return min(KL(p || q), KL(q || p)), measured in bits.

    Both inputs are rescaled to sum to 1 before the divergences are
    evaluated, so raw histogram counts may be passed directly. Without that
    rescaling the sums are not true KL divergences and can come out
    negative whenever the two inputs have different totals.

    Parameters
    ----------
    p, q : array-like of equal length
        Non-normalised weights (e.g. histogram counts). Every entry must be
        strictly positive: a zero entry leads to a division by zero or
        log2(0) and therefore to inf/nan in the result.

    Returns
    -------
    float
        The smaller of the two directed divergences; 0 when p and q are
        proportional to each other.
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    # Turn counts into probability distributions.
    p = p / p.sum()
    q = q / q.sum()
    p_q = np.sum(p * np.log2(p / q))
    q_p = np.sum(q * np.log2(q / p))
    return min(p_q, q_p)

In [6]:
# Local Binary Pattern Histogram
def lbp_histogram(v, dim=32):
    """Return a smoothed 10-bin histogram of the LBP codes of a flattened image.

    <v> is reshaped to dim x dim, passed to local_binary_pattern with
    arguments (8, 1, 'uniform'), and the resulting codes are counted in
    unit-width bins starting at 0.
    """
    codes = local_binary_pattern(v.reshape(dim, dim), 8, 1, 'uniform')
    counts = np.histogram(codes, bins=np.arange(59 + 1))[0]
    head, tail = counts[:10], counts[10:]
    # the landmark feature can only be shortened to ten bins if nothing
    # ever lands beyond them
    assert tail.sum() == 0
    # every bin is bumped by one so kld() never divides by zero
    return head + 1

In [7]:
# Reduce <X> to <target_size> by eliminating similar features,
# using <distance> as a metric, via an iterative, stochastic
# sampling of <num_subsample> pairs and eliminating up to
# <num_eliminate> features at each iteration
def reduce(X, target_size, num_subsample, num_eliminate, distance, rng=None):
    """Shrink X to exactly target_size rows by dropping near-duplicate rows.

    Each iteration samples random pairs of rows and keeps a running list of
    record-low distances (every entry is closer than the one before it).
    The first row of each of the last num_eliminate record pairs is then
    deleted. Duplicate indices are deleted once, so an iteration may remove
    fewer than num_eliminate rows.

    Parameters
    ----------
    X : array-like, indexable by row
        Rows to thin out. Not modified; a reduced copy is returned.
    target_size : int
        Number of rows to keep (non-negative).
    num_subsample : int
        Number of random pairs examined per iteration.
    num_eliminate : int
        Maximum number of rows removed per iteration (at least 1).
    distance : callable
        distance(row_a, row_b) -> comparable value; smaller means more
        similar.
    rng : object with a randrange(n) method, optional
        Source of randomness, e.g. random.Random(seed). Defaults to the
        global generator of the random module.

    Returns
    -------
    numpy.ndarray (or X itself when it is already small enough)
        The remaining rows.

    Raises
    ------
    ValueError
        If target_size is negative or num_eliminate is smaller than 1.

    Notes
    -----
    Prints the current row count every 200 iterations and a final newline.
    """
    if target_size < 0:
        raise ValueError('target_size must be non-negative')
    if num_eliminate < 1:
        raise ValueError('num_eliminate must be at least 1')

    pick = randrange if rng is None else rng.randrange
    report_every = 200
    countdown = report_every
    while (len_X := len(X)) > target_size:
        if len_X < 2:
            # Only reachable with target_size == 0: a single row has no
            # partner to be compared with, so it is simply dropped.
            X = np.delete(X, 0, axis=0)
            break

        # Seed the record list with a genuine pair of two different rows.
        # Recording the sampled index (not a fixed 0) and excluding
        # self-pairs (distance 0, unbeatable) prevents row 0 from being
        # deleted arbitrarily.
        x1 = pick(len_X)
        x2 = pick(len_X - 1)
        if x2 >= x1:
            x2 += 1
        closest = [(x1, distance(X[x1], X[x2]))]

        for _ in range(num_subsample):
            x1, x2 = pick(len_X), pick(len_X)
            if x1 == x2:
                continue
            dist = distance(X[x1], X[x2])
            if dist < closest[-1][1]:
                closest.append((x1, dist))

        # Never delete more rows than are still above the target, so the
        # result has exactly target_size rows.
        quota = min(num_eliminate, len_X - target_size)
        eliminate = [idx for idx, _ in closest[-quota:]]
        X = np.delete(X, eliminate, axis=0)

        countdown -= 1
        if countdown == 0:
            print(len_X, end=' ')
            countdown = report_every
    print()
    return X

In [8]:
# Apply lbp_histogram to every row (axis 1) of X_train9
train_hists = np.apply_along_axis(func1d=lbp_histogram, axis=1, arr=X_train9)
train_hists.shape

(180000, 10)

In [9]:
%time landmarks = reduce(train_hists, 512, 2000, 5, kld)
%time landmarks = reduce(landmarks, 256, 2000, 1, kld)
landmarks.shape, landmarks.dtype

179025 178050 177062 176075 175087 174104 173117 172136 171148 170164 169182 168195 167213 166225 165245 164263 163287 162297 161319 160353 159374 158399 157410 156423 155459 154480 153508 152521 151543 150572 149588 148611 147625 146639 145656 144677 143700 142724 141737 140755 139772 138793 137814 136834 135849 134862 133882 132896 131906 130916 129941 128964 127981 126997 126002 125020 124041 123056 122080 121096 120124 119161 118176 117189 116210 115225 114237 113255 112281 111295 110312 109344 108360 107376 106389 105411 104420 103432 102445 101462 100474 99497 98510 97519 96532 95546 94558 93576 92597 91616 90632 89643 88660 87669 86689 85702 84724 83747 82762 81774 80796 79814 78824 77839 76854 75866 74875 73887 72896 71914 70931 69957 68975 68000 67008 66025 65041 64057 63073 62093 61110 60128 59147 58168 57187 56205 55217 54242 53266 52286 51307 50318 49334 48344 47358 46381 45396 44406 43432 42454 41475 40496 39518 38545 37561 36578 35599 34613 33627 32636 31662 30690 29713 2

((256, 10), dtype('int64'))

In [10]:
# Smallest, largest and average bin value over all landmark histograms
np.min(landmarks), np.max(landmarks), np.mean(landmarks)

(1, 901, 103.4)

In [11]:
# Show the landmark histograms (one row per landmark)
landmarks

array([[ 37,  53, 102, ...,  67,  44,  72],
       [  2,  20,   6, ...,  79, 359,  36],
       [  6,  14,  29, ...,  29, 144,  18],
       ...,
       [  2,   7,  31, ...,  16,  27,   8],
       [ 73,  87,  53, ..., 103, 291, 131],
       [ 22,  40,  23, ...,  60, 524,  45]])

In [12]:
# Store the landmark histograms under the key 'landmarks'
landmarks_path = DATA_DIR + DATA_FILENAME + '_landmarks'
np.savez(landmarks_path, landmarks=landmarks)