In [1]:
import matplotlib.pyplot as plt
import numpy as np
from random import randrange
from skimage.feature import local_binary_pattern

In [2]:
DATA_DIR = 'data/'
DATA_FILENAME = 'data_naive1d_subset20k'
data_train = np.load(DATA_DIR + DATA_FILENAME + '_train' + '.npz')
X_train = data_train['X']
data_train.close()
X_train.shape, X_train.dtype

((20000, 9216), dtype('uint8'))

In [3]:
# Extract (<i>, <j>)th cell as 1-D vector
def extract_cell(v, i, j):
    img = v.reshape(96, 96)
    return img[32*i:32*(i+1), 32*j:32*(j+1)].ravel()

# Turn a <v> of size 9216 (96x96) into 9 vectors of 1024 (32x32)
# by dividing image into 9 cells and un-rolling each cell
# individually
def vec_to_cells(v):
    cells = []
    for i in range(3):
        for j in range(3):
            cells.append(extract_cell(v, i, j))
    return np.array(cells)

In [4]:
X_train9 = np.vstack([vec_to_cells(sample) for sample in X_train])
X_train9.shape, X_train9.dtype

((180000, 1024), dtype('uint8'))

In [5]:
# Kullback–Leibler Divergence
def kld(p, q):
    return np.sum(np.where(p != 0, p * np.log(p / q), 0))

In [6]:
# Local Binary Pattern Histogram
def lbp_histogram(v, dim=32):
    img = v.reshape(dim, dim)
    patterns = local_binary_pattern(img, 8, 1, 'uniform')
    hist, _ = np.histogram(patterns, bins=np.arange(59 + 1))
    # add one to avoid divide-by-zero errors later in kld()
    return hist.astype('int16') + 1

In [7]:
# Reduce <X> to <target_size> by eliminating similar features,
# using <distance> as a metric, via an iterative, stochastic
# sampling of <num_subsamples> and eliminating <num_eliminate>
# features at each iteration
def reduce(X, target_size, num_subsample, num_eliminate, distance):
    modulo = 200
    while (len_X := len(X)) > target_size:
        distances = [(0, distance(X[0], X[1]))]
        for _ in range(num_subsample):
            x1, x2 = (randrange(len_X), randrange(len_X))
            if x1 == x2: continue
            dist = distance(X[x1], X[x2])
            if dist < distances[-1][1]:
                distances.append((x1, dist))
        distances.sort(key=lambda x: x[1])
        eliminate = [tup[0] for tup in distances[:num_eliminate]]
        X = np.delete(X, eliminate, axis=0)
        modulo -= 1
        if (modulo == 0):
            print(len_X, end=' ')
            modulo = 200
    print()
    return X

In [8]:
train_hists = np.apply_along_axis(lbp_histogram, 1, X_train9)
%time landmarks = reduce(train_hists, 512, 2000, 5, kld)
%time landmarks = reduce(landmarks, 256, 2000, 1, kld)
landmarks.shape, landmarks.dtype

179034 178062 177075 176096 175103 174130 173146 172168 171189 170204 169217 168225 167260 166277 165305 164317 163349 162377 161387 160409 159429 158454 157468 156476 155494 154505 153508 152526 151550 150574 149581 148606 147638 146659 145679 144704 143718 142731 141742 140770 139791 138809 137821 136837 135848 134869 133901 132926 131935 130951 129964 128984 128006 127022 126041 125062 124074 123094 122108 121146 120161 119185 118200 117208 116229 115247 114272 113284 112302 111320 110348 109357 108377 107407 106418 105428 104444 103457 102486 101507 100535 99551 98586 97613 96627 95649 94660 93674 92687 91714 90743 89762 88773 87807 86824 85828 84834 83855 82891 81904 80914 79932 78966 77979 76993 76002 75021 74033 73047 72056 71070 70088 69113 68117 67124 66141 65157 64173 63187 62209 61236 60254 59273 58291 57311 56331 55339 54350 53374 52392 51407 50419 49438 48456 47468 46482 45494 44510 43521 42544 41568 40592 39608 38635 37659 36672 35678 34685 33694 32704 31727 30747 29758 2

((256, 59), dtype('int16'))

In [9]:
landmarks.min(), landmarks.max(), landmarks.mean()

(1, 901, 18.35593220338983)

In [10]:
landmarks

array([[ 18,  58,  43, ...,   1,   1,   1],
       [  6,  14,  29, ...,   1,   1,   1],
       [  9,  32,  26, ...,   1,   1,   1],
       ...,
       [ 35,  75,   9, ...,   1,   1,   1],
       [  9,  69,  13, ...,   1,   1,   1],
       [139, 131,  89, ...,   1,   1,   1]], dtype=int16)

In [11]:
np.savez(DATA_DIR + DATA_FILENAME + '_landmarks', landmarks=landmarks)