In [1]:
import h5py
import matplotlib.pyplot as plt
import numpy as np
from random import Random, shuffle, randrange
from skimage.feature import local_binary_pattern

In [2]:
# Location of the HDF5 container that holds the image datasets.
DIR = 'data/'
HDF5_FILENAME = 'data.hdf5'
# Opened read-only; the handle stays open because later cells read from it.
f = h5py.File(f'{DIR}{HDF5_FILENAME}', mode='r')

In [3]:
# Names of every dataset stored under the 'train' group.
train = [*f['train'].keys()]
len(train)

220025

In [4]:
# Names of the training datasets whose 'label' attribute equals 1, kept in
# the same order as they appear in `train`.
train_pos = list(filter(lambda name: f['train'][name].attrs['label'] == 1, train))
len(train_pos)

89117

In [5]:
# Fixed seed so the shuffled order below is repeatable from a fresh kernel.
RAND_SEED = 333
# Dedicated generator instance, independent of the module-level `random` state.
rand = Random(RAND_SEED)
# NOTE: shuffles `train` in place. Re-running this cell alone re-shuffles the
# already shuffled list, so the order only matches a fresh top-to-bottom run.
rand.shuffle(train)

In [6]:
# Number of (shuffled) training names to draw the positive samples from.
TRAIN_SAMPLES = 20_000
# Membership is tested once per candidate name; a set makes each test a hash
# lookup instead of a linear scan over the whole `train_pos` list.
train_pos_lookup = set(train_pos)
# Positive samples among the first TRAIN_SAMPLES shuffled names, order preserved.
train_sel = [name for name in train[:TRAIN_SAMPLES] if name in train_pos_lookup]
num_sel = len(train_sel)
num_sel

8112

In [7]:
# Every row of X_train holds one flattened image; the row width is taken from
# the first two dimensions of the first selected dataset.
img_shape = f['train'][train_sel[0]].shape
num_pixels = img_shape[0] * img_shape[1]
# Uninitialised buffer; it is filled by populate_x().
X_train = np.empty((num_sel, num_pixels), dtype=np.uint8)

In [8]:
def populate_x(X: np.ndarray, m: int, offs: int):
    """Copy flattened images of selected samples into the rows of ``X``.

    Reads the names ``train_sel[offs:offs + m]``, loads each dataset from the
    ``'train'`` group of the open HDF5 file ``f`` and writes its flattened
    contents into ``X``.

    Args:
        X: destination array, modified in place. Rows are indexed relative to
            the start of the slice (the first sample goes to ``X[0]``
            regardless of ``offs``).
        m: maximum number of samples to copy.
        offs: index into ``train_sel`` of the first sample to copy.

    Raises:
        AssertionError: if a copied sample's ``'label'`` attribute is not 1.
            The check runs after the row has been written.
    """
    batch = train_sel[offs:offs + m]
    for row in range(len(batch)):
        dataset = f['train'][batch[row]]
        # `[()]` reads the whole dataset into memory.
        pixels = dataset[()]
        X[row] = pixels.ravel()
        assert dataset.attrs['label'] == 1

In [9]:
# Fill X_train with every selected sample, starting from the first one.
populate_x(X_train, m=num_sel, offs=0)

In [10]:
def extract_cell(v, i, j, img_dim=96, cell_dim=32):
    """Extract the (``i``, ``j``)th square cell of a flattened image as a 1-D vector.

    Args:
        v: flattened square image holding ``img_dim * img_dim`` values.
        i: cell row index (0-based).
        j: cell column index (0-based).
        img_dim: side length of the square image. Defaults to 96.
        cell_dim: side length of one square cell. Defaults to 32.

    Returns:
        A 1-D array of ``cell_dim * cell_dim`` values, read row by row from
        the selected cell.
    """
    img = v.reshape(img_dim, img_dim)
    rows = slice(cell_dim * i, cell_dim * (i + 1))
    cols = slice(cell_dim * j, cell_dim * (j + 1))
    return img[rows, cols].ravel()

def vec_to_cells(v):
    """Split a flattened 96x96 image into its nine flattened 32x32 cells.

    The image is treated as a 3x3 grid of cells; each cell is unrolled
    individually with ``extract_cell``.

    Args:
        v: vector of size 9216 (96x96).

    Returns:
        Array with one row per cell (9 rows of 1024 values), ordered by cell
        row first and cell column second.
    """
    return np.array([
        extract_cell(v, cell_row, cell_col)
        for cell_row in range(3)
        for cell_col in range(3)
    ])

In [11]:
# Split every flattened image into its nine cells and stack all cells row-wise.
X_train9 = np.vstack(list(map(vec_to_cells, X_train)))
(X_train9.shape, X_train9.dtype)

((73008, 1024), dtype('uint8'))

In [12]:
def kld(p, q):
    """Return the smaller of the two Kullback-Leibler style divergences.

    Computes ``sum(p * log2(p / q))`` and ``sum(q * log2(q / p))`` and returns
    whichever is smaller.

    The inputs are used exactly as given: nothing is normalised here, and
    every entry must be non-zero for the logarithms and divisions to be
    defined.

    Args:
        p: first array of positive values.
        q: second array of positive values, same shape as ``p``.

    Returns:
        The minimum of the two directed sums.
    """
    forward = (p * np.log2(p / q)).sum()
    backward = (q * np.log2(q / p)).sum()
    return backward if backward < forward else forward

In [13]:
def lbp_histogram(v, dim=32):
    """Local Binary Pattern histogram of a flattened square image cell.

    Args:
        v: flattened cell holding ``dim * dim`` pixel values.
        dim: side length of the square cell. Defaults to 32.

    Returns:
        ``int16`` array with the counts of the first ten pattern bins, each
        incremented by one.

    Raises:
        AssertionError: if any pattern falls outside the first ten bins.
    """
    cell = v.reshape(dim, dim)
    # 8 neighbours at radius 1, 'uniform' method.
    codes = local_binary_pattern(cell, 8, 1, method='uniform')
    bin_edges = np.arange(59 + 1)
    counts = np.histogram(codes, bins=bin_edges)[0]
    # Only the first ten bins are expected to be populated, which lets the
    # landmark feature size be cut down to ten.
    assert counts[10:].sum() == 0
    # The +1 keeps every bin non-zero so kld() never divides by zero.
    return counts[:10].astype('int16') + 1

In [14]:
def reduce(X, target_size, num_subsample, num_eliminate, distance, rng=None):
    """Reduce ``X`` to ``target_size`` rows by eliminating similar rows.

    Iterative, stochastic procedure. On every iteration random pairs of rows
    are sampled and a chain of progressively closer pairs (according to
    ``distance``) is recorded. The first row of each of the last
    ``num_eliminate`` pairs in that chain, i.e. the closest pairs found, is
    deleted. Iterations repeat until exactly ``target_size`` rows remain;
    an input that already has ``target_size`` rows or fewer is returned as is.

    Progress (the current row count) is printed every 200 iterations, and a
    final newline is printed on completion.

    Args:
        X: 2-D array with one feature vector per row.
        target_size: number of rows to keep.
        num_subsample: number of random pairs drawn per iteration.
        num_eliminate: maximum number of rows removed per iteration (>= 1).
        distance: callable ``distance(a, b)`` returning a comparable value,
            where smaller means more similar.
        rng: optional object exposing ``randrange(n)`` (for example a seeded
            ``random.Random``) for reproducible runs. When ``None`` the
            module-level ``randrange`` is used.

    Returns:
        Array holding the surviving rows of ``X``.

    Raises:
        ValueError: if ``num_eliminate`` is smaller than 1.
    """
    if num_eliminate < 1:
        raise ValueError('num_eliminate must be at least 1')
    draw = randrange if rng is None else rng.randrange
    modulo = 200
    while (len_X := len(X)) > target_size:
        # Never remove more rows than are still above the target, so the
        # result cannot end up smaller than target_size.
        quota = min(num_eliminate, len_X - target_size)
        if len_X > 1:
            # Seed the chain with a real pair of two *different* rows. A pair
            # of identical indices would typically yield distance 0 and block
            # every later candidate, and the recorded index must be the
            # sampled row rather than a fixed placeholder.
            x1 = draw(len_X)
            x2 = draw(len_X - 1)
            if x2 >= x1:
                x2 += 1
            closest = [(x1, distance(X[x1], X[x2]))]
        else:
            # A single remaining row has nothing to be compared with.
            closest = [(0, 0.0)]
        for _ in range(num_subsample):
            x1, x2 = draw(len_X), draw(len_X)
            if x1 == x2:
                continue
            dist = distance(X[x1], X[x2])
            # Each appended entry is closer than the previous one, so the
            # tail of the chain holds the most similar pairs seen.
            if dist < closest[-1][1]:
                closest.append((x1, dist))
        eliminate = [index for index, _ in closest[-quota:]]
        X = np.delete(X, eliminate, axis=0)
        modulo -= 1
        if modulo == 0:
            print(len_X, end=' ')
            modulo = 200
    print()
    return X

In [15]:
# One LBP histogram per cell (row) of X_train9.
train_hists = np.apply_along_axis(lbp_histogram, axis=1, arr=X_train9)
(train_hists.shape, train_hists.dtype)

((73008, 10), dtype('int16'))

In [16]:
# Two-stage reduction of the cell histograms into landmark histograms, using
# kld as the similarity measure and 2000 random pairs per iteration.
# Stage 1 (coarse): target of 512 rows, eliminating 5 rows per iteration.
%time landmarks = reduce(train_hists, 512, 2000, 5, kld)
# Stage 2 (fine): continue from the stage-1 result down to 256 rows,
# eliminating a single row per iteration.
%time landmarks = reduce(landmarks, 256, 2000, 1, kld)
landmarks.shape, landmarks.dtype

72030 71056 70068 69082 68093 67105 66127 65147 64166 63178 62187 61209 60230 59246 58268 57287 56310 55323 54344 53358 52382 51394 50415 49430 48460 47474 46493 45522 44535 43553 42567 41582 40593 39608 38637 37650 36674 35692 34720 33729 32736 31755 30767 29783 28795 27820 26840 25851 24865 23885 22916 21929 20945 19965 18979 18001 17023 16044 15065 14086 13107 12118 11136 10161 9174 8206 7220 6237 5258 4286 3307 2325 1347 
CPU times: user 9min 31s, sys: 4.23 s, total: 9min 35s
Wall time: 9min 35s
312 
CPU times: user 9.57 s, sys: 85.3 ms, total: 9.66 s
Wall time: 9.66 s


((256, 10), dtype('int16'))

In [17]:
# Value range and average bin count across all landmark histograms.
landmark_stats = (landmarks.min(), landmarks.max(), landmarks.mean())
landmark_stats

(1, 825, 103.4)

In [18]:
# Show the selected landmark histograms (one row per landmark).
landmarks

array([[151, 134,  49, ..., 130, 127, 220],
       [ 55, 108,  47, ...,  86, 117, 125],
       [ 37,  44,  91, ...,  46,  43,  76],
       ...,
       [ 85, 122,  84, ...,  94, 100, 167],
       [ 33,  75,  84, ...,  58,  38,  77],
       [ 30,  92,  10, ...,  68, 248, 127]], dtype=int16)

In [19]:
# Persist the landmark histograms under the key 'landmarks' in the data directory.
LAND_FILENAME = 'data_landmarks_pos20k'
np.savez(f'{DIR}{LAND_FILENAME}', landmarks=landmarks)