In [1]:
import matplotlib.pyplot as plt
import numpy as np
from random import randrange
from skimage.feature import local_binary_pattern

In [2]:
# Directory holding the .npz archives read and written by this notebook.
# The trailing slash matters: paths are built by plain string concatenation.
DATA_DIR = 'data/'
# Base name shared by the input archives; a '_train' / '_test' suffix and the
# '.npz' extension are appended when the splits are loaded.
DATA_FILENAME = 'data_lbp_subset20k'

In [3]:
# Load the training split. The context manager guarantees the archive is
# closed even if one of the key lookups raises; each array is fully read into
# memory inside the block, so it stays valid after the file is closed.
with np.load(DATA_DIR + DATA_FILENAME + '_train' + '.npz') as data_train:
    X_train = data_train['X']
    y_train = data_train['y']
    train_names = data_train['names']
X_train.shape, X_train.dtype

((20000, 90), dtype('int16'))

In [4]:
# Load the test split. The context manager guarantees the archive is closed
# even if one of the key lookups raises; each array is fully read into memory
# inside the block, so it stays valid after the file is closed.
with np.load(DATA_DIR + DATA_FILENAME + '_test' + '.npz') as data_test:
    X_test = data_test['X']
    y_test = data_test['y']
    test_names = data_test['names']
X_test.shape, X_test.dtype

((10000, 90), dtype('int16'))

In [5]:
LAND_FILENAME = 'data_landmarks_subset4k'
# Load the landmark matrix (one landmark per row). The context manager closes
# the archive even if the key lookup raises; the array is read into memory
# inside the block, so it stays valid after the file is closed.
with np.load(DATA_DIR + LAND_FILENAME + '.npz') as data_land:
    landmarks = data_land['landmarks']
landmarks.shape, landmarks.dtype

((256, 10), dtype('int16'))

In [6]:
def kld(p, q):
    """Return the smaller of the two directed Kullback-Leibler divergences.

    Computes ``sum(p * log2(p / q))`` and ``sum(q * log2(q / p))`` and returns
    the minimum of the two.

    Zero entries are handled with the usual conventions instead of producing
    NaN: a term whose weight is 0 contributes 0 (``0 * log 0 = 0``), and a
    positive weight against a zero denominator makes that direction ``+inf``.

    Parameters
    ----------
    p, q : array_like
        1-D sequences of equal length with non-negative entries (e.g.
        histogram counts). They are NOT normalised here, so when their sums
        differ the result can be negative.

    Returns
    -------
    numpy.float64
        ``min(KL(p || q), KL(q || p))`` in bits.
    """
    p = np.asarray(p, dtype=np.float64)
    q = np.asarray(q, dtype=np.float64)

    def directed(a, b):
        # Only bins with positive weight contribute; skipping the rest avoids
        # the 0 * log2(0) = 0 * -inf = NaN that would poison the whole sum.
        support = a > 0
        a_pos = a[support]
        # a_pos / 0 -> inf is the intended outcome here, so silence the
        # divide-by-zero warning rather than letting it flood the output.
        with np.errstate(divide='ignore'):
            return np.sum(a_pos * np.log2(a_pos / b[support]))

    return min(directed(p, q), directed(q, p))

In [7]:
def kld_v(v):
    """Return a one-argument function that computes ``kld(v, z)``.

    Fixing ``v`` up front gives a callable with the single-array signature
    that ``np.apply_along_axis`` expects.
    """
    def divergence_from_v(z):
        return kld(v, z)

    return divergence_from_v

In [8]:
def lbp_to_landmark(v):
    """Return the ``kld`` value between ``v`` and every row of ``landmarks``.

    The result is a 1-D array with one entry per landmark row, in row order.
    Note that this is the full vector of distances, not an index: callers
    that want the closest landmark should take ``np.argmin`` of the result.
    """
    distance_to_v = kld_v(v)
    return np.apply_along_axis(distance_to_v, axis=1, arr=landmarks)

In [9]:
def generate_feat(v):
    """Build the landmark-distance feature vector for one sample row.

    ``v`` is split into consecutive cells of ``landmarks.shape[1]`` values
    each. Every cell is compared against all landmarks with
    ``lbp_to_landmark`` and the per-cell distance vectors are concatenated
    cell by cell into a single 1-D array.

    With the arrays loaded in this notebook (90-value rows, 256 x 10
    landmarks) that is 9 cells x 256 landmarks = 2304 values.
    """
    cell_width = landmarks.shape[1]
    cells = v.reshape(-1, cell_width)
    per_cell_dists = np.apply_along_axis(lbp_to_landmark, axis=1, arr=cells)
    return per_cell_dists.ravel()

In [10]:
%time train_feat = np.apply_along_axis(generate_feat, 1, X_train)
train_feat.shape, train_feat.dtype

CPU times: user 14min 59s, sys: 6.29 s, total: 15min 5s
Wall time: 15min 2s


((20000, 2304), dtype('float64'))

In [11]:
%time test_feat = np.apply_along_axis(generate_feat, 1, X_test)
test_feat.shape, test_feat.dtype

CPU times: user 7min 47s, sys: 4.43 s, total: 7min 51s
Wall time: 13min 22s


((10000, 2304), dtype('float64'))

In [12]:
SUBSET_FILENAME = 'data_ldist_subset20k'

# Persist the distance features next to the unchanged labels and names, one
# archive per split (np.savez appends the '.npz' extension itself).
np.savez(
    DATA_DIR + SUBSET_FILENAME + '_train',
    X=train_feat,
    y=y_train,
    names=train_names
)
np.savez(
    DATA_DIR + SUBSET_FILENAME + '_test',
    X=test_feat,
    y=y_test,
    names=test_names
)