In [1]:
import h5py
import matplotlib.pyplot as plt
import numpy as np
from random import Random, shuffle
from skimage.feature import local_binary_pattern
from pyod.models.copod import COPOD
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [2]:
# Location of the HDF5 image store.
DIR = 'data/'
HDF5_FILENAME = 'data.hdf5'
# Opened read-only; the handle is kept open because later cells read from it.
f = h5py.File(f'{DIR}{HDF5_FILENAME}', mode='r')

In [3]:
# Names of every member stored under the 'train' group.
train = list(f['train'])
len(train)

220025

In [4]:
# Set of names whose 'label' attribute equals 1 (the positive class).
names_pos = set(
    name
    for name in train
    if f['train'][name].attrs['label'] == 1
)
len(names_pos)

89117

In [5]:
# Fixed seed so the sample order (and therefore the train/test subsets taken
# from it below) is repeatable on a fresh kernel.
RAND_SEED = 333
rand = Random(RAND_SEED)
# NOTE: shuffles `train` in place. Re-running this cell without re-running the
# cell that builds `train` reshuffles an already-shuffled list and yields a
# different order, so later subsets would no longer match a fresh run.
rand.shuffle(train)

In [6]:
TRAIN_SAMPLES = 20_000
TEST_SAMPLES = 10_000
# Partition the first TRAIN_SAMPLES shuffled names by membership in names_pos,
# keeping the shuffled order within each partition.
train_pos = [name for name in train[:TRAIN_SAMPLES] if name in names_pos]
train_neg = [name for name in train[:TRAIN_SAMPLES] if name not in names_pos]
num_pos, num_neg = len(train_pos), len(train_neg)
num_pos, num_neg

(8112, 11888)

In [7]:
# Flattened image length: product of the first two dimensions of the first image.
num_pixels = int(np.prod(f['train'][train[0]].shape[:2]))
# One row per image, one column per pixel; filled in by populate_x.
X_pos = np.empty(shape=(num_pos, num_pixels), dtype=np.uint8)
X_neg = np.empty(shape=(num_neg, num_pixels), dtype=np.uint8)

In [8]:
def populate_x(X: np.ndarray, names, label):
    """Copy each named image, flattened, into successive rows of ``X``.

    Row ``i`` of ``X`` receives the raveled contents of the dataset
    ``f['train'][names[i]]``. After each row is written, the dataset's
    'label' attribute is asserted to equal ``label``.

    Args:
        X: Pre-allocated array written in place, one row per name.
        names: Iterable of dataset names under ``f['train']``.
        label: Label value every named dataset is expected to carry.

    Raises:
        AssertionError: If a dataset's 'label' attribute differs from ``label``.
    """
    group = f['train']
    row = 0
    for name in names:
        dset = group[name]
        X[row] = dset[()].ravel()
        assert dset.attrs['label'] == label
        row += 1

In [9]:
# Fill X_pos with the flattened positive-class images (label 1).
populate_x(X=X_pos, names=train_pos, label=1)
X_pos.shape

(8112, 9216)

In [10]:
# Fill X_neg with the flattened negative-class images (label 0).
populate_x(X=X_neg, names=train_neg, label=0)
X_neg.shape

(11888, 9216)

In [11]:
# Local Binary Pattern Histogram
def lbp_histogram(v, dim=96):
    """Return a 10-bin local-binary-pattern histogram for one flattened image.

    ``v`` is reshaped to ``(dim, dim)``, passed through
    ``local_binary_pattern(img, 8, 1, 'uniform')``, and the resulting codes
    are counted in unit-width bins with edges 0..59.

    Args:
        v: Flattened image with ``dim * dim`` elements.
        dim: Side length used to reshape ``v`` into a square image.

    Returns:
        ``int16`` array of length 10: the first ten bin counts, each
        incremented by one.

    Raises:
        AssertionError: If any code falls in a bin beyond the first ten.
    """
    codes = local_binary_pattern(v.reshape(dim, dim), 8, 1, 'uniform')
    counts = np.histogram(codes, bins=np.arange(60))[0]
    # Only the first ten bins are kept as the feature vector, so verify that
    # nothing is being dropped from the remaining bins.
    assert counts[10:].sum() == 0
    # The +1 keeps every bin strictly positive; the original note says this is
    # to avoid divide-by-zero in a kld() helper, which is not defined in this
    # notebook.
    return counts[:10].astype('int16') + 1

In [12]:
# One LBP histogram per row (image) of X_pos.
pos_hists = np.apply_along_axis(func1d=lbp_histogram, axis=1, arr=X_pos)
pos_hists.shape, pos_hists.dtype

((8112, 10), dtype('int16'))

In [13]:
# One LBP histogram per row (image) of X_neg.
neg_hists = np.apply_along_axis(func1d=lbp_histogram, axis=1, arr=X_neg)
neg_hists.shape, neg_hists.dtype

((11888, 10), dtype('int16'))

In [14]:
# Outlier detector fitted on positive-class histograms only.
pos_clf = COPOD().fit(pos_hists)
# Scores of the training (positive) histograms under that detector ...
pos_pos_scores = pos_clf.decision_scores_
# ... and scores it assigns to the negative-class histograms.
pos_neg_scores = pos_clf.decision_function(neg_hists)

In [15]:
# Mean and standard deviation of the positive detector's scores per class.
print(np.mean(pos_pos_scores), np.std(pos_pos_scores))
print(np.mean(pos_neg_scores), np.std(pos_neg_scores))

13.403675318069116 6.616887190253705
14.617824717863698 7.687808734152937


In [16]:
# Outlier detector fitted on negative-class histograms only.
neg_clf = COPOD().fit(neg_hists)
# Scores of the training (negative) histograms under that detector ...
neg_neg_scores = neg_clf.decision_scores_
# ... and scores it assigns to the positive-class histograms.
neg_pos_scores = neg_clf.decision_function(pos_hists)

In [17]:
# Mean and standard deviation of the negative detector's scores per class.
print(np.mean(neg_neg_scores), np.std(neg_neg_scores))
print(np.mean(neg_pos_scores), np.std(neg_pos_scores))

13.420146988951801 6.662243840264217
11.664317627533586 4.24494775791951


In [18]:
def populate_xy(X: np.ndarray, y: np.ndarray, m: int, offs: int):
    """Fill ``X`` with two outlier scores and ``y`` with the label for ``m`` images.

    The images are ``train[offs:offs + m]``. For each one, its LBP histogram
    is scored by ``pos_clf`` (written to column 0 of ``X``) and by ``neg_clf``
    (column 1), and its 'label' attribute is written to ``y``. A one-line
    class-balance summary is printed at the end.

    Args:
        X: Pre-allocated array with at least ``m`` rows and 2 columns; written in place.
        y: Pre-allocated array with at least ``m`` entries; written in place.
        m: Number of samples to process.
        offs: Start offset into the shuffled ``train`` name list.

    Returns:
        List of the dataset names processed, in row order.

    Raises:
        ValueError: If fewer than ``m`` names are available from ``offs``
            (the remaining rows of ``X``/``y`` would otherwise be left
            uninitialized), or if a label other than 0 or 1 is encountered.
    """
    names = train[offs:offs + m]
    if len(names) != m:
        raise ValueError(
            f'requested {m} samples at offset {offs}, '
            f'but only {len(names)} are available'
        )
    group = f['train']
    count_pos, count_neg = 0, 0
    # NOTE(review): scoring one row at a time is slow (the recorded run took
    # roughly 29 minutes for 20k rows). Scoring all rows in one call may not
    # give the same numbers if the detector's score for a row depends on the
    # other rows passed with it -- verify against pyod before vectorizing.
    for i, name in enumerate(names):
        dset = group[name]
        hist = lbp_histogram(dset[()].ravel()).reshape(1, -1)
        # decision_function returns a length-1 array for a single row; take
        # the element explicitly, since NumPy deprecates implicitly converting
        # a 1-element array to a scalar on assignment.
        X[i, 0] = pos_clf.decision_function(hist)[0]
        X[i, 1] = neg_clf.decision_function(hist)[0]
        label = dset.attrs['label']
        y[i] = label
        if label == 1:
            count_pos += 1
        elif label == 0:
            count_neg += 1
        else:
            raise ValueError(f'unexpected label {label!r} for sample {name!r}')
    # Guard the percentage computation against an empty request (m == 0).
    denom = m if m else 1
    print(f'Pos: {count_pos/denom*100:.1f}, Neg: {count_neg/denom*100:.1f}')
    return names

In [19]:
# Two outlier-score features per sample (filled by populate_xy), plus labels.
Xo_train = np.empty(shape=(TRAIN_SAMPLES, 2), dtype=np.float32)
y_train = np.empty(shape=TRAIN_SAMPLES, dtype=np.uint8)
Xo_test = np.empty(shape=(TEST_SAMPLES, 2), dtype=np.float32)
y_test = np.empty(shape=TEST_SAMPLES, dtype=np.uint8)

In [20]:
# Training rows come from train[0:TRAIN_SAMPLES]; test rows come from the
# next TEST_SAMPLES names, so the two subsets do not overlap.
# Both calls are slow (see the recorded timings below).
%time train_names = populate_xy(Xo_train, y_train, TRAIN_SAMPLES, 0)
%time test_names = populate_xy(Xo_test, y_test, TEST_SAMPLES, TRAIN_SAMPLES)

Pos: 40.6, Neg: 59.4
CPU times: user 24min 15s, sys: 9.12 s, total: 24min 24s
Wall time: 28min 49s
Pos: 39.8, Neg: 60.2
CPU times: user 12min 50s, sys: 8.27 s, total: 12min 58s
Wall time: 13min 19s


In [79]:
# Baseline classifier on the two outlier-score features only.
# random_state is pinned to the notebook's seed so that repeated fits give the
# same model; without it the estimator draws from the global NumPy RNG.
clf1 = GradientBoostingClassifier(n_estimators=100, random_state=RAND_SEED)
clf1.fit(Xo_train, y_train)

GradientBoostingClassifier()

In [80]:
# Training-set accuracy of the two-feature model.
clf1.score(X=Xo_train, y=y_train)

0.71495

In [81]:
# Held-out accuracy of the two-feature model.
yo_hat = clf1.predict(X=Xo_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=yo_hat)
print(accuracy)

0.7155


In [24]:
# Base name of the precomputed feature files; '_train.npz' / '_test.npz' are
# appended when loading. These files are not produced by this notebook.
DATA_FILENAME = '2lbp_subset20k'

In [64]:
# Load the precomputed training features. The context manager closes the
# archive even when the name check below fails; the arrays are read into
# memory inside the block, so they remain usable afterwards.
with np.load(DIR + DATA_FILENAME + '_train' + '.npz') as data_train:
    Xl_train = data_train['X']
    yl_train = data_train['y']
    # Rows must be in the same sample order as Xo_train for hstack to be valid.
    assert np.all(data_train['names'] == train_names)
Xl_train.shape, yl_train.shape

((20000, 20), (20000,))

In [65]:
# Load the precomputed test features. The context manager closes the archive
# even when the name check below fails; the arrays are read into memory
# inside the block, so they remain usable afterwards.
with np.load(DIR + DATA_FILENAME + '_test' + '.npz') as data_test:
    Xl_test = data_test['X']
    yl_test = data_test['y']
    # Rows must be in the same sample order as Xo_test for hstack to be valid.
    assert np.all(data_test['names'] == test_names)
Xl_test.shape, yl_test.shape

((10000, 20), (10000,))

In [66]:
# Combined feature matrix: the two outlier scores followed by the loaded features.
Xf_train = np.concatenate([Xo_train, Xl_train], axis=1)
Xf_test = np.concatenate([Xo_test, Xl_test], axis=1)

In [96]:
# Classifier on the combined feature set.
# random_state is pinned to the notebook's seed so that repeated fits give the
# same model; without it the estimator draws from the global NumPy RNG.
clf2 = GradientBoostingClassifier(n_estimators=800, random_state=RAND_SEED)
clf2.fit(Xf_train, y_train)

GradientBoostingClassifier(n_estimators=800)

In [97]:
# Training-set accuracy of the combined-feature model.
clf2.score(X=Xf_train, y=y_train)

0.8651

In [98]:
# Held-out accuracy of the combined-feature model.
y2_hat = clf2.predict(X=Xf_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y2_hat)
print(accuracy)

0.8028


In [None]:
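# TODO: unfinished -- intended to save results to an .npz file, but the arrays
# to pass to np.savez were never filled in.
# LAND_FILENAME = 'od.npz'
# np.savez(DIR + LAND_FILENAME, )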