In [1]:
from helper import *
from skimage.feature import local_binary_pattern
from pyod.models.copod import COPOD

In [2]:
# Notebook configuration: which pre-computed LBP subset to load and where it lives.
SUBSET = '20k'
DATA_DIR = 'data/'
# Base name (no extension) of the archive holding the LBP histogram features.
DATA_FILENAME = f'lbp_subset{SUBSET}'

In [3]:
# Read the train/test split from the .npz archive named in the config cell.
# NOTE(review): load_npz is not defined in the visible cells; presumably it comes
# from `from helper import *` and prints the shape/dtype summary seen below.
npz_path = f'{DATA_DIR}{DATA_FILENAME}.npz'
(X_train, y_train), (X_test, y_test) = load_npz(npz_path)

((20000, 10), dtype('int16')) ((20000,), dtype('uint8'))
((10000, 10), dtype('int16')) ((10000,), dtype('uint8'))


In [4]:
# Partition the training rows by label: a truthy label goes to the positive
# list, anything else to the negative list.
TRAIN_SAMPLES = len(X_train)
train_pos, train_neg = [], []
for x, y in zip(X_train, y_train):
    (train_pos if y else train_neg).append(x)
num_pos, num_neg = len(train_pos), len(train_neg)
num_pos, num_neg

(8112, 11888)

In [5]:
# Stack the per-class row lists into 2-D arrays, then show (shape, dtype) for each.
X_pos, X_neg = np.array(train_pos), np.array(train_neg)
tuple((arr.shape, arr.dtype) for arr in (X_pos, X_neg))

(((8112, 10), dtype('int16')), ((11888, 10), dtype('int16')))

In [6]:
%%time
pos_clf = COPOD()
pos_clf.fit(X_pos)
pos_pos_scores = pos_clf.decision_scores_
pos_neg_scores = pos_clf.decision_function(X_neg)

CPU times: user 201 ms, sys: 9.63 ms, total: 211 ms
Wall time: 211 ms


In [7]:
# Mean and standard deviation of the positive-model scores: first for the
# positive samples it was fitted on, then for the negative samples.
for score_set in (pos_pos_scores, pos_neg_scores):
    print(score_set.mean(), score_set.std())

13.403675318069116 6.616887190253705
14.617824717863698 7.687808734152937


In [8]:
%%time
neg_clf = COPOD()
neg_clf.fit(X_neg)
neg_neg_scores = neg_clf.decision_scores_
neg_pos_scores = neg_clf.decision_function(X_pos)

CPU times: user 248 ms, sys: 9.47 ms, total: 258 ms
Wall time: 258 ms


In [9]:
# Mean and standard deviation of the negative-model scores: first for the
# negative samples it was fitted on, then for the positive samples.
for score_set in (neg_neg_scores, neg_pos_scores):
    print(score_set.mean(), score_set.std())

13.420146988951801 6.662243840264217
11.664317627533586 4.24494775791951


In [10]:
def predict(X: np.ndarray, y: np.ndarray, m: int, offs: int):
    """Fill ``X`` and ``y`` in place with detector scores and labels for a window of samples.

    For every name in ``train[offs:offs+m]`` the dataset ``f['train'][name]``
    is read, converted to a histogram with ``lbpu_histogram`` and scored by
    both ``pos_clf`` and ``neg_clf``. A one-line class-balance summary is
    printed at the end.

    NOTE(review): ``train``, ``f`` and ``lbpu_histogram`` are module-level
    names that are not defined in the visible cells; presumably they come from
    ``from helper import *`` or another cell. Confirm before calling.

    Parameters
    ----------
    X : np.ndarray
        Output array; row ``i`` receives the ``pos_clf`` score in column 0 and
        the ``neg_clf`` score in column 1. Needs at least ``m`` rows and two
        columns.
    y : np.ndarray
        Output array; element ``i`` receives the sample's ``label`` attribute.
    m : int
        Maximum number of samples to process.
    offs : int
        Index into ``train`` of the first sample to process.

    Returns
    -------
    list
        The names that were processed, in order.

    Raises
    ------
    ValueError
        If a sample's label is neither 0 nor 1. The check runs before anything
        is written for that sample.
    """
    names = []
    count_pos, count_neg = 0, 0
    for i, name in enumerate(train[offs:offs+m]):
        dset = f['train'][name]
        label = dset.attrs['label']
        if label == 1:
            count_pos += 1
        elif label == 0:
            count_neg += 1
        else:
            raise ValueError(f'unexpected label {label!r} for sample {name!r}')
        img = dset[()]
        hist = lbpu_histogram(img).reshape(1,-1)
        # decision_function yields one score per input row; take the single
        # element explicitly instead of relying on the implicit (and, since
        # NumPy 1.25, deprecated) size-1-array -> scalar conversion.
        X[i, 0] = pos_clf.decision_function(hist)[0]
        X[i, 1] = neg_clf.decision_function(hist)[0]
        y[i] = label
        names.append(name)
    # Use the number of samples actually processed as the denominator: the
    # slice may be shorter than m, and m == 0 must not divide by zero.
    denom = len(names) or 1
    print(f'Pos: {count_pos/denom*100:.1f}, Neg: {count_neg/denom*100:.1f}')
    return names

In [11]:
%time
Xf_train = np.column_stack((
    pos_clf.decision_function(X_train).astype('float32'),
    neg_clf.decision_function(X_train).astype('float32'),
))
Xf_train.shape, Xf_train.dtype

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 8.11 µs


((20000, 2), dtype('float32'))

In [12]:
%time
Xf_test = np.column_stack((
    pos_clf.decision_function(X_test).astype('float32'),
    neg_clf.decision_function(X_test).astype('float32'),
))
Xf_test.shape, Xf_test.dtype

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.96 µs


((10000, 2), dtype('float32'))

In [13]:
# Persist the two-column score features together with the original labels so
# later notebooks can load them without refitting the detectors.
SUBSET_FILENAME = f'copod_subset{SUBSET}'
np.savez(
    f'{DATA_DIR}{SUBSET_FILENAME}',
    X_train=Xf_train,
    y_train=y_train,
    X_test=Xf_test,
    y_test=y_test,
)

In [14]:
# Evaluate on the two-column score features with the GBT helper.
# NOTE(review): GBT is not defined in the visible cells; presumably it comes
# from `from helper import *`. Its printed table reports Train/Test figures
# per n_est value. Confirm what metric those figures are.
GBT(Xf_train, Xf_test, y_train, y_test)

n_est: Train, Test
400  : 73.4,  70.4
600  : 74.4,  69.5
800  : 75.4,  68.8
