In [1]:
import h5py
import os
import numpy as np
from imageio import imread
from random import Random, shuffle
from skimage.color import rgb2hed

In [2]:
# Locations of the input data; directory strings keep their trailing slash
# because paths are built by plain string concatenation.
DATA_DIR = 'data/'
TRAIN_DIR = DATA_DIR + 'train/'
HDF5_FILENAME = 'data.hdf5'

In [3]:
# Open the HDF5 container read-only. The handle is kept open on purpose:
# later cells read individual samples through `f`.
f = h5py.File(f'{DATA_DIR}{HDF5_FILENAME}', mode='r')
# Names of every entry stored under the 'train' group.
train = [*f['train'].keys()]

In [4]:
# Fixed seed so the sample order is the same on every fresh run.
RAND_SEED = 333
rand = Random()
rand.seed(RAND_SEED)
# NOTE: shuffles `train` in place -- re-running only this cell permutes the
# already-shuffled list again instead of reproducing the first ordering.
rand.shuffle(train)

In [5]:
# Which subset to export. Any value other than '200k' or '20k' falls back to
# the smallest split (4 000 train / 2 000 test).
SUBSET = '200k'
if SUBSET == '200k':
    TRAIN_SAMPLES, TEST_SAMPLES = 200_000, 20_000
elif SUBSET == '20k':
    TRAIN_SAMPLES, TEST_SAMPLES = 20_000, 10_000
else:
    TRAIN_SAMPLES, TEST_SAMPLES = 4_000, 2_000

# First two dimensions of the first sample's stored array.
img_size = f['train'][train[0]].shape[:2]

# Preallocate uninitialised output buffers: three channels per image and one
# uint8 label per sample.
X_train = np.empty((TRAIN_SAMPLES,) + tuple(img_size) + (3,), dtype=np.uint8)
X_test  = np.empty((TEST_SAMPLES,) + tuple(img_size) + (3,), dtype=np.uint8)
y_train = np.empty(TRAIN_SAMPLES, dtype=np.uint8)
y_test  = np.empty(TEST_SAMPLES, dtype=np.uint8)

In [6]:
def populate_xy(X: np.ndarray, y: np.ndarray, m: int, offs: int,
                names=None, group=None):
    """Copy ``m`` samples into ``X`` / ``y`` and return their names.

    Sample ``i`` is taken from ``names[offs + i]``: the first three channels of
    its stored array are written to ``X[i]`` and its ``'label'`` attribute to
    ``y[i]``. The share of positive (1) and negative (0) labels is printed as
    a percentage of ``m``.

    Args:
        X: Output image array; row ``i`` receives ``image[:, :, :3]``.
        y: Output label array; entry ``i`` receives the sample's label.
        m: Number of samples to copy.
        offs: Index into ``names`` of the first sample to copy.
        names: Ordered sample names to draw from. Defaults to the notebook-level
            ``train`` list.
        group: Mapping from sample name to an object supporting ``ds[()]`` and
            ``ds.attrs['label']``. Defaults to ``f['train']``.

    Returns:
        The list of the ``m`` sample names that were copied, in order.

    Raises:
        ValueError: If ``m`` or ``offs`` is negative, if fewer than ``m`` names
            are available starting at ``offs``, or if a label is neither 0
            nor 1.
    """
    # Fall back to the notebook globals only when the caller gives no override.
    if names is None:
        names = train
    if group is None:
        group = f['train']

    if m < 0 or offs < 0:
        raise ValueError(f'm and offs must be non-negative, got m={m}, offs={offs}')

    selected = list(names[offs:offs + m])
    # A short slice would leave the tail of X / y as uninitialised memory and
    # skew the printed percentages, so fail loudly instead.
    if len(selected) != m:
        raise ValueError(
            f'Requested {m} samples from offset {offs}, '
            f'but only {len(selected)} are available'
        )

    count_pos, count_neg = 0, 0
    for i, name in enumerate(selected):
        sample = group[name]  # single lookup for both the image and the label
        label = sample.attrs['label']
        if label == 1:
            count_pos += 1
        elif label == 0:
            count_neg += 1
        else:
            raise ValueError(
                f'Unexpected label {label!r} for sample {name!r}; expected 0 or 1'
            )
        # img_ghed => Ch0: G*, Ch1: H*, Ch2: E*, Ch3: D -- keep the first three.
        img_ghed = sample[()]
        X[i] = img_ghed[:, :, :3]
        y[i] = label

    # Guard the m == 0 case, which used to raise ZeroDivisionError.
    pos_pct = count_pos / m * 100 if m else 0.0
    neg_pct = count_neg / m * 100 if m else 0.0
    print(f'Pos: {pos_pct:.1f}, Neg: {neg_pct:.1f}')
    return selected

In [7]:
# Training split: the first TRAIN_SAMPLES names of the shuffled list.
train_names = populate_xy(X_train, y_train, m=TRAIN_SAMPLES, offs=0)
# Test split: the next TEST_SAMPLES names, starting right after the training block.
test_names = populate_xy(X_test, y_test, m=TEST_SAMPLES, offs=TRAIN_SAMPLES)

Pos: 40.5, Neg: 59.5
Pos: 40.2, Neg: 59.8


In [8]:
# Sanity check: shape and dtype of both image arrays.
print(f'{X_train.shape} {X_train.dtype}')
print(f'{X_test.shape} {X_test.dtype}')

(200000, 96, 96, 3) uint8
(20000, 96, 96, 3) uint8


In [9]:
# Write both splits, together with the sample names in each, to a single
# .npz archive inside DATA_DIR.
SUBSET_FILENAME = f'3ch_subset{SUBSET}'
np.savez(
    f'{DATA_DIR}{SUBSET_FILENAME}.npz',
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    train_names=train_names,
    test_names=test_names,
)