# Generate Naive 1-D flattened .npz from HDF5 for Shallow-ML

In [1]:
import h5py
import os
import cv2
import numpy as np
from imageio import imread
from random import Random, shuffle

In [2]:
# Directory that holds the input HDF5 file and receives the generated .npz.
# The trailing slash matters: paths are built by plain string concatenation.
DATA_DIR = 'data/'
# Name of the HDF5 file, relative to DATA_DIR, that the samples are read from.
HDF5_FILENAME = 'data.hdf5'

In [3]:
# Open the HDF5 file read-only. The handle is intentionally left open because
# later cells read individual datasets from it.
hdf5_path = DATA_DIR + HDF5_FILENAME
f = h5py.File(hdf5_path, 'r')

# Names of every member of the 'train' group (iterating a group yields names).
train = list(f['train'])

In [4]:
# Fixed seed so the shuffled order of `train` is reproducible between runs.
RAND_SEED = 333
rand = Random()
rand.seed(RAND_SEED)
# Shuffle in place; later cells take consecutive slices of this list.
rand.shuffle(train)

In [5]:
# Preset to build. Any value other than '200k' or '20k' selects the smallest
# preset (4,000 train / 2,000 test samples).
SUBSET = '200k'
_SUBSET_SIZES = {
    '200k': (200_000, 20_000),
    '20k': (20_000, 10_000),
}
TRAIN_SAMPLES, TEST_SAMPLES = _SUBSET_SIZES.get(SUBSET, (4_000, 2_000))

# Pixels in one image plane, taken from the first entry of `train`.
height, width = f['train'][train[0]].shape[:2]
num_pixels = height * width


def _alloc_buffers(n_samples):
    """Return uninitialised (X, X2, y) uint8 buffers for `n_samples` rows."""
    full = np.empty((n_samples, num_pixels), dtype=np.uint8)
    # A quarter of the pixels: room for an image halved along each axis.
    quarter = np.empty((n_samples, num_pixels // 4), dtype=np.uint8)
    labels = np.empty(n_samples, dtype=np.uint8)
    return full, quarter, labels


X_train, X2_train, y_train = _alloc_buffers(TRAIN_SAMPLES)
X_test, X2_test, y_test = _alloc_buffers(TEST_SAMPLES)

In [6]:
def populate_xy(X: np.ndarray, X2: np.ndarray, y: np.ndarray, m: int, offs: int):
    """Fill preallocated arrays with flattened images and labels from HDF5.

    Reads ``m`` consecutive entries of the module-level ``train`` name list,
    starting at index ``offs``, from the ``'train'`` group of the open HDF5
    file ``f``. For each entry, channel 0 of the stored image is flattened
    into a row of ``X``, a copy downscaled by 0.5 along each axis (bicubic)
    is flattened into the same row of ``X2``, and the dataset's ``label``
    attribute is stored in ``y``. A one-line positive/negative percentage
    summary is printed when done.

    Args:
        X: Output array; row ``i`` receives the flattened channel-0 image.
        X2: Output array; row ``i`` receives the flattened downscaled image.
        y: Output array; element ``i`` receives the label.
        m: Number of samples to read.
        offs: Index into ``train`` of the first sample to read.

    Returns:
        The list of dataset names that were read, in row order.

    Raises:
        ValueError: If ``m`` is negative, if fewer than ``m`` names are
            available starting at ``offs``, or if a label is neither 0 nor 1.
    """
    if m < 0:
        raise ValueError(f'm must be non-negative, got {m}')
    if m == 0:
        # Nothing to read; this also avoids dividing by zero in the summary.
        return []

    names = list(train[offs:offs + m])
    if len(names) != m:
        # A short slice would leave the trailing rows of X/X2/y unwritten and
        # make the printed percentages wrong, so fail loudly instead.
        raise ValueError(
            f'Requested {m} samples from offset {offs}, '
            f'but only {len(names)} are available'
        )

    group = f['train']  # hoisted: the group lookup does not change per sample
    count_pos, count_neg = 0, 0
    for i, name in enumerate(names):
        dset = group[name]
        img = dset[()][:, :, 0]  # Ch0: G*, Ch1: H, Ch2: E, Ch3: D
        img2 = cv2.resize(img, None, fx=0.5, fy=0.5, interpolation=cv2.INTER_CUBIC)
        X[i] = img.ravel()
        X2[i] = img2.ravel()
        label = dset.attrs['label']
        if label == 1:
            count_pos += 1
        elif label == 0:
            count_neg += 1
        else:
            raise ValueError(f'Unexpected label {label!r} for sample {name!r}')
        y[i] = label
    print(f'Pos: {count_pos/m*100:.1f}, Neg: {count_neg/m*100:.1f}')
    return names

In [7]:
# Training rows come from the first TRAIN_SAMPLES shuffled names.
train_names = populate_xy(
    X=X_train, X2=X2_train, y=y_train, m=TRAIN_SAMPLES, offs=0,
)
# Test rows start right after the training slice, so the two do not overlap.
test_names = populate_xy(
    X=X_test, X2=X2_test, y=y_test, m=TEST_SAMPLES, offs=TRAIN_SAMPLES,
)

Pos: 40.5, Neg: 59.5
Pos: 40.2, Neg: 59.8


In [8]:
# Output file is named after the chosen subset and written next to the input.
SUBSET_FILENAME = '1d_subset' + SUBSET
npz_path = DATA_DIR + SUBSET_FILENAME + '.npz'

# Each key becomes the name of one array inside the (uncompressed) archive.
arrays = {
    'X_train': X_train,
    'X2_train': X2_train,
    'y_train': y_train,
    'X_test': X_test,
    'X2_test': X2_test,
    'y_test': y_test,
    'train_names': train_names,
    'test_names': test_names,
}
np.savez(npz_path, **arrays)