In [1]:
import h5py
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
from imageio import imread
from random import Random, shuffle

In [2]:
# Location of the source HDF5 archive. DIR keeps its trailing slash because
# paths in this notebook are built by plain string concatenation.
DIR = 'data/'
HDF5_FILENAME = 'data.hdf5'

# Opened read-only and kept open at module level: later cells read datasets
# through `f`, so it is intentionally not wrapped in a `with` block here.
f = h5py.File(f'{DIR}{HDF5_FILENAME}', mode='r')

In [3]:
# Names of every member of the 'train' group, materialized as a list so the
# order can be shuffled and sliced by later cells.
train = [*f['train'].keys()]

In [4]:
# Fixed seed so the shuffled sample order is the same on every fresh run.
RAND_SEED = 333
rand = Random()
rand.seed(RAND_SEED)

# Shuffles `train` in place. NOTE: re-running this cell without re-running the
# cell that builds `train` shuffles an already-shuffled list.
rand.shuffle(train)

In [5]:
# Number of samples copied into the training and test subsets.
TRAIN_SAMPLES = 20_000
TEST_SAMPLES = 10_000

# Both subsets are consecutive slices of `train`. The buffers below come from
# np.empty, so any row that is never written would hold arbitrary bytes;
# fail loudly instead of silently producing such rows.
if TRAIN_SAMPLES + TEST_SAMPLES > len(train):
    raise ValueError(
        f'Requested {TRAIN_SAMPLES + TEST_SAMPLES} samples but only '
        f'{len(train)} are available'
    )

# Length of one flattened image, taken from the first sample. Using the product
# of every dimension keeps this equal to the length of `img.ravel()` even if
# the images have more than two axes (for 2-D images it is height * width).
num_pixels = int(np.prod(f['train'][train[0]].shape))

# Uninitialized destination buffers: one flattened image per row of X, one
# label per element of y.
X_train = np.empty((TRAIN_SAMPLES, num_pixels), dtype='uint8')
y_train = np.empty(TRAIN_SAMPLES, dtype='uint8')
X_test  = np.empty((TEST_SAMPLES, num_pixels), dtype='uint8')
y_test  = np.empty(TEST_SAMPLES, dtype='uint8')

In [6]:
def populate_xy(X: np.ndarray, y: np.ndarray, m: int, offs: int):
    """Fill ``X`` and ``y`` with ``m`` consecutive samples from ``train``.

    Reads the datasets named ``train[offs:offs + m]`` from ``f['train']``,
    writes each flattened image into row ``i`` of ``X`` and its ``'label'``
    attribute into ``y[i]``, then prints the share of positive (1) and
    negative (0) labels as percentages of ``m``.

    Args:
        X: Destination for the flattened images; row ``i`` is overwritten
            with sample ``i`` of the slice.
        y: Destination for the labels; element ``i`` is overwritten.
        m: Number of samples to copy.
        offs: Index into ``train`` of the first sample to copy.

    Returns:
        The names of the copied datasets, in the order they were written.

    Raises:
        ValueError: If ``train[offs:offs + m]`` does not contain exactly
            ``m`` names (which would leave rows of ``X``/``y`` unwritten),
            or if a sample's label is neither 0 nor 1.
    """
    selected = train[offs:offs + m]
    if len(selected) != m:
        raise ValueError(
            f'Requested {m} samples at offset {offs}, '
            f'but only {len(selected)} are available'
        )

    # Look the group up once instead of on every iteration.
    group = f['train']
    names = []
    count_pos, count_neg = 0, 0
    for i, name in enumerate(selected):
        dset = group[name]
        label = dset.attrs['label']
        # Validate the label before touching the output buffers.
        if label == 1:
            count_pos += 1
        elif label == 0:
            count_neg += 1
        else:
            raise ValueError(f'Unexpected label {label!r} for sample {name!r}')
        # dset[()] reads the whole dataset; ravel() flattens it to 1-D.
        X[i] = dset[()].ravel()
        y[i] = label
        names.append(name)

    # Avoid dividing by zero when no samples were requested.
    denom = m if m else 1
    print(f'Pos: {count_pos/denom*100:.1f}, Neg: {count_neg/denom*100:.1f}')
    return names

In [7]:
# Training subset: the first TRAIN_SAMPLES names of the shuffled list.
train_names = populate_xy(X_train, y_train, m=TRAIN_SAMPLES, offs=0)
# Test subset: the next TEST_SAMPLES names, starting where the training slice
# ends, so the two subsets do not overlap.
test_names = populate_xy(X_test, y_test, m=TEST_SAMPLES, offs=TRAIN_SAMPLES)

Pos: 40.6, Neg: 59.4
Pos: 39.8, Neg: 60.2


In [8]:
# Base name of the output archives; '_train' / '_test' is appended per split.
SUBSET_FILENAME = 'data_naive1d_subset20k'

# Each archive stores the flattened images (X), the labels (y) and the source
# dataset names (names) for one split.
np.savez(f'{DIR}{SUBSET_FILENAME}_train', X=X_train, y=y_train, names=train_names)
np.savez(f'{DIR}{SUBSET_FILENAME}_test', X=X_test, y=y_test, names=test_names)