# Save data to HDF5

In [None]:
from pathlib import Path
import h5py
import numpy as np
from PIL import Image

In [None]:
def _preproccess_one(filename, size):
    """Load one image file as a normalized float32 RGB array.

    Parameters
    ----------
    filename : str or path-like
        Image file to open with PIL.
    size : tuple of int
        Target size handed to ``Image.resize``, which interprets it as
        ``(width, height)``.

    Returns
    -------
    numpy.ndarray
        float32 array of the resized RGB image, with the 8-bit pixel values
        divided by 255 so they lie in [0, 1].
    """
    # Image.open is lazy and may keep the file open (e.g. for multi-frame
    # formats); the context manager closes it deterministically. convert()
    # returns an independent image, so the result stays valid afterwards.
    with Image.open(filename) as src:
        img = src.convert('RGB').resize(size=size, resample=Image.LANCZOS)
    scaled = np.asarray(img) / 255.0
    return scaled.astype(np.float32)

def _preprocess(path, size, label):
    """Load every image file in a directory into one array with constant labels.

    Parameters
    ----------
    path : pathlib.Path
        Directory whose direct children are loaded as images.
    size : tuple of int
        Target ``(width, height)`` forwarded to ``_preproccess_one``.
    label : int
        Value assigned to every sample from this directory.

    Returns
    -------
    x : numpy.ndarray
        float32 array of shape ``(nfiles, height, width, 3)``.
    y : numpy.ndarray
        uint8 array of shape ``(nfiles,)`` filled with ``label``.
    """
    # glob() order depends on the filesystem; sorting keeps the sample order
    # reproducible. Sub-directories cannot be opened as images, so skip them.
    files = sorted(f for f in path.glob('*') if f.is_file())
    nfiles = len(files)
    # PIL sizes are (width, height) while image arrays are laid out as
    # (height, width, channels), so the buffer swaps the two entries.
    width, height = size
    x = np.zeros((nfiles, height, width, 3), dtype=np.float32)
    # '\r' keeps the progress counter on a single, overwritten line.
    print(f"0 / {nfiles}", end='\r')
    for j, f in enumerate(files):
        x[j] = _preproccess_one(f, size=size)
        print(f"{j + 1} / {nfiles}", end='\r')
    print()
    return x, np.full(nfiles, label, dtype=np.uint8)

In [None]:
def save_one_to_hdf5(fp, size):
    """Preprocess both image folders at one size and append them to an HDF5 file.

    Images under ``data/gbm/`` get label 0 and images under ``data/pcnsl/``
    get label 1. Features and labels are written as LZF-compressed datasets
    named ``/<class>/<size joined by '_'>/features`` and ``.../labels``.

    Parameters
    ----------
    fp : str or path-like
        HDF5 file, opened in append mode.
    size : tuple of int
        Target image size forwarded to ``_preprocess``.
    """
    print(f"++ Saving size {size}")
    size_key = "_".join(str(dim) for dim in size)
    gbm_x, gbm_y = _preprocess(Path('data/gbm/'), size=size, label=0)
    pcnsl_x, pcnsl_y = _preprocess(Path('data/pcnsl/'), size=size, label=1)

    # Insertion order of the dict fixes the order the datasets are created in.
    payload = {
        f'/gbm/{size_key}/features': gbm_x,
        f'/gbm/{size_key}/labels': gbm_y,
        f'/pcnsl/{size_key}/features': pcnsl_x,
        f'/pcnsl/{size_key}/labels': pcnsl_y,
    }
    with h5py.File(fp, mode='a') as h5:
        for name, array in payload.items():
            h5.create_dataset(name, data=array, compression='lzf')

# Square target sizes to export; every size is saved into data.h5.
sizes = [(side, side) for side in (224, 300, 380, 600)]

for size in sizes:
    save_one_to_hdf5('data.h5', size=size)

# Load data

In [None]:
import h5py
import numpy as np
import tensorflow as tf

# Short aliases for the Keras API and its layers module.
tfk = tf.keras
tfkl = tfk.layers

def load_data(filename, size="224_224"):
    """Load both classes from an HDF5 file, rescale, shuffle and split them.

    Parameters
    ----------
    filename : str or path-like
        HDF5 file holding ``/gbm/<size>/{features,labels}`` and
        ``/pcnsl/<size>/{features,labels}`` datasets.
    size : str or sequence of int, optional
        Size key of the datasets to read. Either the already-joined key
        (``"224_224"``) or a sequence such as ``(224, 224)``, which is joined
        with underscores.

    Returns
    -------
    tuple
        ``(x_train, y_train), (x_val, y_val), (x_test, y_test)``. Features
        are rescaled with ``x * 2 - 1``; labels are cast to float32.

    Notes
    -----
    Each call reseeds NumPy's global random generator with 42, so the
    shuffle and split are the same on every call for the same data. Every
    sample is assigned to train/val/test independently with probabilities
    0.7/0.2/0.1, so the split sizes are only approximately those fractions.
    """
    # Accept the same tuple form used when saving, as well as the string key.
    s = size if isinstance(size, str) else "_".join(map(str, size))
    with h5py.File(filename, 'r') as f:
        # [:] reads each dataset fully into memory before the file closes.
        x_gbm = f[f'/gbm/{s}/features'][:]
        y_gbm = f[f'/gbm/{s}/labels'][:]
        x_pcnsl = f[f'/pcnsl/{s}/features'][:]
        y_pcnsl = f[f'/pcnsl/{s}/labels'][:]

    # Transform from range [0, 1] to range [-1, 1], in place to avoid copies.
    for features in (x_pcnsl, x_gbm):
        features *= 2.0
        features -= 1.0

    print("gbm features shape", x_gbm.shape)
    print("gbm labels shape", y_gbm.shape)
    print("pcnsl features shape", x_pcnsl.shape)
    print("pcnsl labels shape", y_pcnsl.shape)

    x = np.concatenate((x_gbm, x_pcnsl))
    y = np.concatenate((y_gbm, y_pcnsl))
    y = y.astype(np.float32)

    # Shuffling turns out to be very important: after concatenation all gbm
    # samples precede all pcnsl samples.
    shuffle_inds = np.arange(y.shape[0])
    np.random.seed(42)
    np.random.shuffle(shuffle_inds)
    x = x[shuffle_inds]
    y = y[shuffle_inds]

    # Draw a split id per sample: 0 = train, 1 = validation, 2 = test.
    inds = np.random.choice([0, 1, 2], size=y.size, p=[0.7, 0.2, 0.1])
    x_train, y_train = x[inds == 0], y[inds == 0]
    x_val, y_val = x[inds == 1], y[inds == 1]
    x_test, y_test = x[inds == 2], y[inds == 2]

    return (x_train, y_train), (x_val, y_val), (x_test, y_test)

def get_datasets(filename, size="224_224", batch_size=32):
    """Build repeating, batched ``tf.data`` pipelines for the three splits.

    The training pipeline applies random augmentation to each sample and
    shuffles through a 1000-element buffer that is reshuffled on every
    pass; validation and test data are only batched. All three pipelines
    drop the final partial batch and repeat indefinitely, so each one is
    returned together with the number of full batches in one pass.

    Parameters
    ----------
    filename : str or path-like
        HDF5 file forwarded to ``load_data``.
    size : str, optional
        Size key forwarded to ``load_data``.
    batch_size : int, optional
        Number of samples per batch.

    Returns
    -------
    tuple
        ``(d_train, n_train), (d_val, n_val), (d_test, n_test)``.
    """
    train, val, test = load_data(filename, size=size)

    def augment(image, label):
        # NOTE(review): load_data rescales features to [-1, 1]; TensorFlow
        # documents its RGB->HSV conversion as well defined for values in
        # [0, 1] - confirm random_hue behaves as intended on this range.
        image = tf.image.random_brightness(image, max_delta=2)
        image = tf.image.random_flip_left_right(image)
        image = tf.image.random_flip_up_down(image)
        image = tf.image.random_hue(image, max_delta=0.25)
        return image, label

    def full_batches(split):
        # Number of complete batches in one pass over the split.
        _, labels = split
        return labels.shape[0] // batch_size

    def as_dataset(split):
        features, labels = split
        return tf.data.Dataset.from_tensor_slices((features, labels))

    def batch_forever(dataset):
        return dataset.batch(batch_size, drop_remainder=True).repeat()

    autotune = tf.data.experimental.AUTOTUNE
    d_train = as_dataset(train).map(augment, num_parallel_calls=autotune)
    d_train = d_train.shuffle(1000, reshuffle_each_iteration=True)
    d_train = batch_forever(d_train)

    d_val = batch_forever(as_dataset(val))
    d_test = batch_forever(as_dataset(test))

    return (
        (d_train, full_batches(train)),
        (d_val, full_batches(val)),
        (d_test, full_batches(test)),
    )