## Loading data for efficient training

The generators used previously work fine but are slow compared to other methods, as mentioned by the TensorFlow team [here](https://www.tensorflow.org/tutorials/load_data/images#load_using_tfdata).

Because the goal is to test a lot of different models on the data, speed is an important factor.
For this reason, the TensorFlow `tf.data` Dataset API is explored here.

Please note that the data augmentation previously provided by Keras' `ImageDataGenerator` class is no longer applied; it has to be implemented manually to reproduce the previously seen results.

In [1]:
from os.path import join

from src.training_env import reset_and_populate

# Source folder and target folder handed to the project helper below.
raw, processed = join('data', 'raw'), join('data', 'processed')

# NOTE(review): presumably this clears `processed` and refills it from `raw`,
# with [400, 0, 100] being the train/validation/test sizes - confirm against
# src.training_env. The returned value is displayed as the cell output.
reset_and_populate(raw, processed, [400, 0, 100])

['data\\processed\\train\\n',
 'data\\processed\\validation\\n',
 'data\\processed\\test\\n',
 'data\\processed\\train\\o',
 'data\\processed\\validation\\o',
 'data\\processed\\test\\o',
 'data\\processed\\train\\x',
 'data\\processed\\validation\\x',
 'data\\processed\\test\\x']

In [2]:
import tensorflow as tf
import numpy as np
from os import sep
import pathlib

def create_dataset(data_dir, batch_size=32, shuffle_buffer_size=1000):
    """Build an endlessly repeating, batched iterator over one data split.

    The split is read from ``join(processed, data_dir)`` (``processed`` is the
    module-level path defined in an earlier cell). Every file matching
    ``<split>/*/*`` is decoded as a single-channel PNG, converted to float32
    and resized to 32x32. Its label is a boolean vector marking which class
    folder the file lives in.

    Args:
        data_dir: Name of the split folder below ``processed``
            (e.g. ``'train'`` or ``'test'``).
        batch_size: Number of examples per emitted batch.
        shuffle_buffer_size: Size of the buffer used by ``Dataset.shuffle``.

    Returns:
        An iterator over ``(images, labels)`` batches that never runs out,
        because the dataset is repeated indefinitely; callers must bound it
        with a step count.
    """
    data_dir = pathlib.Path(join(processed, data_dir))

    # Directory listings come back in arbitrary order. Sorting keeps the
    # class -> one-hot column mapping identical for every split, and the
    # is_dir() filter stops stray files (e.g. Thumbs.db) from turning into
    # bogus classes that would widen the label vector.
    labels = np.array(sorted(
        item.name for item in data_dir.glob('*') if item.is_dir()))

    def get_label(file_path):
        # The second-to-last path component is the class folder name; the
        # comparison broadcasts against `labels` to give a boolean one-hot.
        parts = tf.strings.split(file_path, sep)
        return parts[-2] == labels

    def decode_img(img):
        img = tf.image.decode_png(img, channels=1)
        img = tf.image.convert_image_dtype(img, tf.float32)
        return tf.image.resize(img, [32, 32])

    def process_path(file_path):
        label = get_label(file_path)
        img = tf.io.read_file(file_path)
        img = decode_img(img)
        return img, label

    autotune = tf.data.experimental.AUTOTUNE

    ds = (tf.data.Dataset.list_files(str(data_dir/'*/*'))
          .map(process_path, num_parallel_calls=autotune)
          # Cache the decoded images so later passes skip disk I/O and
          # decoding; shuffle after the cache so the order is not frozen.
          .cache()
          .shuffle(shuffle_buffer_size)
          .repeat()
          .batch(batch_size)
          .prefetch(buffer_size=autotune))
    return iter(ds)

# Infinite batch iterators for the two splits (20 and 10 examples per batch).
train_generator = create_dataset('train', batch_size=20)
test_generator = create_dataset('test', batch_size=10)

In [3]:
from tensorflow.keras import layers
from tensorflow.keras import models

# Fully connected network: flatten the 32x32x1 input, two hidden ReLU layers
# of 32 units each, then a 3-way softmax output.
model = models.Sequential([
    layers.Flatten(input_shape=(32, 32, 1)),
    layers.Dense(32, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(3, activation='softmax'),
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 1024)              0         
_________________________________________________________________
dense (Dense)                (None, 32)                32800     
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 99        
Total params: 33,955
Trainable params: 33,955
Non-trainable params: 0
_________________________________________________________________


In [4]:
from tensorflow.keras.optimizers import SGD, RMSprop

# SGD with Nesterov momentum. `learning_rate` replaces the deprecated `lr`
# alias, which newer Keras releases warn about or reject.
optimizer = SGD(learning_rate=0.005, momentum=0.9, nesterov=True)

# Labels are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])

In [5]:
from tensorflow.keras.callbacks import TensorBoard
import numpy as np
from datetime import datetime
from os import mkdir

# Each run logs into its own folder, named after the time the cell was run.
run_stamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
log_dir = join('logs', 'srp01', run_stamp)

tensorboard_cb = TensorBoard(log_dir=log_dir, histogram_freq=1, embeddings_freq=1)
callbacks = [tensorboard_cb]

# The training iterator never ends, so steps_per_epoch sets the epoch length.
history = model.fit_generator(
    generator=train_generator,
    steps_per_epoch=20,
    epochs=20,
    callbacks=callbacks,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [6]:
# The test iterator repeats forever, so `steps` bounds the evaluation:
# 10 batches are drawn. The displayed result is [loss, accuracy].
model.evaluate_generator(generator=test_generator, steps=10)

[0.7228875041007996, 0.7]