https://biswajitsahoo1111.github.io/post/efficiently-reading-multiple-files-in-tensorflow-2/

https://biswajitsahoo1111.github.io/post/reading-multiple-files-in-tensorflow-2-using-sequence/

https://biswajitsahoo1111.github.io/post/doing-linear-algebra-using-tensorflow-2/


In [1]:
import numpy as np
import tensorflow as tf
print("Tensorflow Version: ", tf.__version__)


Tensorflow Version:  2.5.0


In [2]:
data = np.random.randint(100, 150, size=(10, 2, 2))
labels = np.random.permutation(10)


In [7]:
def my_generator(data, labels, batch_size=2):
    i = 0
    while True:
        if i * batch_size >= len(labels):
            i = 0
            idx = np.random.permutation(len(labels))
            data, labels = data[idx], labels[idx]
            continue
        else:
            X = data[i * batch_size:(i + 1) * batch_size, :]
            y = labels[i * batch_size:(i + 1) * batch_size]
            i += 1
            yield X, y


In [9]:
get_data = my_generator(data, labels)
for i in range(10):
    X, y = next(get_data)
    print(X.shape, y.shape)


(2, 2, 2) (2,)
(2, 2, 2) (2,)
(2, 2, 2) (2,)
(2, 2, 2) (2,)
(2, 2, 2) (2,)
(2, 2, 2) (2,)
(2, 2, 2) (2,)
(2, 2, 2) (2,)
(2, 2, 2) (2,)
(2, 2, 2) (2,)


In the above generator code, we manually shuffled the data between epochs. But in TensorFlow we can use Sequence class to do this for us automatically. The added advantage of using this class is that we can use multiprocessing capabilities. So the new generator code becomes:


In [10]:
from tensorflow.keras.utils import Sequence


In [11]:
class tf_my_generator(Sequence):
    def __init__(self, data, labels, batch_size=2):
        self.x, self.y = data, labels
        self.batch_size = batch_size
        self.indices = np.arange(self.x.shape[0])

    def __len__(self):
        return tf.math.floor(self.x.shape[0] / self.batch_size)

    def __getitem__(self, idx):
        inds = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_x = self.x[inds]
        batch_y = self.y[inds]
        return batch_x, batch_y

    def on_epoch_end(self):
        np.random.shuffle(self.indices)


In [12]:
get_new_data = tf_my_generator(data, labels)
for i in range(10):
    if i == 5:
        get_new_data.on_epoch_end()
        i = 0
    elif i > 5:
        i = i-5
    X, y = get_new_data.__getitem__(i)
    print(X.shape, y.shape)


(2, 2, 2) (2,)
(2, 2, 2) (2,)
(2, 2, 2) (2,)
(2, 2, 2) (2,)
(2, 2, 2) (2,)
(2, 2, 2) (2,)
(2, 2, 2) (2,)
(2, 2, 2) (2,)
(2, 2, 2) (2,)
(2, 2, 2) (2,)


Both the generators work fine. Now we will use it to implement a CNN model on MNIST data. Note that this example is bit stretched and strange. We don't need generators to implement small data sets like MNIST. Whole of MNIST can be loaded into RAM. By this example the aim is just to show a different way of implementing it using generators. Of course the codes can be modified to handle cases where we indeed need generators to do analysis.


In [22]:
(train_data, train_labels), (test_data,
                             test_labels) = tf.keras.datasets.mnist.load_data()
train_data = train_data.reshape(60000, 28, 28, 1)/255.
id = np.random.permutation(len(train_labels))
training_data, training_labels = train_data[id[0:48000]
                                            ], train_labels[id[0:48000]]
val_data, val_labels = train_data[id[48000:60000]
                                  ], train_labels[id[48000:60000]]


In [17]:
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense
from tensorflow.keras.models import Sequential


In [18]:
model = Sequential([
    Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
    MaxPool2D(2),
    Conv2D(64, 5, activation='relu'),
    MaxPool2D(2),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(10, activation='sigmoid')
])
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])


In [19]:
# Keras requires the generator to run indefinitely
class data_gen(Sequence):
    def __init__(self, data, labels, batch_size=128):
        self.x, self.y = data, labels
        self.batch_size = batch_size
        self.indices = np.arange(self.x.shape[0])

    def __len__(self):
        return int(tf.math.ceil(self.x.shape[0] / self.batch_size))

    def __getitem__(self, idx):
        inds = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_x = self.x[inds]
        batch_y = self.y[inds]
        return batch_x, tf.keras.utils.to_categorical(batch_y)

    def on_epoch_end(self):
        np.random.shuffle(self.indices)


In [20]:
train_gen = data_gen(train_data, train_labels, batch_size=128)
val_gen = data_gen(val_data, val_labels, batch_size=128)
batch_size = 128
steps_per_epoch = np.floor(len(train_labels)/batch_size)
val_steps = np.floor(len(val_labels)/batch_size)


In [21]:
model.fit(train_gen, steps_per_epoch=steps_per_epoch, epochs=10,
          validation_data=val_gen, validation_steps=val_steps)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1bed8a2ab08>

In [23]:
test_loss, test_accuracy = model.evaluate(test_data.reshape(
    10000, 28, 28, 1)/255., tf.keras.utils.to_categorical(test_labels), verbose=2)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)


313/313 - 2s - loss: 0.0329 - accuracy: 0.9898
Test Loss: 0.03287464752793312
Test Accuracy: 0.989799976348877
