In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, Reshape, MaxPooling2D, LSTM, Dense, Flatten, concatenate
from tensorflow.keras.utils import Sequence, to_categorical
from tensorflow.keras import mixed_precision
from keras.layers import Dropout, BatchNormalization
from keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
import os
import time

2023-06-22 14:05:37.373749: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-22 14:05:38.602966: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
# Parameters
height = 64
width = 64
num_classes = 3

# Define the number of batches
train_num_batches = 21
test_num_batches = 7
batch_size = 256
grad_accum_steps = 2  # define your gradient accumulation steps

# Set up mixed precision training
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

In [11]:
# Check GPUs available
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)


4 Physical GPUs, 4 Logical GPUs


In [12]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  4


In [13]:
import psutil

# Gives a human-readable format of bytes
def bytes_to_gb(bytes):
    return bytes / 1024 / 1024 / 1024

total_memory_gb = bytes_to_gb(psutil.virtual_memory().total)

print(f'Total memory: {total_memory_gb} GB')


Total memory: 187.52114486694336 GB


In [14]:
# Custom data generator for batch loading
class DataGenerator(Sequence):
    def __init__(self, batch_num, batch_size, is_train=True):
        self.batch_num = batch_num
        self.batch_size = batch_size
        self.is_train = is_train
        self.total_samples = 0
        self.data_cache = {}
        # Load all data into memory and reshape it
        for file_idx in range(batch_num):
            self._load_data(file_idx)

    def _load_data(self, file_idx):
        if self.is_train:
            batch_data = np.load(f'Datax64/train_data_batch{file_idx+1}.npz')
        else:
            batch_data = np.load(f'Datax64/test_data_batch{file_idx+1}.npz')
        
        X_left = batch_data['X_left_train' if self.is_train else 'X_left_test']
        X_right = batch_data['X_right_train' if self.is_train else 'X_right_test']
        y = batch_data['y_train' if self.is_train else 'y_test']

        # Reshape the data
        X_left = X_left.reshape(X_left.shape[0], X_left.shape[1], X_left.shape[2], 1)
        X_right = X_right.reshape(X_right.shape[0], X_right.shape[1], X_right.shape[2], 1)

        # Convert labels to one-hot encoding
        y = to_categorical(y, num_classes=3)

        # Cache the reshaped data
        self.data_cache[file_idx] = (X_left, X_right, y)
        
        self.total_samples += len(y)
        
    #commented to test another method
    # def __init__(self, batch_num, batch_size, is_train=True):
    #     self.batch_num = batch_num
    #     self.batch_size = batch_size
    #     self.is_train = is_train
    #     self.total_samples = self._compute_total_samples()

    def _compute_total_samples(self):
        total_samples = 0
        for i in range(self.batch_num):
            if self.is_train:
                total_samples += np.load(f'Datax64/train_data_batch{i+1}.npz')['X_left_train'].shape[0]
            else:
                total_samples += np.load(f'Datax64/test_data_batch{i+1}.npz')['X_left_test'].shape[0]
        return total_samples

    def __len__(self):
        return int(np.ceil(self.total_samples / self.batch_size))
        
    def __getitem__(self, idx):
        file_idx = idx // (self.batch_size * self.batch_num)
        inner_idx = idx % (self.batch_size * self.batch_num)
    
        X_left, X_right, y = self.data_cache[file_idx]
    
        min_len = min(len(X_left), len(X_right), len(y))
        X_left = X_left[:min_len]
        X_right = X_right[:min_len]
        y = y[:min_len]
    
        X_left = X_left[inner_idx * self.batch_size : (inner_idx + 1) * self.batch_size]
        X_right = X_right[inner_idx * self.batch_size : (inner_idx + 1) * self.batch_size]
        y = y[inner_idx * self.batch_size : (inner_idx + 1) * self.batch_size]
    
        return [X_left, X_right], y


# Comment for a test
    # def __getitem__(self, idx):
    #     file_idx = idx // (self.batch_size * self.batch_num)
    #     inner_idx = idx % (self.batch_size * self.batch_num)
    
    #     if self.is_train:
    #         batch_data = np.load(f'Datax64/train_data_batch{file_idx+1}.npz')
    #     else:
    #         batch_data = np.load(f'Datax64/test_data_batch{file_idx+1}.npz')
        
    #     X_left_batch = batch_data['X_left_train' if self.is_train else 'X_left_test']
    #     X_right_batch = batch_data['X_right_train' if self.is_train else 'X_right_test']
    #     y_batch = batch_data['y_train' if self.is_train else 'y_test']

    #     min_len = min(len(X_left_batch), len(X_right_batch), len(y_batch))
    #     X_left_batch = X_left_batch[:min_len]
    #     X_right_batch = X_right_batch[:min_len]
    #     y_batch = y_batch[:min_len]
    
    #     X_left_batch = X_left_batch[inner_idx * self.batch_size : (inner_idx + 1) * self.batch_size]
    #     X_right_batch = X_right_batch[inner_idx * self.batch_size : (inner_idx + 1) * self.batch_size]
    #     y_batch = y_batch[inner_idx * self.batch_size : (inner_idx + 1) * self.batch_size]
    
    #     # If your data is not already in the correct shape, do the reshaping here
    #     X_left_batch = X_left_batch.reshape(X_left_batch.shape[0], X_left_batch.shape[1], X_left_batch.shape[2], 1)
    #     X_right_batch = X_right_batch.reshape(X_right_batch.shape[0], X_right_batch.shape[1], X_right_batch.shape[2], 1)
    
    #     y_batch = to_categorical(y_batch, num_classes=3)

    #     # Remove these print statements for production
    #     # print('Batch input shape:', (X_left_batch.shape, X_right_batch.shape))  
    #     # print('Batch label shape:', y_batch.shape)
    
    #     return [X_left_batch, X_right_batch], y_batch

# # Custom data generator for batch loading
# class DataGenerator(Sequence):
#     def __init__(self, batch_num, batch_size, is_train=True):
#         self.batch_num = batch_num
#         self.batch_size = batch_size
#         self.is_train = is_train
        
#     def total_samples(self):
#         total_samples = 0
#         for i in range(self.batch_num):
#             if self.is_train:
#                 total_samples += np.load(f'Datax64/train_data_batch{i+1}.npz')['X_left_train'].shape[0]
#             else:
#                 total_samples += np.load(f'Datax64/test_data_batch{i+1}.npz')['X_left_test'].shape[0]
#         return total_samples


#     def __len__(self):
#         total_samples = 0
#         for i in range(self.batch_num):
#             if self.is_train:
#                 total_samples += np.load(f'Datax64/train_data_batch{i+1}.npz')['X_left_train'].shape[0]
#             else:
#                 total_samples += np.load(f'Datax64/test_data_batch{i+1}.npz')['X_left_test'].shape[0]
#         return int(np.ceil(total_samples / self.batch_size))
#         # if self.is_train:
#         #     total_samples = sum([np.load(f'test/train_data_batch1.npz')['X_left_train'].shape[0] for i in range(self.batch_num)])
#         # else:
#         #     total_samples = sum([np.load(f'test/test_data_batch1.npz')['X_left_test'].shape[0] for i in range(self.batch_num)])
#         # return int(np.ceil(total_samples / self.batch_size))

# # def __len__(self):
# #     if self.is_train:
# #         total_samples = sum([np.load(f'test/train_data_batch{i+1}.npz')['X_left_train'].shape[0] for i in range(self.batch_num)])
# #     else:
# #         total_samples = sum([np.load(f'test/test_data_batch{i+1}.npz')['X_left_test'].shape[0] for i in range(self.batch_num)])
# #     return int(np.ceil(total_samples / self.batch_size))

#     def __getitem__(self, idx):
#         # Calculate file_idx based on idx
#         file_idx = idx // (self.batch_size * self.batch_num)
#         # Calculate the starting point of the batch within the file
#         inner_idx = idx % (self.batch_size * self.batch_num)
    
#         # Load the correct file
#         if self.is_train:
#             batch_data = np.load(f'Datax64/train_data_batch{file_idx+1}.npz')
#             X_left_batch = batch_data['X_left_train']
#             X_right_batch = batch_data['X_right_train']
#             y_batch = batch_data['y_train']
#         else:
#             batch_data = np.load(f'Datax64/test_data_batch{file_idx+1}.npz')
#             X_left_batch = batch_data['X_left_test']
#             X_right_batch = batch_data['X_right_test']
#             y_batch = batch_data['y_test']
    
#         # Check if batches and labels are equal
#         if len(X_left_batch) != len(X_right_batch) or len(X_left_batch) != len(y_batch):
#             min_len = min(len(X_left_batch), len(X_right_batch), len(y_batch))
    
#             # Trim the batches and labels to be the same size
#             X_left_batch = X_left_batch[:min_len]
#             X_right_batch = X_right_batch[:min_len]
#             y_batch = y_batch[:min_len]
    
#         # Select the correct samples within this file  
#         X_left_batch = X_left_batch[inner_idx * self.batch_size : (inner_idx + 1) * self.batch_size]
#         X_right_batch = X_right_batch[inner_idx * self.batch_size : (inner_idx + 1) * self.batch_size]
#         y_batch = y_batch[inner_idx * self.batch_size : (inner_idx + 1) * self.batch_size]


#     # def __getitem__(self, idx):
#     #     for i in range(self.batch_num):
#     #         if self.is_train:
#     #             batch_data = np.load(f'Datax64/train_data_batch{i+1}.npz')
#     #             X_left_batch = batch_data['X_left_train']
#     #             X_right_batch = batch_data['X_right_train']
#     #             y_batch = batch_data['y_train']
#     #         else:
#     #             batch_data = np.load(f'Datax64/test_data_batch{i+1}.npz')
#     #             X_left_batch = batch_data['X_left_test']
#     #             X_right_batch = batch_data['X_right_test']
#     #             y_batch = batch_data['y_test']

#     #     # if self.is_train:
#     #     #     #{idx+1}
#     #     #     batch_data = np.load(f'test/train_data_batch1.npz')
#     #     #     X_left_batch = batch_data['X_left_train']
#     #     #     X_right_batch = batch_data['X_right_train']
#     #     #     y_batch = batch_data['y_train']
    
#     #     # else:
#     #     #     batch_data = np.load(f'test/test_data_batch1.npz')
#     #     #     X_left_batch = batch_data['X_left_test']
#     #     #     X_right_batch = batch_data['X_right_test']
#     #     #     y_batch = batch_data['y_test']
    
#     #      # Check if batches and labels are equal
#     #      if len(X_left_batch) != len(X_right_batch) or len(X_left_batch) != len(y_batch):
#     #          min_len = min(len(X_left_batch), len(X_right_batch), len(y_batch))
    
#     #          # Trim the batches and labels to be the same size
#     #          X_left_batch = X_left_batch[:min_len]
#     #          X_right_batch = X_right_batch[:min_len]
#     #          y_batch = y_batch[:min_len]
            
#     #     # Select the correct samples within this file  
#     #     X_left_batch = X_left_batch[idx * self.batch_size : (idx + 1) * self.batch_size]
#     #     X_right_batch = X_right_batch[idx * self.batch_size : (idx + 1) * self.batch_size]
#     #     y_batch = y_batch[idx * self.batch_size : (idx + 1) * self.batch_size]
#     #     print('Pre-reshape Batch label shape:', y_batch.shape)



#         # print(X_left_batch.shape)
    
#         # Reshape the input data
#         # Reshape the input data
#         X_left_batch = X_left_batch.reshape(X_left_batch.shape[0], X_left_batch.shape[1], X_left_batch.shape[2], 1)
#         X_right_batch = X_right_batch.reshape(X_right_batch.shape[0], X_right_batch.shape[1], X_right_batch.shape[2], 1)
    

    
#         y_batch = to_categorical(y_batch, num_classes=3)
    
#         print('Batch input shape:', (X_left_batch.shape, X_right_batch.shape))  # Corrected this line
#         print('Batch label shape:', y_batch.shape)
    
#         return [X_left_batch, X_right_batch], y_batch

            
#         #  # Reshape the input data
#         # X_left_batch = X_left_batch.reshape(X_left_batch.shape[0], X_left_batch.shape[1], X_left_batch.shape[2], 1)
#         # X_right_batch = X_right_batch.reshape(X_right_batch.shape[0], X_right_batch.shape[1], X_right_batch.shape[2], 1)

#         # # Add these lines to print out the shapes of your batches
#         # print("Shape of X_left_batch: ", X_left_batch.shape)
#         # print("Shape of X_right_batch: ", X_right_batch.shape)

#         # y_batch = to_categorical(y_batch, num_classes=3)

#         # return [X_left_batch, X_right_batch], y_batch

#         # # Reshape the input data
#         # X_left_batch = X_left_batch.reshape(X_left_batch.shape[0], X_left_batch.shape[1], X_left_batch.shape[2], 1)
#         # X_right_batch = X_right_batch.reshape(X_right_batch.shape[0], X_right_batch.shape[1], X_right_batch.shape[2], 1)
#         # y_batch = to_categorical(y_batch, num_classes=3)

#         # return [X_left_batch, X_right_batch], y_batch

In [15]:
# Define the data generators for train and test data
train_data_generator = DataGenerator(train_num_batches, batch_size, is_train=True)
#test_data_generator = DataGenerator(test_num_batches, batch_size, is_train=False)

# train_steps = len(train_data_generator)
# test_steps = len(test_data_generator)

# # Then, you can use these in your fit function
# history = model.fit(train_data_generator, 
#                     steps_per_epoch=train_steps, 
#                     validation_data=test_data_generator, 
#                     validation_steps=test_steps, 
#                     epochs=your_number_of_epochs)


KeyboardInterrupt: 

In [None]:
# Initialize MirroredStrategy.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Function to create CRNN model 
    def create_model():
        model = Sequential()
        # Convolutional layers
        model.add(Conv2D(32, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(0.001), input_shape=(64, 64, 1)))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.25))
        model.add(Conv2D(64, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(0.001)))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.25))
        # Flatten the output of convolutional layers
        model.add(Flatten())
        # Reshape the output to match LSTM input
        model.add(Reshape((model.output_shape[1], 1)))
        # Recurrent layer
        model.add(LSTM(64, return_sequences=False))
        model.add(Dropout(0.5))
        model.add(Dense(3, activation='softmax'))
        return model
        
    # Define the CRNN model for left and right eye
    left_model = create_model()
    right_model = create_model()

    # Concatenate the outputs of left and right eye models
    concatenated = concatenate([left_model.output, right_model.output])

    # Fully connected layers
    merged_model = Dense(128, activation='relu')(concatenated)
    merged_model = Dense(num_classes, activation='softmax', dtype='float32')(merged_model) # Ensure the output layer is float32

    # Create the final model
    model = Model(inputs=[left_model.input, right_model.input], outputs=merged_model)

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    optimizer = mixed_precision.LossScaleOptimizer(optimizer) # Wrap the optimizer with the LossScaleOptimizer

    loss_fn = tf.keras.losses.CategoricalCrossentropy(reduction=tf.keras.losses.Reduction.SUM)

    # Define metrics within the strategy scope
    train_acc_metric = tf.keras.metrics.CategoricalAccuracy()
    test_acc_metric = tf.keras.metrics.CategoricalAccuracy()
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=[train_acc_metric])

In [None]:
# Define evaluation metrics
test_acc_metric = tf.keras.metrics.CategoricalAccuracy()
test_loss_metric = tf.keras.metrics.Mean()

@tf.function
def test_step(x, y):
    val_logits = model(x, training=False)
    val_loss = loss_fn(y, val_logits)
    return val_loss  # returns the loss
    
@tf.function
def distributed_test_step(dataset_inputs):
    x, y = dataset_inputs
    per_replica_losses = strategy.run(test_step, args=(x, y))
    test_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
    return test_loss

EPOCHS = 5
@tf.function
def distributed_train_step(dataset_inputs):
    per_replica_losses = strategy.run(train_step, args=(dataset_inputs,))
    return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

@tf.function
def train_step(inputs):
    x, y = inputs
    with tf.GradientTape() as tape:
        try:
            logits = model([x[0], x[1]], training=True)
            # tf.print("Model forward pass successful.")
            # tf.print("Logits shape:", tf.shape(logits))
            # tf.print("Sample logits:", logits[0])
            loss_value = loss_fn(y, logits)
            # tf.print("Loss calculation successful.")
            tf.print("Loss value:", loss_value)
        except Exception as e:
            tf.print("Error during model forward pass or loss calculation:", e)
            logits = None
            loss_value = None
        if loss_value is not None:
            scaled_loss = optimizer.get_scaled_loss(loss_value)
            scaled_grads = tape.gradient(scaled_loss, model.trainable_weights)
            try:
                grads = optimizer.get_unscaled_gradients(scaled_grads)
                # tf.print("Gradient calculation successful.")
            except Exception as e:
                tf.print("Error during gradient calculation:", e)
                grads = None
        else:
            grads = None
        if grads is not None:
            optimizer.apply_gradients(zip(grads, model.trainable_weights))
            train_acc_metric.update_state(y, logits)
            return loss_value
        else:
            tf.print("Skipped gradient update due to error.")
            return tf.constant(0.0)

# def train_step(inputs):
#     x, y = inputs
#     with tf.GradientTape() as tape:
#         logits = model([x[0], x[1]], training=True)
#         loss_value = loss_fn(y, logits)
#         tf.print("Logits shape:", tf.shape(logits))
#         tf.print("Sample logits:", logits[0])
#         tf.print("Loss value:", loss_value)

#     scaled_loss = optimizer.get_scaled_loss(loss_value)
#     scaled_grads = tape.gradient(scaled_loss, model.trainable_weights)
#     grads = optimizer.get_unscaled_gradients(scaled_grads)

#     # print shapes and some values
#     print(f"x shapes: {[i.shape for i in x]}")
#     print(f"x[0] sample values: {x[0][0, :2, :2, 0]}")  # print a small part of x[0]
#     print(f"y shape: {y.shape}, y sample values: {y[:5]}")
#     print(f"logits shape: {logits.shape}, logits sample values: {logits[:5]}")
#     if grads is not None:
#         print(f"grads shapes: {[i.shape for i in grads]}")
#         print(f"grads sample values: {grads[0][0, :2, :2, 0]}")  # print a small part of the first gradient
#     else:
#         print("No gradients")

#     optimizer.apply_gradients(zip(grads, model.trainable_weights))
#     train_acc_metric.update_state(y, logits)
#     return loss_value


#import matplotlib.pyplot as plt

# Function to display a pair of left and right eye images
# def display_sample_images(left_images, right_images, image_index=0):
#     plt.figure(figsize=(10, 5))

#     plt.subplot(1, 2, 1)
#     plt.imshow(left_images[image_index].reshape(height, width), cmap='gray')
#     plt.title('Left Eye')

#     plt.subplot(1, 2, 2)
#     plt.imshow(right_images[image_index].reshape(height, width), cmap='gray')
#     plt.title('Right Eye')

#     plt.show()

#Early stopping
early_stopping = EarlyStopping(
    monitor='val_loss', 
    patience=2, 
    verbose=1,
    restore_best_weights=True)

#Checkpoint object

checkpoint_dir = (f'./Checkpoints')
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)

checkpoint.restore(manager.latest_checkpoint)
if manager.latest_checkpoint:
    print("Restored from {}".format(manager.latest_checkpoint))
else:
    print("Initializing from scratch.")

# Training Loop
for epoch in range(EPOCHS):
    print("\nStart of epoch %d" % (epoch,))
    #total_samples = train_data_generator._compute_total_samples()
    
    # Display a pair of sample images at the start of each epoch
    x_batch_sample, _ = next(iter(train_data_generator))
    left_eye_images, right_eye_images = x_batch_sample
    #display_sample_images(left_eye_images, right_eye_images)

    # Iterate over the batches of the dataset.
    for step, (x_batch_train, y_batch_train) in enumerate(train_data_generator):
        start_time = time.time()
        # print("Shape of batch inputs:", x_batch_train[0].shape, x_batch_train[1].shape)  # Assuming x_batch_train is a tuple containing left and right eye images
        # print("Shape of batch labels:", y_batch_train.shape)
        #model.summary()
        print("\nStart of step %d" % (step,))
        # print("Training data shape:", [i.shape for i in x_batch_train])
        # print("Label data shape:", y_batch_train.shape)
        loss_value = distributed_train_step((x_batch_train, y_batch_train))

        # print("Training loss (for one batch) at step %d: %.4f" % (step, float(loss_value)))
        print("Seen so far: %d samples out of 3479580 total samples" % ((step + 1) * batch_size))
        end_time = time.time()
        step_time = end_time - start_time
        steps_per_hour = 3600 / step_time
        print(f'Steps time{step_time}')
        print(f'Steps per hour{steps_per_hour}')
        if (step + 1) % 50 == 0:
            manager.save()


    # Display metrics at the end of each epoch.
    train_acc = train_acc_metric.result()
    print("Training acc over epoch: %.4f" % (float(train_acc),))

    # Start of evaluation
    for x_batch_val, y_batch_val in test_data_generator:
        loss_value = distributed_test_step((x_batch_val, y_batch_val))
        test_acc_metric.update_state(y_batch_val, model(x_batch_val, training=False))
        test_loss_metric.update_state(loss_value)

    val_acc = test_acc_metric.result()
    val_loss = test_loss_metric.result()
    print("Validation acc: %.4f" % (float(val_acc),))
    print("Validation loss: %.4f" % (float(val_loss),))

    test_acc_metric.reset_states()
    test_loss_metric.reset_states()

    model.save('my_model.h5')

    # if (epoch + 1) % 2 == 0:
    # if (epoch + 1) % 2 == 0:
    #     manager.save()
    #     print('Model Saved.')
        
    current_val_loss = float(val_loss)
    if current_val_loss < best_val_loss:
        best_val_loss = current_val_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("Early stopping")
            break  # break the training loop

    # Reset training metrics at the end of each epoch
    train_acc_metric.reset_states()

    

# # Test Loop
# for x_batch_val, y_batch_val in test_data_generator:
#     val_logits, val_loss = test_step(x_batch_val, y_batch_val)
#     test_acc_metric.update_state(y_batch_val, val_logits)
#     test_loss_metric.update_state(val_loss)

# val_acc = test_acc_metric.result()
# val_loss = test_loss_metric.result()
# print(f"Validation acc: {float(val_acc)}")
# print(f"Validation loss: {float(val_loss)}")


# # Training Loop
# for epoch in range(EPOCHS):
#     print("\nStart of epoch %d" % (epoch,))

#     # Iterate over the batches of the dataset.
#     for step, (x_batch_train, y_batch_train) in enumerate(train_data_generator):
#         print("\nStart of step %d" % (step,))
#         loss_value = distributed_train_step((x_batch_train, y_batch_train))

#         print("Training loss (for one batch) at step %d: %.4f" % (step, float(loss_value)))
#         print("Seen so far: %d samples" % ((step + 1) * batch_size))

#     # Display metrics at the end of each epoch.
#     train_acc = train_acc_metric.result()
#     print("Training acc over epoch: %.4f" % (float(train_acc),))

#     # Reset training metrics at the end of each epoch
#     train_acc_metric.reset_states()

#     # Start of evaluation
#     for x_batch_val, y_batch_val in test_data_generator:
#         loss_value = distributed_test_step((x_batch_val, y_batch_val))
#         test_acc_metric.update_state(y_batch_val, model((x_batch_val), training=False))
#         test_loss_metric.update_state(loss_value)

#     val_acc = test_acc_metric.result()
#     val_loss = test_loss_metric.result()
#     print("Validation acc: %.4f" % (float(val_acc),))
#     print("Validation loss: %.4f" % (float(val_loss),))

#     test_acc_metric.resfet_states()
#     test_loss_metric.reset_states()

# # Test Loop
# for x_batch_val, y_batch_val in test_data_generator:
#     val_logits, val_loss = test_step(x_batch_val, y_batch_val)
#     test_acc_metric.update_state(y_batch_val, val_logits)
#     test_loss_metric.update_state(val_loss)

# val_acc = test_acc_metric.result()
# val_loss = test_loss_metric.result()
# print(f"Validation acc: {float(val_acc)}")
# print(f"Validation loss: {float(val_loss)}")

In [None]:
# assuming your checkpoint directory is ./checkpoints/ and the prefix of the saved checkpoints is 'ckpt'
ckpt = tf.train.Checkpoint(optimizer=optimizer, model=model)
manager = tf.train.CheckpointManager(ckpt, './checkpoints/', max_to_keep=3)

# load the latest checkpoint
ckpt.restore(manager.latest_checkpoint)

if manager.latest_checkpoint:
    print("Restored from {}".format(manager.latest_checkpoint))
else:
    print("Initializing from scratch.")

# Start of evaluation
for x_batch_val, y_batch_val in test_data_generator:
    loss_value = distributed_test_step((x_batch_val, y_batch_val))
    test_acc_metric.update_state(y_batch_val, model(x_batch_val, training=False))
    test_loss_metric.update_state(loss_value)

val_acc = test_acc_metric.result()
val_loss = test_loss_metric.result()
print("Validation acc: %.4f" % (float(val_acc),))
print("Validation loss: %.4f" % (float(val_loss),))


In [None]:
# Test Loop
for x_batch_val, y_batch_val in test_data_generator:
    val_logits, val_loss = test_step(x_batch_val, y_batch_val)
    test_acc_metric.update_state(y_batch_val, val_logits)
    test_loss_metric.update_state(val_loss)

val_acc = test_acc_metric.result()
val_loss = test_loss_metric.result()
print(f"Validation acc: {float(val_acc)}")
print(f"Validation loss: {float(val_loss)}")

In [None]:
checkpoint.restore(manager.latest_checkpoint)
if manager.latest_checkpoint:
    print("Restored from {}".format(manager.latest_checkpoint))
else:
    print("Initializing from scratch.")


In [10]:
print(model.trainable_variables)

[<AutoCastDistributedVariable dtype=float32 dtype_to_cast_to=float32 inner_variable=MirroredVariable:{
  0: <tf.Variable 'conv2d/kernel:0' shape=(3, 3, 1, 32) dtype=float32, numpy=
array([[[[ 0.13231905,  0.10117723, -0.04830281,  0.05489857,
           0.00032724, -0.09253888,  0.00038309,  0.10465163,
          -0.05856336,  0.1101483 ,  0.04518037, -0.05241653,
           0.11996345, -0.0124535 , -0.05528188,  0.08399296,
          -0.03343274, -0.11645409, -0.09766854,  0.08100729,
           0.0026951 ,  0.05847576,  0.09785855, -0.10890094,
           0.06721714, -0.04285773, -0.10241389, -0.07816841,
          -0.12911044, -0.07484341,  0.1161841 ,  0.10462303]],

        [[ 0.08126086, -0.10559137,  0.11533828,  0.09404066,
          -0.09009574,  0.05213897, -0.13767493,  0.12251504,
          -0.03003795,  0.10996027,  0.00657855,  0.07582499,
           0.0222975 ,  0.02763851, -0.11449368,  0.03348736,
          -0.0924139 ,  0.13020499,  0.03555311,  0.00954044,
          

In [11]:
x_batch_sample, y_batch_sample = next(iter(train_data_generator))
with tf.GradientTape() as tape:
    logits = model(x_batch_sample, training=True)
    loss_value = loss_fn(y_batch_sample, logits)

scaled_loss = optimizer.get_scaled_loss(loss_value)
scaled_grads = tape.gradient(scaled_loss, model.trainable_weights)
grads = optimizer.get_unscaled_gradients(scaled_grads)

for grad, var in zip(grads, model.trainable_variables):
    print(f'Variable: {var.name}, Gradient: {grad}')

2023-06-08 21:47:12.755584: W tensorflow/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory trying to allocate 10.13GiB (rounded to 10877272064)requested by op Cast
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2023-06-08 21:47:12.758961: I tensorflow/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for GPU_0_bfc
2023-06-08 21:47:12.759099: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 30, Chunks in use: 30. 7.5KiB allocated for chunks. 7.5KiB in use in bin. 884B client-requested in use in bin.
2023-06-08 21:47:12.759188: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (512): 	Total Chunks: 3, Chunks in use: 1. 1.8KiB allocated for chunks. 512B in use in bin. 512B client-requested in use in bin.
2023-06-08 21:47:12.759330: I tensorflow/tsl/framework/bfc_allocator

ResourceExhaustedError: Exception encountered when calling layer 'model' (type Functional).

{{function_node __wrapped__Cast_device_/job:localhost/replica:0/task:0/device:GPU:0}} failed to allocate memory [Op:Cast]

Call arguments received by layer 'model' (type Functional):
  • inputs=['tf.Tensor(shape=(165974, 128, 128, 1), dtype=float16)', 'tf.Tensor(shape=(165974, 128, 128, 1), dtype=float16)']
  • training=True
  • mask=None

In [11]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 conv2d_input (InputLayer)      [(None, 64, 64, 1)]  0           []                               
                                                                                                  
 conv2d_2_input (InputLayer)    [(None, 64, 64, 1)]  0           []                               
                                                                                                  
 conv2d (Conv2D)                (None, 64, 64, 32)   320         ['conv2d_input[0][0]']           
                                                                                                  
 conv2d_2 (Conv2D)              (None, 64, 64, 32)   320         ['conv2d_2_input[0][0]']         
                                                                                              

In [13]:
data_gen = DataGenerator(batch_num=10, batch_size=10, is_train=True)
sample_batch = next(iter(data_gen))
print("Input data shape:", [i.shape for i in sample_batch[0]])
print("Label data shape:", sample_batch[1].shape)


Pre-reshape Batch label shape: (10, 1)
(10, 64, 64)
Batch input shape: ((10, 64, 64, 1), (10, 64, 64, 1))
Batch label shape: (10, 3)
Input data shape: [(10, 64, 64, 1), (10, 64, 64, 1)]
Label data shape: (10, 3)


In [14]:
print("Sample labels:", sample_batch[1][:5])

Sample labels: [[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]
