## Preparation and Downloads

### Loading Libraries and Packages

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random
import pickle

import tensorflow as tf

from tensorflow.keras import layers, models
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Optimizer

from tensorflow.keras.metrics import Mean, CategoricalAccuracy
from tensorflow.keras.losses import CategoricalCrossentropy

from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

from tqdm import tqdm
import os

In [2]:
#!pip install tensorboard

In [3]:
%load_ext tensorboard

In [4]:
from tensorflow.keras.callbacks import TensorBoard

In [5]:
# gpu_devices = tf.config.list_physical_devices('GPU')

# if gpu_devices:
#     for gpu in gpu_devices:
#         tf.config.experimental.set_memory_growth(gpu, True)

### Ensuring Reproducibility of the Code

In [6]:
from tensorflow.python.eager import context


def set_reprodicibility(seed=42):
    """Seed every random number generator this notebook draws from.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    TensorFlow's global generator with the same value, then restricts
    TensorFlow to a single inter-op thread.

    Args:
        seed: integer seed applied to all three generators.
    """
    # `accept_update` draws its Metropolis random number from `random.random()`,
    # so the standard-library generator has to be seeded as well.
    random.seed(seed)

    # Set random seed for NumPy (used by `pick_lr`)
    np.random.seed(seed)

    # Set random seed for TensorFlow
    tf.random.set_seed(seed)

    # A single inter-op thread removes scheduling-order nondeterminism between ops.
    tf.config.threading.set_inter_op_parallelism_threads(1)
    # NOTE(review): presumably creating a variable forces the TF runtime to
    # initialise right away, so the threading setting above is locked in - confirm.
    _ = tf.Variable([1])


In [7]:
# Apply the default seed (42) before any data pipeline or model is built.
set_reprodicibility()

### Loading CIFAR-10 Data




In [8]:
# Load and preprocess the CIFAR-10 dataset.
# The second split returned by `cifar10.load_data()` is used as the validation set here.
(train_images, train_labels), (val_images, val_labels) = cifar10.load_data()

# Scale pixels to [0, 1] in float32. Dividing the raw integer arrays by 255.0
# directly would silently promote them to float64 and double the memory footprint.
train_images = train_images.astype("float32") / 255.0
val_images = val_images.astype("float32") / 255.0

# One-hot encode the labels; the class count is pinned so both splits always
# get the same width regardless of which labels happen to be present.
train_labels = to_categorical(train_labels, num_classes=10)
val_labels = to_categorical(val_labels, num_classes=10)

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


In [9]:
# Wrap the NumPy arrays in batched tf.data pipelines.
# NOTE(review): the training pipeline is not shuffled, so every epoch sees the
# same mini-batches in the same order - confirm this is intended.
minibatch_size = 256   # 512 in the paper

train_dataset, val_dataset = (
    tf.data.Dataset.from_tensor_slices(split).batch(minibatch_size)
    for split in ((train_images, train_labels), (val_images, val_labels))
)

### Defining Model Architectures

The suitability of a NN architecture for a particular task, such as CIFAR-10 classification, can depend on various factors including the complexity of the dataset, computational resources available, and the specific requirements of the task.

For CIFAR-10, which consists of small (32x32) color images across 10 classes, the dataset is relatively simple compared to larger datasets like ImageNet. Therefore, simpler architectures can often achieve good performance. VGG-style architectures have been widely used and studied for image classification tasks, including CIFAR-10.

Researchers often experiment with variations of architectures to find the best-performing model. Some commonly used architectures for CIFAR-10 classification include:

- **VGG-like architectures**: Configurations similar to VGG with variations in the number of layers, filter sizes, and depths.
- **ResNet**: Residual Networks, which introduce skip connections to mitigate vanishing gradient problems, have been shown effective for CIFAR-10.
- **DenseNet**: Dense Convolutional Networks connect each layer to every other layer in a feed-forward fashion, promoting feature reuse and denser gradients.

These architectures are usually tuned by experimenting with hyperparameters such as the learning rate, batch size, and optimizer to achieve the best performance. Researchers might also explore techniques like data augmentation, regularization, and learning rate scheduling to improve model performance.

Ultimately, the choice of architecture and configuration often involves experimentation to find the best-performing model for a specific dataset and task.

#### VGG16

In [10]:
class VGGModel:
    """VGG16-style network wrapped around a Keras ``Sequential`` model.

    Five convolutional blocks (2, 2, 3, 3 and 3 conv layers with 64, 128, 256,
    512 and 512 filters), each closed by 2x2 max pooling, followed by two
    4096-unit dense layers and a softmax classifier.

    Attributes are documented inline in ``__init__``.
    """

    # (filters, number of 3x3 conv layers) for each convolutional block
    _BLOCKS = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

    def __init__(self, input_shape=(32, 32, 3), num_classes=10):
        self.input_shape = input_shape    # shape of one input image
        self.num_classes = num_classes    # width of the softmax output
        self.model = self.build_model()   # the underlying Keras model

    def build_model(self):
        """Assemble and return the (uncompiled) Keras model."""
        model = models.Sequential()

        is_first_layer = True
        for filters, conv_count in self._BLOCKS:
            for _ in range(conv_count):
                # Only the very first layer declares the input shape.
                extra = {'input_shape': self.input_shape} if is_first_layer else {}
                model.add(layers.Conv2D(filters, (3, 3), activation='relu', padding='same', **extra))
                is_first_layer = False
            model.add(layers.MaxPooling2D((2, 2), strides=(2, 2)))

        # Flatten
        model.add(layers.Flatten())

        # Dense layers
        model.add(layers.Dense(4096, activation='relu'))
        model.add(layers.Dense(4096, activation='relu'))
        model.add(layers.Dense(self.num_classes, activation='softmax'))

        return model

    def compile_model(self, optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy']):
        """Compile the wrapped model with the given optimizer, loss and metrics."""
        self.model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    def summary(self):
        """Print the Keras layer summary."""
        self.model.summary()

    def save(self, directory, name):
        """Save the model as ``<directory>/<name>.keras``.

        Args:
            directory: target folder; created if missing. A trailing path
                separator is optional.
            name: file stem; also written to the model's ``_name`` attribute.
        """
        self.model._name = name
        if directory:
            os.makedirs(directory, exist_ok=True)
        # os.path.join instead of `directory+name`, which silently writes to a
        # sibling path when `directory` lacks its trailing separator.
        self.model.save(os.path.join(directory, name + '.keras'))

#### Simplified VGG Version

The following simplified VGG model has three convolutional blocks with decreasing spatial dimensions, followed by two fully connected layers. It should be suitable for training on the CIFAR-10 dataset.

In [11]:
class SimpleVGG:
    """Reduced VGG-style network wrapped around a Keras ``Sequential`` model.

    Three blocks of two 3x3 convolutions (64, 128 and 256 filters), each closed
    by 2x2 max pooling, followed by a 512-unit dense layer and a softmax
    classifier.
    """

    def __init__(self, input_shape=(32, 32, 3), num_classes=10):
        self.input_shape = input_shape    # shape of one input image
        self.num_classes = num_classes    # width of the softmax output
        self.model = self.build_model()   # the underlying Keras model

    def build_model(self):
        """Assemble and return the (uncompiled) Keras model."""
        model = Sequential([
            # Block 1
            Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=self.input_shape),
            Conv2D(64, (3, 3), activation='relu', padding='same'),
            MaxPooling2D((2, 2)),

            # Block 2
            Conv2D(128, (3, 3), activation='relu', padding='same'),
            Conv2D(128, (3, 3), activation='relu', padding='same'),
            MaxPooling2D((2, 2)),

            # Block 3
            Conv2D(256, (3, 3), activation='relu', padding='same'),
            Conv2D(256, (3, 3), activation='relu', padding='same'),
            MaxPooling2D((2, 2)),

            # Classifier head
            Flatten(),
            Dense(512, activation='relu'),
            Dense(self.num_classes, activation='softmax')
        ])
        return model

    def compile_model(self, optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy']):
        """Compile the wrapped model with the given optimizer, loss and metrics."""
        self.model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    def summary(self):
        """Print the Keras layer summary."""
        self.model.summary()

    def save(self, directory, name):
        """Save the model as ``<directory>/<name>.keras``.

        Args:
            directory: target folder; created if missing. A trailing path
                separator is optional.
            name: file stem; also written to the model's ``_name`` attribute.
        """
        self.model._name = name
        if directory:
            os.makedirs(directory, exist_ok=True)
        # os.path.join instead of `directory+name`, which silently writes to a
        # sibling path when `directory` lacks its trailing separator.
        self.model.save(os.path.join(directory, name + '.keras'))

#### Simple CNN

In [12]:
class SimpleCNN:
    """Small CNN wrapped around a Keras ``Sequential`` model.

    Two 3x3 convolutions with 64 filters and one 2x2 max pooling, followed by
    two 128-unit dense layers and a softmax classifier.
    """

    def __init__(self, input_shape=(32, 32, 3), num_classes=10):
        self.input_shape = input_shape    # shape of one input image
        self.num_classes = num_classes    # width of the softmax output
        self.model = self.build_model()   # the underlying Keras model

    def build_model(self):
        """Assemble and return the (uncompiled) Keras model."""
        model = Sequential([
            # Feature extractor
            Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=self.input_shape),
            Conv2D(64, (3, 3), activation='relu', padding='same'),
            MaxPooling2D((2, 2)),

            # Classifier head
            Flatten(),
            Dense(128, activation='relu'),
            Dense(128, activation='relu'),
            Dense(self.num_classes, activation='softmax')
        ])
        return model

    def compile_model(self, optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy']):
        """Compile the wrapped model with the given optimizer, loss and metrics."""
        self.model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    def summary(self):
        """Print the Keras layer summary."""
        self.model.summary()

    def save(self, directory, name):
        """Save the model as ``<directory>/<name>.keras``.

        Args:
            directory: target folder; created if missing. A trailing path
                separator is optional.
            name: file stem; also written to the model's ``_name`` attribute.
        """
        self.model._name = name
        if directory:
            os.makedirs(directory, exist_ok=True)
        # os.path.join instead of `directory+name`, which silently writes to a
        # sibling path when `directory` lacks its trailing separator.
        self.model.save(os.path.join(directory, name + '.keras'))

## TRAINING (Simple CNN)

In [13]:
# Experiment configuration read by the training cells below
num_runs = 1   # number of independent training runs performed by each run loop
epochs = 50    # upper bound on epochs per run (early stopping may end a run sooner)

tb_log_dir = "logs/fit/simpleCNN_256/" #  TensorBoard logs directory (run-specific suffixes are appended to it)

In [14]:
path_to_saved_models = 'output/models/simpleCNN_256/'  # path to store trained models

# Create the directory (and any missing parents) if it doesn't exist yet.
# `exist_ok=True` replaces the separate existence check and its check-then-create race.
os.makedirs(path_to_saved_models, exist_ok=True)

### SGD

#### Defining LR schedule, early stopping, tensorboard callback

In [15]:
# # Create an instance of the VGGModel class with SGD optimizer
# model_sgd = SimpleCNN(input_shape=(32, 32, 3), num_classes=10)
# model_sgd.compile_model(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
# model_sgd.summary()

In the SGD-SA paper, the authors use a Scheduled-SGD implementation of SGD. It is quite basic but still rather effective on the CIFAR-10 dataset: it uses no momentum/Nesterov acceleration, and the learning rate is set according to the following schedule: $\eta = 0.1$ for the first 30 epochs, 0.01 for the next 40 epochs, and 0.001 for the final 30 epochs.

We have modified it for our SimpleCNN (see `lr_schedule` below): the learning rate is 0.1 for the first 11 epochs (Keras epoch indices 0–10), 0.01 for the next 10 epochs (indices 11–20), and 0.001 for all remaining epochs.

In [16]:
# Define the learning rate schedule function
def lr_schedule(epoch):
    """Piecewise-constant learning rate for ``LearningRateScheduler``.

    Args:
        epoch: epoch index supplied by the callback.

    Returns:
        0.1 while ``epoch <= 10``, 0.01 while ``epoch <= 20``, 0.001 afterwards.
    """
    # (last epoch index of the stage, learning rate); lr defaults to 0.01 in SGD
    stages = ((10, 0.1), (20, 0.01))
    for last_epoch, rate in stages:
        if epoch <= last_epoch:
            return rate
    return 0.001

In [17]:
class EarlyStoppingAt1Accuracy(tf.keras.callbacks.Callback):
    """Stop training as soon as the training accuracy reaches a threshold.

    Args:
        threshold: accuracy (0-1) at which training is cancelled; defaults to
            0.99, the value that used to be hard-coded.
    """

    def __init__(self, threshold=0.99):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        # `logs` may be None and 'accuracy' is only present when the model was
        # compiled with that metric; without a value there is nothing to check.
        accuracy = (logs or {}).get('accuracy')
        if accuracy is None:
            return
        if accuracy >= self.threshold:
            # Report the real threshold (the old message claimed 100%).
            print(f"\nReached {self.threshold:.0%} accuracy so cancelling training!")
            self.model.stop_training = True


# # Possible Alternative: stop when accuracy is almost not changing:
#
# early_stopping = EarlyStopping(monitor='accuracy',  # Monitor training accuracy
#                                min_delta=0.01,      # Minimum change in accuracy to qualify as an improvement
#                                patience=20,         # Number of epochs with no improvement after which training will be stopped
#                                verbose=1)           # Print messages about early stopping

In [18]:
# Callbacks passed to `fit` by the SGD training runs
learning_rate_scheduler = LearningRateScheduler(lr_schedule) # applies the piecewise-constant schedule from `lr_schedule`
early_stopping_callback = EarlyStoppingAt1Accuracy() # cancels training once training accuracy reaches 0.99


# # Training
# history_sgd = model_sgd.model.fit(train_dataset,
#                                   epochs=epochs,
#                                   validation_data=val_dataset,
#                                   callbacks=[learning_rate_scheduler, early_stopping_callback])

In [19]:
# # Save the model in native Keras format
# model_sgd.save(path_to_saved_models, name='simpleCNN_sgd_'+str(minibatch_size))

# # To load the saved model later:
# # loaded_model = tf.keras.models.load_model(model_full_path)

# # Save the training history
# with open(path_to_saved_models+'simpleCNN_sgd_'+str(minibatch_size)+'_history.pkl', 'wb') as file:
#     pickle.dump(history_sgd.history, file)

In [20]:
# Custom callback that writes every epoch-level metric of a run to one TensorBoard directory
class CustomTensorBoard(tf.keras.callbacks.Callback):
    """Log all entries of the Keras ``logs`` dict as TensorBoard scalars.

    Args:
        log_dir: directory the summary file writer writes to.
    """

    def __init__(self, log_dir):
        super().__init__()
        self.writer = tf.summary.create_file_writer(log_dir)

    def on_epoch_end(self, epoch, logs=None):
        # `logs` defaults to None; there is nothing to write in that case.
        if not logs:
            return
        with self.writer.as_default():
            for name, value in logs.items():
                tf.summary.scalar(name, value, step=epoch)
            # Flush every epoch so TensorBoard can follow the run live.
            self.writer.flush()

#### Training Runs

In [21]:
## RUN MULTIPLE TIMES

# One fresh model, TensorBoard directory and set of saved artefacts per run
for run in range(num_runs):
    run_id = run + 1
    run_name = f"simpleCNN_sgd_{minibatch_size}_run_{run_id}"

    print("-----")
    print(f"Training run {run_id}/{num_runs}")
    print("-----")

    # Untrained network compiled with plain SGD
    model_sgd = SimpleCNN(input_shape=(32, 32, 3), num_classes=10)
    model_sgd.compile_model(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])

    # Run-specific TensorBoard directory
    log_dir = f"{tb_log_dir}sgd_run_{run_id}"
    #tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
    custom_tensorboard_callback = CustomTensorBoard(log_dir)

    # Fit with validation so both training and validation metrics get logged
    history_sgd = model_sgd.model.fit(
        train_dataset,
        epochs=epochs,
        validation_data=val_dataset,
        callbacks=[learning_rate_scheduler, early_stopping_callback, custom_tensorboard_callback],
    )

    # SAVES
    # Trained model in native Keras format
    model_sgd.save(path_to_saved_models, name=run_name)
    # Training history as a pickled dict
    with open(f"{path_to_saved_models}{run_name}_history.pkl", 'wb') as file:
        pickle.dump(history_sgd.history, file)

-----
Training run 1/1
-----
Epoch 1/50

KeyboardInterrupt: 

### SGD-SA (attempt to re-produce the known algorithm)

#### 1. Defining an Acceptance Criterion

In [None]:
def accept_update(old_loss, new_loss, temperature):
    """Metropolis acceptance test for a proposed weight update.

    An update that does not increase the loss is always accepted. An update
    that increases the loss by ``d`` is accepted with probability
    ``exp(-d / temperature)``.

    Args:
        old_loss: scalar loss before the update (anything ``float()`` accepts).
        new_loss: scalar loss after the update (anything ``float()`` accepts).
        temperature: current annealing temperature; a value <= 0 means that
            worsening updates are never accepted.

    Returns:
        tuple ``(is_accepted, is_worse, probability)`` of ``(bool, bool, float)``
        where ``probability`` is the acceptance probability in [0, 1].
    """
    # Plain Python floats make scalar tensors and NumPy scalars behave alike
    # and keep the returned values picklable.
    worsening = float(new_loss) - float(old_loss)
    is_worse = worsening > 0

    if np.isnan(worsening):
        # A NaN loss must never be accepted.
        probability = 0.0
    elif not is_worse:
        # Improvements are certain; skipping exp() here also avoids the
        # overflow that exp(+large) produced at small temperatures.
        probability = 1.0
    elif temperature > 0:
        # Exponent is negative, so this can only underflow towards 0.
        probability = float(np.exp(-worsening / temperature))
    else:
        # Zero (or negative) temperature: purely greedy, no division by zero.
        probability = 0.0

    # One draw per call, exactly as before, so the random stream stays aligned.
    is_accepted = random.random() < probability

    return is_accepted, is_worse, probability

#### 2. Defining a Generator Function to Pick LR based on 'lr_options'

In [None]:
# Generator that keeps producing learning rates drawn from lr_options
def pick_lr(lr_options, seed=123):
    """Yield an endless stream of randomly drawn learning rates.

    Args:
        lr_options: either a list of candidate rates (one is chosen with
            ``np.random.choice``) or a dict with keys ``'min_lr'`` and
            ``'max_lr'`` (a rate is drawn with ``np.random.uniform``).
        seed: accepted but currently unused; the seeding call is commented out.

    Yields:
        One learning rate per ``next()`` call.

    Raises:
        ValueError: on the next draw if ``lr_options`` is neither a list nor a dict.
    """
    #np.random.seed(seed)
    while True:
        if isinstance(lr_options, dict):
            # Continuous choice between the two bounds
            lower, upper = lr_options['min_lr'], lr_options['max_lr']
            yield np.random.uniform(lower, upper)
            continue

        if not isinstance(lr_options, list):
            raise ValueError("lr_options must be either a list or a dictionary.")

        # Discrete choice among the listed rates
        yield np.random.choice(lr_options)

#### 3. Defining a Customized Training Loop for SGD-SA

In [None]:
###########################################
### Customized Training Loop for SGD-SA ### !!!! MODIFICATION WITH TENSORBOARD
###########################################

def run_SGD_SA(model, model_name, train_dataset, val_dataset, lr_options,
               epochs=100, temperature=1, cooling=0.95, loss_fn=tf.keras.losses.CategoricalCrossentropy(),
               output_folder='output/models/'):
    """Train ``model`` with SGD whose steps are filtered by simulated annealing.

    For every mini-batch a learning rate is drawn from ``pick_lr``, one plain
    gradient-descent step is applied and the loss on the same batch is
    recomputed. ``accept_update`` then decides whether the step is kept; a
    rejected step is rolled back to the weights snapshotted before the step.
    The temperature is multiplied by ``cooling`` at the end of every epoch.
    Training stops early once the mean training accuracy of the last two epochs
    reaches 0.99 (checked from the third epoch on).

    Relies on module-level names: ``summary_writer`` (a TensorBoard file writer
    that must be bound before the call), ``pick_lr`` and ``accept_update``.

    Args:
        model: wrapper object exposing the Keras model as ``.model`` and a
            ``.save(directory, name)`` method that appends ``'.keras'``.
        model_name: stem used for the saved model and history files.
        train_dataset: batched dataset of ``(images, one-hot labels)``.
        val_dataset: batched dataset evaluated after every epoch.
        lr_options: list or dict understood by ``pick_lr``.
        epochs: maximum number of epochs.
        temperature: initial annealing temperature.
        cooling: per-epoch multiplicative temperature decay.
        loss_fn: callable ``loss_fn(y_true, y_pred)`` returning a scalar tensor.
        output_folder: directory for the saved model and pickled history;
            created if missing.

    Returns:
        dict: per-epoch history with keys ``loss``, ``accuracy``, ``val_loss``,
        ``val_accuracy``, ``temperature``, ``rejected``, ``accepted_worse`` and
        ``probabilities`` (the same object that is pickled to disk).
    """
    keras_model = model.model

    # Running aggregates for one epoch; Mean averages the per-batch loss values.
    train_loss_metric = tf.keras.metrics.Mean()
    train_accuracy_metric = tf.keras.metrics.CategoricalAccuracy()
    val_loss_metric = tf.keras.metrics.Mean()
    val_accuracy_metric = tf.keras.metrics.CategoricalAccuracy()
    epoch_metrics = (train_loss_metric, train_accuracy_metric,
                     val_loss_metric, val_accuracy_metric)

    # Per-epoch records, filled while training
    history = {
        'loss': [],
        'accuracy': [],
        'val_loss': [],
        'val_accuracy': [],
        'temperature': [],
        'rejected': [],
        'accepted_worse': [],
        'probabilities': []
    }

    # Batch size of the first batch (only used to name the saved artefacts)
    first_images, _ = next(iter(train_dataset.take(1)))
    batch_size_value = int(first_images.shape[0])
    print("batch_size:", batch_size_value)

    # Create the pick_lr generator
    lr_generator = pick_lr(lr_options) #, seed=42)

    ##### MAIN TRAINING LOOP #####
    with summary_writer.as_default():

        for epoch in range(1, epochs + 1):

            # Reset metrics for the new epoch (`reset_state` replaces the
            # deprecated `reset_states`, which newer Keras no longer provides)
            for metric in epoch_metrics:
                metric.reset_state()

            # Reset epoch-level counts
            accepted = 0
            rejected = 0
            accepted_worse = 0
            probabilities = []

            ##### TRAINING #####
            for x_batch, y_batch in tqdm(train_dataset, desc="batches"):

                with tf.GradientTape() as tape:
                    # Forward pass and loss at the current weights
                    predictions = keras_model(x_batch)
                    loss = loss_fn(y_batch, predictions)

                # Snapshot the weights BY VALUE. `trainable_variables` would only
                # hand back references to the live variables, which already hold
                # the updated values by the time a rejected step is reverted.
                old_weights = keras_model.get_weights()
                old_loss = loss.numpy()

                # Compute Gradients
                gradients = tape.gradient(loss, keras_model.trainable_variables)

                # Select a Learning Rate
                random_lr = next(lr_generator)

                # Tentative gradient-descent step
                for var, grad in zip(keras_model.trainable_variables, gradients):
                    var.assign_sub(random_lr * grad)

                # Loss on the same batch after the tentative step
                predictions_after_update = keras_model(x_batch)
                new_loss = loss_fn(y_batch, predictions_after_update).numpy()

                # Check acceptance criteria
                is_accepted, is_worse, probability = accept_update(old_loss, new_loss, temperature)
                probabilities.append(probability)    # save the probability values for the history

                if is_accepted:
                    accepted += 1
                    train_loss_metric.update_state(new_loss)
                    train_accuracy_metric.update_state(y_batch, predictions_after_update)
                    if is_worse:
                        accepted_worse += 1
                else:
                    # Roll the step back and account for the batch with its old values
                    keras_model.set_weights(old_weights)
                    rejected += 1
                    train_loss_metric.update_state(old_loss)
                    train_accuracy_metric.update_state(y_batch, predictions)

            ##### VALIDATION #####
            for x_val_batch, y_val_batch in val_dataset:
                val_predictions = keras_model(x_val_batch)
                val_loss = loss_fn(y_val_batch, val_predictions)
                val_loss_metric.update_state(val_loss)
                val_accuracy_metric.update_state(y_val_batch, val_predictions)

            ##### EPOCH RESULTS #####
            epoch_loss = train_loss_metric.result().numpy()
            epoch_accuracy = train_accuracy_metric.result().numpy()
            epoch_val_loss = val_loss_metric.result().numpy()
            epoch_val_accuracy = val_accuracy_metric.result().numpy()

            # Store the epoch-level metrics in the history
            history['loss'].append(epoch_loss)
            history['accuracy'].append(epoch_accuracy)
            history['val_loss'].append(epoch_val_loss)
            history['val_accuracy'].append(epoch_val_accuracy)
            history['temperature'].append(temperature)
            history['rejected'].append(rejected)
            history['accepted_worse'].append(accepted_worse)
            history['probabilities'].append(probabilities)

            # Write metrics to the TensorBoard log
            tf.summary.scalar('loss', epoch_loss, step=epoch)
            tf.summary.scalar('accuracy', epoch_accuracy, step=epoch)
            tf.summary.scalar('val_loss', epoch_val_loss, step=epoch)
            tf.summary.scalar('val_accuracy', epoch_val_accuracy, step=epoch)
            summary_writer.flush() # Flush the summary writer

            # Print epoch results
            print(f"Epoch {epoch}/{epochs}")
            print(f"loss: {epoch_loss:.4f} - accuracy: {epoch_accuracy:.4f} - val_loss: {epoch_val_loss:.4f} - val_accuracy: {epoch_val_accuracy:.4f} - temperature: {temperature:.4f} - rejected: {rejected}/{accepted+rejected} - accepted_worse: {accepted_worse}")

            # Early stopping check
            if epoch > 2:
                recent_accuracy = sum(history['accuracy'][-2:]) / 2  # Average accuracy over the last 2 epochs
                if recent_accuracy >= 0.99:  # Stop training if training accuracy exceeds 0.99
                    print("Training accuracy reached 0.99. Stopping training.")
                    break

            # Update Temperature in the end of the Epoch
            temperature = cooling * temperature

    ##### SAVES #####
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    artefact_name = model_name + '_' + str(batch_size_value)

    # Save the trained model in native Keras format. The wrapper's `save`
    # appends '.keras' itself, so passing the extension here as well produced
    # files named '*.keras.keras'.
    model.save(output_folder, artefact_name)
    # To load the saved model later:
    # loaded_model = tf.keras.models.load_model(model_save_path)

    # Save the training history
    with open(os.path.join(output_folder, artefact_name + '_history.pkl'), 'wb') as file:
        pickle.dump(history, file)

    return history

**Important remark for the custom training defined above:**

For tracking loss values during training, we use `tf.keras.metrics.Mean()`. This metric calculates the mean of the values observed across all batches.

`tf.keras.losses.CategoricalCrossentropy()` is a loss function, not a metric: it computes the categorical cross-entropy between the true labels and the predicted probabilities for a single batch and does not accumulate anything across batches. To track the loss over a whole epoch, we therefore feed its per-batch output into `tf.keras.metrics.Mean()`.

In summary:
* Use `tf.keras.metrics.Mean()` to track the loss values during training.
* Use `tf.keras.losses.CategoricalCrossentropy()` as the loss function during training.

#### Training Runs for SGD-SA

In [None]:
# Define Training Parameters FOR BOTH: SGD-SA and SGD-SA modified
# epochs = 30

temperature = 1 # initial annealing temperature passed to run_SGD_SA
cooling = 0.95  # per-epoch temperature decay factor; in the paper: 0.8

# Discrete candidate learning rates; pick_lr draws one of them per mini-batch
lr_options =  [0.1, 0.07, 0.05, 0.03, 0.01, 0.007, 0.005, 0.003, 0.001]  # lr defaults to 0.01 in SGD

In [None]:
# Example:

# run_SGD_SA(model_sgd_sa, 'simpleCNN_SGD_SA', train_dataset, val_dataset, lr_options,
#           epochs, temperature, cooling)

In [None]:
### TESTING MULTIPLE RUNS

# One fresh model and TensorBoard writer per SGD-SA run
for run in range(num_runs):
    run_id = run + 1

    print("-----")
    print(f"Training run {run_id}/{num_runs}")
    print("-----")

    # Untrained network; the custom loop applies its own updates, so no compile step
    model_sgd_sa = SimpleCNN(input_shape=(32, 32, 3), num_classes=10)

    # run_SGD_SA reads this module-level writer, so it is rebound before every call
    summary_writer = tf.summary.create_file_writer(f"{tb_log_dir}sgd_sa_run_{run_id}")

    # NOTE(review): no output_folder is passed, so artefacts go to the function's
    # default folder rather than path_to_saved_models - confirm this is intended.
    run_SGD_SA(
        model_sgd_sa,
        f"simpleCNN_SGD_SA_{run_id}",
        train_dataset,
        val_dataset,
        lr_options,
        epochs,
        temperature,
        cooling,
    )

    print("==============")

### SGD-SA with Modified LR Picking (random from uniform distribution)

#### Defining Parameters for SGD-SA modified

In [None]:
# temperature = 1
# cooling = 0.95     # in the paper: 0.8

# Replace the discrete list of learning rates by the continuous range it spans,
# which makes pick_lr draw uniformly between the two bounds.
# The guard keeps this cell idempotent: on a re-run `lr_options` is already the
# dict, and taking np.min/np.max of a dict would not produce the range bounds.
if not isinstance(lr_options, dict):
    lr_options = {
        'min_lr': np.min(lr_options),
        'max_lr': np.max(lr_options)
    }

#### Training Runs for SGD-SA modified

In [None]:
### TESTING MULTIPLE RUNS

# One fresh model and TensorBoard writer per run of SGD-SA with uniform LR picking
for run in range(num_runs):
    run_id = run + 1

    print("-----")
    print(f"Training run {run_id}/{num_runs}")
    print("-----")

    # Untrained network; the custom loop applies its own updates, so no compile step
    model_modified = SimpleCNN(input_shape=(32, 32, 3), num_classes=10)

    # run_SGD_SA reads this module-level writer, so it is rebound before every call
    summary_writer = tf.summary.create_file_writer(f"{tb_log_dir}sgd_sa_modified_run_{run_id}")

    # Same training routine as before; only lr_options differs
    run_SGD_SA(
        model_modified,
        f"simpleCNN_SGD_SA_modified{run_id}",
        train_dataset,
        val_dataset,
        lr_options,
        epochs,
        temperature,
        cooling,
    )

    print("==============")

### SGD-SA with Modified LR Picking + Momentum

#### Defining function for SGD-SA with Modified LR Picking + Momentum

In [None]:
###########################################
### Customized Training Loop for SGD-SA ###
###########################################
def run_SGD_SA_WITH_MOMENTUM(model, model_name, train_dataset, val_dataset, lr_options,
               epochs=100, temperature=1, cooling=0.95, loss_fn=tf.keras.losses.CategoricalCrossentropy(),
               output_folder='output/models/'):

    # Initialize metrics objects for both training and validation
    train_loss_metric = tf.keras.metrics.Mean()  # to average training loss over all batches within the epoch
    train_accuracy_metric = tf.keras.metrics.CategoricalAccuracy()
    val_loss_metric = tf.keras.metrics.Mean()  # to average training loss over all batches within the epoch
    val_accuracy_metric = tf.keras.metrics.CategoricalAccuracy()

    # Initialize empty lists to store loss and accuracy values (to construct "history")
    train_loss_history = []
    train_accuracy_history = []
    val_loss_history = []
    val_accuracy_history = []
    temperature_history = []
    rejected_history = []
    accepted_worse_history = []
    probability_history = []

    # Extract the batch size from the first batch (will be used to name the trained model while saving)
    first_batch = next(iter(train_dataset.take(1).as_numpy_iterator()))
    batch_size_tensor = tf.shape(first_batch[0])[0]
    batch_size_value = batch_size_tensor.numpy()
    print("batch_size:", batch_size_value)

    # Create the pick_lr generator
    lr_generator = pick_lr(lr_options) #, seed=42)

    # NEW
    velocities = [tf.Variable(tf.zeros_like(var), trainable=False) for var in model.model.trainable_variables]
    beta = 0.95  # Momentum parameter


    ##### MAIN TRAINING LOOP #####
    with summary_writer.as_default():

        for epoch in range(1, epochs + 1):

            # Reset metrics for the new epoch
            train_loss_metric.reset_states()
            train_accuracy_metric.reset_states()
            val_loss_metric.reset_states()
            val_accuracy_metric.reset_states()

            # Reset epoch-level counts
            accepted = 0
            rejected = 0
            accepted_worse = 0
            probabilities = []

            ##### TRAINING #####
            for batch, (x_batch, y_batch) in tqdm(enumerate(train_dataset), desc="batches"):

                with tf.GradientTape() as tape:
                    # Forward pass
                    predictions = model.model(x_batch)
                    # Compute the loss
                    loss = loss_fn(y_batch, predictions)

                # Save the current weights and loss as "old"
                old_weights = model.model.trainable_variables
                old_loss = loss.numpy()
                #print("old_loss", old_loss)
                # NEW
                old_velocities = [tf.identity(vel) for vel in velocities]  # Store velocities for possible reversion


                # Compute Gradients
                gradients = tape.gradient(loss, model.model.trainable_variables)

                # Select a Learning Rate
                random_lr = next(lr_generator)

                # NEW Update velocities using momentum
                for vel, grad in zip(velocities, gradients):
                    vel.assign(beta * vel + (1 - beta) * grad)

                # Update weights using gradient descent
                # for var, grad in zip(model.model.trainable_variables, gradients):
                #     var.assign_sub(random_lr * grad)

                # Update weights using velocities
                for var, vel in zip(model.model.trainable_variables, velocities):
                    var.assign_sub(random_lr * vel)


                # Compute the new Loss (after the weights update)
                predictions_after_update = model.model(x_batch)
                new_loss = loss_fn(y_batch, predictions_after_update)
                # print("New Loss after Weight Update:", new_loss.numpy())


                # Check acceptance criteria
                is_accepted, is_worse, probability = accept_update(old_loss, new_loss, temperature)
                probabilities.append(probability)    # save the probability values for the history
                if not is_accepted:
                # Reverting weights and velocities
                    model.model.set_weights(old_weights)
                    for vel, old_vel in zip(velocities, old_velocities):  #NEW - need to revert velocities as well
                        vel.assign(old_vel)
                    # Update Criterion Rejection Count
                    rejected += 1
                    # Update Epoch Metrics
                    train_loss_metric.update_state(old_loss)
                    train_accuracy_metric.update_state(y_batch, predictions)
                else:
                    # Update Criterion Acceptance Count
                    accepted += 1
                    # Update Epoch Metrics
                    train_loss_metric.update_state(new_loss)
                    train_accuracy_metric.update_state(y_batch, predictions_after_update)
                    if is_worse:
                        accepted_worse +=1
                        # print(f"Epoch: {epoch + 1}, Batch: {batch}, Previous Loss: {loss.numpy()}, New Loss: {new_loss.numpy()}, LR: {random_lr}, Temperature: {temperature}, A/R: {accepted}/{rejected}")

                # # Print progress
                # if batch % 500 == 0:
                #     print(f"Epoch: {epoch + 1}, Batch: {batch}, Previous Loss: {loss.numpy()}, New Loss: {new_loss.numpy()}, LR: {random_lr}, Temperature: {temperature}, A/R: {accepted}/{rejected}")


                # If update of the Temperature - every nth batch
                # if (batch + 1) % 10 == 0:  # +1 to start from 1
                #     temperature = cooling * temperature

            ##### VALIDATION #####
            for batch, (x_val_batch, y_val_batch) in enumerate(val_dataset):
                # Forward pass
                val_predictions = model.model(x_val_batch)
                # Compute the validation loss
                val_loss = loss_fn(y_val_batch, val_predictions)
                # Update validation metrics
                val_loss_metric.update_state(val_loss)
                val_accuracy_metric.update_state(y_val_batch, val_predictions)



            ##### EPOCH RESULTS #####
            epoch_loss = train_loss_metric.result().numpy()
            epoch_accuracy = train_accuracy_metric.result().numpy()
            epoch_val_loss = val_loss_metric.result().numpy()
            epoch_val_accuracy = val_accuracy_metric.result().numpy()


            # Store the epoch-level metrics to history lists
            train_loss_history.append(epoch_loss)
            train_accuracy_history.append(epoch_accuracy)
            val_loss_history.append(epoch_val_loss)
            val_accuracy_history.append(epoch_val_accuracy)
            temperature_history.append(temperature)
            rejected_history.append(rejected)
            accepted_worse_history.append(accepted_worse)
            probability_history.append(probabilities)

            # Write metrics to the TensorBoard log
            tf.summary.scalar('loss', epoch_loss, step=epoch)
            tf.summary.scalar('accuracy', epoch_accuracy, step=epoch)
            tf.summary.scalar('val_loss', epoch_val_loss, step=epoch)
            tf.summary.scalar('val_accuracy', epoch_val_accuracy, step=epoch)
            summary_writer.flush() # Flush the summary writer

            # Print epoch results
            print(f"Epoch {epoch}/{epochs}")
            print(f"loss: {epoch_loss:.4f} - accuracy: {epoch_accuracy:.4f} - val_loss: {epoch_val_loss:.4f} - val_accuracy: {epoch_val_accuracy:.4f} - temperature: {temperature:.4f} - rejected: {rejected}/{accepted+rejected} - accepted_worse: {accepted_worse}")

            # Early stopping check
            if epoch > 2:
                recent_accuracy = sum(train_accuracy_history[-2:]) / 2  # Average accuracy over the last 2 epochs
                if recent_accuracy >= 0.99:  # Stop training if training accuracy exceeds 0.99
                    print("Training accuracy reached 0.99. Stopping training.")
                    break


            # Update Temperature in the end of the Epoch
            temperature = cooling * temperature


    ##### Construct the history object #####
    history = {
        'loss': train_loss_history,
        'accuracy': train_accuracy_history,
        'val_loss': val_loss_history,
        'val_accuracy': val_accuracy_history,
        'temperature': temperature_history,
        'rejected': rejected_history,
        'accepted_worse': accepted_worse_history,
        'probabilities': probability_history
    }

    ##### SAVES #####

    # Save the trained model in native Keras format
    model.save(output_folder, model_name+'_'+str(batch_size_value)+'.keras')
    # To load the saved model later:
    # loaded_model = tf.keras.models.load_model(model_save_path)

    # Save the training history
    with open(output_folder+model_name+'_'+str(batch_size_value)+'_history.pkl', 'wb') as file:
        pickle.dump(history, file)

#### Setting Parameters for SGD-SA with Modified LR Picking + Momentum

In [None]:
# Optional overrides for this experiment, currently disabled. With them commented
# out, `temperature`, `cooling` and `lr_options` keep whatever values they already
# have in the kernel (presumably set in an earlier cell -- confirm before running).
# temperature = 1
# cooling = 0.8     # in the paper: 0.8

# NOTE(review): this override rebuilds `lr_options` from `lr_options` itself, so it
# is not idempotent -- enabling it and running the cell twice would feed the new
# dict back into np.min / np.max.
# lr_options = {
#     'min_lr': np.min(lr_options),
#     'max_lr': np.max(lr_options)
# }

# Display the learning-rate options that the training runs below receive.
lr_options

#### Training Runs for SGD-SA with Modified LR Picking + Momentum

In [None]:
### TESTING MULTIPLE RUNS

# Every iteration is an independent experiment: new network, new TensorBoard writer
for run in range(num_runs):

    # Banner announcing the (1-based) run number
    print("-----")
    print(f"Training run {run + 1}/{num_runs}")
    print("-----")

    # Instantiate a fresh network for this run
    model = SimpleCNN(input_shape=(32, 32, 3), num_classes=10)

    # Per-run TensorBoard writer. It is not passed to the training function below,
    # so it has to be bound to the global name `summary_writer`.
    summary_writer = tf.summary.create_file_writer(
        tb_log_dir + f"sgd_sa_modified_with_momentum_run_{run + 1}"
    )

    # Train with SGD-SA (modified LR picking + momentum)
    run_SGD_SA_WITH_MOMENTUM(
        model,
        f"simpleCNN_SGD_SA_modified_momentum_{run + 1}",
        train_dataset,
        val_dataset,
        lr_options,
        epochs,
        temperature,
        cooling,
    )

    print("==============")

### SGD-SA utilizing Accuracy

#### Defining Accuracy and Acceptance Criteria based on Accuracy

In [None]:
def accuracy_fn(y_true, y_pred):
    """Return the fraction of rows whose arg-max in ``y_pred`` equals the arg-max in ``y_true``.

    Both arguments are reduced with ``tf.argmax`` along axis 1, so each is
    expected to carry one row of per-class values per sample (one-hot labels
    for ``y_true``, class scores for ``y_pred``).

    Returns:
        A scalar ``tf.float32`` tensor: the mean of the per-sample matches.
    """
    true_classes = tf.argmax(y_true, axis=1)
    predicted_classes = tf.argmax(y_pred, axis=1)
    # 1.0 where the predicted class matches the label, 0.0 elsewhere
    hits = tf.cast(tf.equal(true_classes, predicted_classes), tf.float32)
    return tf.reduce_mean(hits)

In [None]:
def accept_update__acc(old_accuracy, new_accuracy, temperature):
    """Metropolis acceptance test driven by a change in accuracy.

    A candidate that is at least as accurate as the current solution is always
    accepted. A less accurate candidate is accepted with probability
    ``exp(-(old_accuracy - new_accuracy) / temperature)``.

    The exponential is only evaluated for worsening moves, where its argument
    is negative, so it can no longer overflow when the temperature becomes very
    small. (NumPy reports overflow as a warning, not an exception, so wrapping
    the call in ``try/except RuntimeWarning`` does not catch it.)

    Args:
        old_accuracy: Accuracy before the update (anything ``float()`` accepts).
        new_accuracy: Accuracy after the update (anything ``float()`` accepts).
        temperature: Current annealing temperature. Values ``<= 0`` are treated
            as a frozen system: worsening moves are never accepted.

    Returns:
        Tuple ``(is_accepted, is_worse, probability)``:
        ``is_accepted`` -- whether the update should be kept,
        ``is_worse`` -- whether the new accuracy is strictly lower,
        ``probability`` -- acceptance probability as a Python float, capped at 1.0.
    """
    # Positive when the neighbour solution is less accurate than the current one
    worsening = float(old_accuracy) - float(new_accuracy)
    is_worse = worsening > 0

    # Drawn on every call, even when the outcome is already decided, so the
    # number of values consumed from `random` per call stays the same.
    random_number = random.random()

    if worsening <= 0:
        # Not worse: min(1, exp(non-negative)) == 1, no need to evaluate exp
        probability = 1.0
    elif temperature <= 0:
        # Limit of exp(-worsening / T) as T -> 0+; also avoids dividing by zero
        probability = 0.0
    else:
        probability = float(np.exp(-worsening / float(temperature)))

    # random_number lies in [0, 1): probability 1.0 always accepts, 0.0 never does
    is_accepted = random_number < probability

    return is_accepted, is_worse, probability

#### Defining function for SGD-SA utilizing accuracy

In [None]:
def run_SGD_SA__acc(model, model_name, train_dataset, val_dataset, lr_options,
               epochs=100, temperature=1, cooling=0.95, loss_fn=tf.keras.losses.CategoricalCrossentropy(),
               output_folder='output/models/'):
    """Train ``model`` with SGD + simulated annealing, accepting steps by batch accuracy.

    For every training batch one plain gradient-descent step is taken with a
    learning rate drawn from ``pick_lr(lr_options)``. The batch accuracy before
    and after the step is handed to ``accept_update__acc``; if the step is
    rejected, the weights saved before the step are restored. The temperature
    is multiplied by ``cooling`` at the end of every epoch. Training stops early
    once the mean training accuracy of the last two epochs reaches 0.99
    (checked from the third epoch on).

    Relies on names defined elsewhere in the notebook: the global
    ``summary_writer`` (epoch scalars are written to it), ``pick_lr``,
    ``accuracy_fn`` and ``accept_update__acc``.

    Args:
        model: Wrapper object; the network is accessed as ``model.model`` and
            the result is stored through ``model.save(output_folder, filename)``.
        model_name: Prefix for the saved model and history file names.
        train_dataset: Iterable of ``(x_batch, y_batch)`` training batches.
        val_dataset: Iterable of ``(x_val_batch, y_val_batch)`` validation batches.
        lr_options: Passed unchanged to ``pick_lr``.
        epochs: Maximum number of epochs.
        temperature: Initial annealing temperature.
        cooling: Factor applied to the temperature after every epoch.
        loss_fn: Loss used for the gradients and for the reported loss.
        output_folder: Prefix of the output paths; it is concatenated with the
            file name as-is, so it needs its trailing separator.

    Returns:
        None. The model and a pickled history dict are written to ``output_folder``.
    """
    # Epoch-level accumulators for training and validation
    train_loss_metric = tf.keras.metrics.Mean()  # averages the per-batch training loss
    train_accuracy_metric = tf.keras.metrics.CategoricalAccuracy()
    val_loss_metric = tf.keras.metrics.Mean()  # averages the per-batch validation loss
    val_accuracy_metric = tf.keras.metrics.CategoricalAccuracy()
    epoch_metrics = (train_loss_metric, train_accuracy_metric, val_loss_metric, val_accuracy_metric)

    # Per-epoch records that make up the saved "history"
    train_loss_history = []
    train_accuracy_history = []
    val_loss_history = []
    val_accuracy_history = []
    temperature_history = []
    rejected_history = []
    accepted_worse_history = []
    probability_history = []

    # Size of the first batch; only used to name the saved files
    first_batch = next(iter(train_dataset.take(1).as_numpy_iterator()))
    batch_size_value = first_batch[0].shape[0]
    print("batch_size:", batch_size_value)

    # Create the pick_lr generator
    lr_generator = pick_lr(lr_options)

    ##### MAIN TRAINING LOOP #####
    with summary_writer.as_default():

        for epoch in range(1, epochs + 1):

            # Reset metrics for the new epoch
            for metric in epoch_metrics:
                metric.reset_state()

            # Reset epoch-level counts
            accepted = 0
            rejected = 0
            accepted_worse = 0
            probabilities = []

            ##### TRAINING #####
            for x_batch, y_batch in tqdm(train_dataset, desc="batches"):

                with tf.GradientTape() as tape:
                    # Forward pass and loss on the current weights
                    predictions = model.model(x_batch)
                    loss = loss_fn(y_batch, predictions)

                # Snapshot the weights BY VALUE before touching them.
                # `trainable_variables` would only give references to the live
                # variables, which change together with the update and therefore
                # cannot be used to undo it.
                old_weights = model.model.get_weights()
                old_loss = loss.numpy()
                old_accuracy = accuracy_fn(y_batch, predictions).numpy()

                # Compute Gradients
                gradients = tape.gradient(loss, model.model.trainable_variables)

                # Select a Learning Rate
                random_lr = next(lr_generator)

                # Update weights using gradient descent
                for var, grad in zip(model.model.trainable_variables, gradients):
                    var.assign_sub(random_lr * grad)

                # Re-evaluate the same batch with the updated weights
                predictions_after_update = model.model(x_batch)
                new_loss = loss_fn(y_batch, predictions_after_update)
                new_accuracy = accuracy_fn(y_batch, predictions_after_update).numpy()

                # Acceptance test is based on ACCURACY, not on the loss
                is_accepted, is_worse, probability = accept_update__acc(old_accuracy, new_accuracy, temperature)
                probabilities.append(probability)    # save the probability values for the history

                if is_accepted:
                    accepted += 1
                    # Epoch metrics reflect the updated weights
                    train_loss_metric.update_state(new_loss)
                    train_accuracy_metric.update_state(y_batch, predictions_after_update)
                    if is_worse:
                        accepted_worse += 1
                else:
                    # Undo the update by restoring the snapshot
                    model.model.set_weights(old_weights)
                    rejected += 1
                    # Epoch metrics reflect the restored weights
                    train_loss_metric.update_state(old_loss)
                    train_accuracy_metric.update_state(y_batch, predictions)

            ##### VALIDATION #####
            for x_val_batch, y_val_batch in val_dataset:
                # Forward pass
                val_predictions = model.model(x_val_batch)
                # Compute the validation loss
                val_loss = loss_fn(y_val_batch, val_predictions)
                # Update validation metrics
                val_loss_metric.update_state(val_loss)
                val_accuracy_metric.update_state(y_val_batch, val_predictions)

            ##### EPOCH RESULTS #####
            epoch_loss = train_loss_metric.result().numpy()
            epoch_accuracy = train_accuracy_metric.result().numpy()
            epoch_val_loss = val_loss_metric.result().numpy()
            epoch_val_accuracy = val_accuracy_metric.result().numpy()

            # Store the epoch-level metrics to history lists
            train_loss_history.append(epoch_loss)
            train_accuracy_history.append(epoch_accuracy)
            val_loss_history.append(epoch_val_loss)
            val_accuracy_history.append(epoch_val_accuracy)
            temperature_history.append(temperature)
            rejected_history.append(rejected)
            accepted_worse_history.append(accepted_worse)
            probability_history.append(probabilities)

            # Write metrics to the TensorBoard log
            tf.summary.scalar('loss', epoch_loss, step=epoch)
            tf.summary.scalar('accuracy', epoch_accuracy, step=epoch)
            tf.summary.scalar('val_loss', epoch_val_loss, step=epoch)
            tf.summary.scalar('val_accuracy', epoch_val_accuracy, step=epoch)
            summary_writer.flush() # Flush the summary writer

            # Print epoch results
            print(f"Epoch {epoch}/{epochs}")
            print(f"loss: {epoch_loss:.4f} - accuracy: {epoch_accuracy:.4f} - val_loss: {epoch_val_loss:.4f} - val_accuracy: {epoch_val_accuracy:.4f} - temperature: {temperature:.4f} - rejected: {rejected}/{accepted+rejected} - accepted_worse: {accepted_worse}")

            # Early stopping check
            if epoch > 2:
                recent_accuracy = sum(train_accuracy_history[-2:]) / 2  # Average accuracy over the last 2 epochs
                if recent_accuracy >= 0.99:  # Stop training if training accuracy exceeds 0.99
                    print("Training accuracy reached 0.99. Stopping training.")
                    break

            # Update Temperature in the end of the Epoch
            temperature = cooling * temperature

    ##### Construct the history object #####
    history = {
        'loss': train_loss_history,
        'accuracy': train_accuracy_history,
        'val_loss': val_loss_history,
        'val_accuracy': val_accuracy_history,
        'temperature': temperature_history,
        'rejected': rejected_history,
        'accepted_worse': accepted_worse_history,
        'probabilities': probability_history
    }

    ##### SAVES #####

    # Save the trained model in native Keras format
    # NOTE(review): `model.save` is the wrapper's own method (defined elsewhere);
    # presumably it takes (folder, file name) -- confirm against its definition.
    model.save(output_folder, model_name+'_'+str(batch_size_value)+'.keras')

    # Save the training history
    with open(output_folder+model_name+'_'+str(batch_size_value)+'_history.pkl', 'wb') as file:
        pickle.dump(history, file)

#### Training Runs for SGD-SA Utilizing Accuracy

In [None]:
### TESTING MULTIPLE RUNS

# Every iteration is an independent experiment: new network, new TensorBoard writer
for run in range(num_runs):

    # Banner announcing the (1-based) run number
    print("-----")
    print(f"Training run {run + 1}/{num_runs}")
    print("-----")

    # Instantiate a fresh network for this run
    model = SimpleCNN(input_shape=(32, 32, 3), num_classes=10)

    # Per-run TensorBoard writer; run_SGD_SA__acc reads it through the global
    # name `summary_writer`, so it must be bound before the call.
    summary_writer = tf.summary.create_file_writer(
        tb_log_dir + f"sgd_sa_modified__acc_run_{run + 1}"
    )

    # Train with SGD-SA using the accuracy-based acceptance criterion
    run_SGD_SA__acc(
        model=model,
        model_name=f"simpleCNN_SGD_SA_modified__acc_{run + 1}",
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        lr_options=lr_options,
        epochs=epochs,
        temperature=temperature,
        cooling=cooling,
    )

    print("==============")

## Visualizing Results with TensorBoard

In [None]:
# Start TensorBoard on the event files below the given log directory.
# NOTE(review): the path is hard-coded here, while the training cells write their
# logs under `tb_log_dir` -- confirm that both refer to the same directory.
%tensorboard --logdir logs/fit/simpleCNN_256/

In [None]:
# Open TensorBoard in web browser:
# http://localhost:6006/

## Zip folders containing output artifacts

In [None]:
import zipfile
import os

# Recursively archive a directory into a deflate-compressed zip file
def zip_dir(folder_path, zip_name):
    """Write every file found under ``folder_path`` into the archive ``zip_name``.

    Entry names are taken relative to the parent of ``folder_path``, so the
    folder's own name is kept as the leading path component. Only files are
    written; directories get no entries of their own.
    """
    # Base for the archive names: one level above the folder being zipped
    archive_root = os.path.join(folder_path, '..')
    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as archive:
        for current_dir, _subdirs, file_names in os.walk(folder_path):
            for file_name in file_names:
                source_path = os.path.join(current_dir, file_name)
                archive.write(source_path, os.path.relpath(source_path, archive_root))

In [None]:
# Archive the `logs/` and `output/` directories as `logs.zip` and `output.zip`
# (relative paths, so both are resolved against the current working directory).
zip_dir("logs/", 'logs.zip')
zip_dir("output/", 'output.zip')

In [None]:
### TO DO:


## FIXES:

# 0. !!!Re-train and Visualize results on graphs

# 1. Check Reproducibility! (current seeds are not enough!)

# 2. Fix Temperature values after cooling - remove insignificant digits

# 3. Think about "RuntimeWarning: overflow encountered in exp probability = np.exp((-worsening) / temperature)"



## METHODOLOGY IDEAS:

# 1. Try modifying the acceptance criterion, for example, to make it similar to fast SA

# 2. Try calculating 2 neighbors at a time and taking the best one (analyze results afterward)

# 3. Modification of LR picking: Try generating LR as normally distributed around default 0.01

# 4. Try using two different objective functions at training time:
# one differentiable, to compute the gradient (and hence a set of potentially good moves),
# and another completely generic (possibly black box) for the SA acceptance/rejection test
# -- the latter intended to favor simple/robust solutions that are likely to generalize well.

# 6. Whatever we end up with, run multiple times for the "statistical significance" of the results