In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras import Sequential
import matplotlib.pyplot as plt
import time

# Problem 3
Implement a convolutional neural network with the following specifications.
• Input: 1-channel input, size 28x28  
• Convolution layer: Convolution kernel size is (3, 3) with stride as 1. Input channels - 1; Output channels - 20 nodes  
• ReLU activation function  
• Max-pool: 2x2 max pool  
• Dropout layer with probability p = 0.50  
• Flatten input for feed to fully connected layers  
• Fully connected layer 1: flattened input with bias; output - 128 nodes  
• ReLU activation function  
• Dropout layer with probability p = 0.50  
• Fully connected layer 2: input - 128 nodes; output - 10 nodes  
• Softmax activation function  
• Use cross entropy as loss function  

For this problem, we will be experimenting with a variety of parameters.
First, train using SGD as the optimizer and mini batches of size 32. Plot the cumulative training loss and accuracy for every epoch. Once training is complete, apply the learned model to the test set and report the testing accuracy.
Second, train your network using mini batch sizes of [32, 64, 96, 128] and plot the convergence run time vs mini batch sizes for each of the following optimizers: SGD, Adagrad, and Adam. You should report 3 figures, one for each optimizer where each figure has mini batch size on the x-axis and the convergence run time on the y-axis.

In [None]:
def cnn():
  """Train the MNIST CNN with every optimizer / mini-batch size pair and time each run.

  For each optimizer (SGD, Adagrad, Adam) and each mini-batch size
  (32, 64, 96, 128) a fresh model and a fresh optimizer instance are created,
  the model is trained until the early-stopping criterion fires (or 100 epochs
  elapse), the wall-clock training time is recorded and the test accuracy is
  printed.  The per-epoch accuracy/loss curve is plotted for the SGD / batch
  size 32 run, and one "time vs batch size" figure is plotted per optimizer.
  """
  mnist = tf.keras.datasets.mnist
  (X_train, y_train), (X_test, y_test) = mnist.load_data()

  # Rescale by 255 and add the trailing channel axis expected by the Conv2D layer.
  X_train = (X_train / 255.0).reshape((-1, 28, 28, 1))
  X_test = (X_test / 255.0).reshape((-1, 28, 28, 1))

  # (display name, optimizer class, learning rate).
  # Optimizers are instantiated per run so that no optimizer state (slot
  # variables, iteration counters) leaks from one experiment into the next.
  # Learning-rate notes kept from earlier experiments (accuracy observed):
  #   SGD:     eta 0.01  -> ~0.95, eta 0.001 -> ~0.91
  #   Adagrad: eta 0.01  -> ~0.96, eta 0.001 -> ~0.11
  #   Adam:    eta 0.009 -> ~0.95, eta 0.001 -> ~0.11
  optimizer_specs = [
      ("SGD", tf.optimizers.SGD, 0.01),
      ("Adagrad", tf.optimizers.Adagrad, 0.01),
      ("Adam", tf.optimizers.Adam, 0.009),
  ]
  batch = [32, 64, 96, 128]

  # Testing Optimizers
  for names, make_optimizer, eta in optimizer_specs:
    time_elapsed = []

    # Testing Batches
    for j in batch:
      print("_"*100)
      print("OPTIMIZER: ", names)
      print("BATCH SIZE: ", j)
      print("_"*100)
      print()

      # Start every run from freshly initialised weights; reusing a trained
      # model would make later batch sizes appear to converge immediately.
      model = build_model()
      model.compile(
          optimizer=make_optimizer(learning_rate=eta),
          # build_model() ends in a softmax layer, so the network outputs
          # probabilities, not logits.
          loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
          metrics=['accuracy'])

      # TODO: Tweak convergence criterion
      # Callbacks for convergence criterion: stop once the training loss no
      # longer improves by at least min_delta; keep the most accurate model.
      es = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta = 0.001, verbose = 1)
      checkpoint = tf.keras.callbacks.ModelCheckpoint("best_model", monitor = 'accuracy' ,verbose = 1, mode = 'max', save_best_only = True)
      callbacks = [es, checkpoint]

      # time.clock() was removed in Python 3.8; perf_counter() is the
      # monotonic high-resolution replacement for measuring elapsed time.
      start_time = time.perf_counter()
      history = model.fit(X_train, y_train, batch_size = j, epochs = 100, callbacks = callbacks)
      end_time = time.perf_counter()
      time_elapsed.append(end_time - start_time)

      test_loss, test_acc = model.evaluate(X_test,  y_test, verbose=1)
      print("="*100)
      print("Test Accuracy: ", test_acc)
      print("="*100)

      if names == "SGD" and j == 32:
        plot_sgd_ctl(history)

      # Release the Keras global state of this run; the next iteration builds
      # a brand-new model, so nothing is used after the session is cleared.
      keras.backend.clear_session()

    plot_batch_time(time_elapsed, batch, names)


def build_model():
  """Assemble the (uncompiled) CNN described in the problem statement.

  Layer stack, in order: 3x3 convolution with 20 filters and ReLU on a
  28x28x1 input, 2x2 max-pool, dropout 0.5, flatten, 128-unit dense layer
  with ReLU, dropout 0.5, and a 10-unit softmax output layer.

  Returns:
    A keras.Sequential model; the caller is responsible for compiling it.
  """
  net = keras.Sequential()

  # Convolutional feature extractor.
  net.add(keras.layers.Conv2D(filters= 20, kernel_size=3, data_format='channels_last',
                              activation='relu', input_shape = (28,28,1)))
  net.add(keras.layers.MaxPool2D(pool_size = 2))
  net.add(keras.layers.Dropout(rate = 0.5))

  # Fully connected classifier head.
  net.add(keras.layers.Flatten())
  net.add(keras.layers.Dense(units = 128, activation = 'relu', use_bias=True))
  net.add(keras.layers.Dropout(rate = 0.5))
  net.add(keras.layers.Dense(units = 10, activation='softmax'))

  return net

# Graphs the per-epoch training accuracy and loss for SGD
def plot_sgd_ctl(history):
  """Plot the 'accuracy' and 'loss' series of a Keras History on one figure.

  Args:
    history: object whose ``history`` attribute maps 'accuracy' and 'loss'
      to per-epoch sequences (as returned by ``model.fit``).
  """
  curves = ['accuracy', 'loss']
  for metric in curves:
    plt.plot(history.history[metric])

  plt.title('SGD: Model Accuracy/Loss per Epoch')
  plt.xlabel('Epoch #')
  plt.ylabel('Accuracy/Loss')
  plt.legend(curves, loc='upper left')
  plt.grid()
  plt.show()
  # Uncomment when you want to save
  # plt.savefig("sgd_ctl_plot.png")

# Graphs convergence time against mini-batch size for one optimizer
def plot_batch_time(time, batch, name):
  """Plot time-to-convergence (y) against mini-batch size (x).

  Args:
    time: sequence of elapsed times, one per entry in ``batch``.
      (Inside this function the name shadows the ``time`` module.)
    batch: sequence of mini-batch sizes used for the x-axis.
    name: optimizer name, used in the figure title.
  """
  figure_title = f"{name}: Time vs Batch Size"

  plt.plot(batch, time)
  plt.title(figure_title)
  plt.xlabel("Batch Size")
  plt.ylabel("Time to Convergence")
  plt.grid()
  plt.show()
  # Uncomment when you want to save
  # plt.savefig(f"{name}_plot.png")


def main():
  """Script entry point: run the full set of CNN training experiments."""
  cnn()


# Launch the experiments only when executed as a script, not when imported.
if __name__ == "__main__":
  main()