# Gradient Descent

In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf  # deep learning library. Tensors are just multi-dimensional arrays
import tensorflow.keras.backend as K

## Data Import

In [4]:
# MNIST: 28x28 images of handwritten digits together with their labels.
mnist = tf.keras.datasets.mnist

# load_data() yields a (train, test) pair; each half is an (images, labels) tuple.
train_split, test_split = mnist.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [0]:
# tf.keras.utils.normalize rescales the data to unit norm along the given axis
# (L2 norm unless `order` is passed). This is not min-max scaling.
# NOTE(review): the original note said values end up between 0 and 1, which
# holds for non-negative pixel data -- confirm this is the intended preprocessing.
x_train, x_test = (
    tf.keras.utils.normalize(images, axis=1) for images in (x_train, x_test)
)

## Compare Learning Rates

In [0]:
from tensorflow.keras.optimizers import SGD

# One training-history DataFrame per learning rate, in the order of `learning_rates`.
dflist = []

learning_rates = [0.01, 0.05, 0.1, 0.5]

for lr in learning_rates:

  # Reset Keras state so each run trains a freshly initialised model.
  K.clear_session()

  model = tf.keras.models.Sequential()
  model.add(tf.keras.layers.Flatten())  # takes our 28x28 and makes it 1x784
  model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))  # fully-connected layer, 128 units, relu activation
  model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))  # second hidden layer, same shape
  model.add(tf.keras.layers.Dense(10, activation=tf.nn.softmax))  # output layer: 10 units for 10 classes, softmax for a probability distribution

  # `learning_rate` is the supported keyword; the `lr` alias is deprecated and
  # rejected by newer Keras releases. Only the learning rate varies between runs.
  model.compile(optimizer=SGD(learning_rate=lr),
                loss='sparse_categorical_crossentropy',  # labels are integer class ids
                metrics=['accuracy'])  # what to track

  # `epochs` is not passed, so fit() runs for its default number of epochs.
  h = model.fit(x_train, y_train, batch_size=16, verbose=0)

  # h.history maps metric name -> per-epoch values; h.epoch supplies the row index.
  dflist.append(pd.DataFrame(h.history, index=h.epoch))

In [15]:
# Metric names recorded by fit(); `h` is the history of the last model trained in the loop above.
h.history.keys()

dict_keys(['loss', 'acc'])

In [0]:
# Put the per-learning-rate history frames side by side, one column group per run.
historydf = pd.concat(objs=dflist, axis='columns')

In [0]:
# Label every column with the (learning_rate, metric) pair it belongs to.
# The first frame's columns are taken as the metric names for every run.
metrics_reported = dflist[0].columns
idx = pd.MultiIndex.from_product(
    iterables=[learning_rates, metrics_reported],
    names=['learning_rate', 'metric'],
)

historydf.columns = idx

In [23]:
# Show the combined table: one row per epoch, columns grouped by (learning_rate, metric).
historydf

learning_rate,0.01,0.01,0.05,0.05,0.10,0.10,0.50,0.50
metric,acc,loss,acc,loss,acc,loss,acc,loss
0,0.81235,0.741103,0.8938,0.367638,0.912683,0.292995,0.92175,0.259577


## Batch Size

In [0]:
dflist = []  # one training-history frame per batch size

batch_sizes = [16, 32, 64, 128]

for batch_size in batch_sizes:
  K.clear_session()  # start every run from clean Keras state

  # Flatten (28x28 -> 784) -> Dense(128, relu) -> Dense(128, relu) -> Dense(10, softmax)
  model = tf.keras.models.Sequential([
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(128, activation=tf.nn.relu),
      tf.keras.layers.Dense(128, activation=tf.nn.relu),
      tf.keras.layers.Dense(10, activation=tf.nn.softmax),
  ])

  model.compile(
      optimizer='sgd',  # optimizer built from its string name, i.e. with its default settings
      loss='sparse_categorical_crossentropy',  # the quantity the network minimises
      metrics=['accuracy'],  # tracked alongside the loss
  )

  # Only the batch size changes from one run to the next.
  h = model.fit(x_train, y_train, batch_size=batch_size, verbose=0)

  # h.history maps metric name -> per-epoch values; h.epoch supplies the row index.
  dflist.append(pd.DataFrame(h.history, index=h.epoch))

In [0]:
# Join the per-batch-size histories column-wise, then label each column
# with the (batch_size, metric) pair it belongs to.
historydf = pd.concat(objs=dflist, axis='columns')
metrics_reported = dflist[0].columns
idx = pd.MultiIndex.from_product(
    iterables=[batch_sizes, metrics_reported],
    names=['batch_size', 'metric'],
)
historydf.columns = idx

In [30]:
# Show the combined table: one row per epoch, columns grouped by (batch_size, metric).
historydf

batch_size,16,16,32,32,64,64,128,128
metric,acc,loss,acc,loss,acc,loss,acc,loss
0,0.805883,0.758128,0.739983,1.078245,0.613867,1.606198,0.46,2.013939


## Optimizers

In [0]:
from tensorflow.keras.optimizers import SGD, Adam, Adagrad, RMSprop

In [0]:
dflist = []  # one training-history frame per optimizer configuration

# Each entry pairs a results-table label with a factory that builds a fresh
# optimizer. Explicit callables replace eval() on the label strings. The labels
# are kept byte-for-byte so the column headers of the results table do not
# change, while the factories use `learning_rate`, the supported spelling of
# the deprecated `lr` keyword.
optimizer_specs = [
    ('SGD(lr=0.01)',
     lambda: SGD(learning_rate=0.01)),
    ('SGD(lr=0.01, momentum=0.3)',
     lambda: SGD(learning_rate=0.01, momentum=0.3)),
    ('SGD(lr=0.01, momentum=0.3, nesterov=True)',
     lambda: SGD(learning_rate=0.01, momentum=0.3, nesterov=True)),
    ('Adam(lr=0.01)',
     lambda: Adam(learning_rate=0.01)),
    ('Adagrad(lr=0.01)',
     lambda: Adagrad(learning_rate=0.01)),
    ('RMSprop(lr=0.01)',
     lambda: RMSprop(learning_rate=0.01)),
]

# Labels only, in run order; the cell that builds the column index reads this list.
optimizers = [label for label, _ in optimizer_specs]

for opt_name, make_optimizer in optimizer_specs:

  # Reset Keras state between runs, as the other comparison loops in this notebook do.
  K.clear_session()

  model = tf.keras.models.Sequential()
  model.add(tf.keras.layers.Flatten())  # takes our 28x28 and makes it 1x784
  model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))  # fully-connected layer, 128 units, relu activation
  model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))  # second hidden layer, same shape
  model.add(tf.keras.layers.Dense(10, activation=tf.nn.softmax))  # output layer: 10 units for 10 classes, softmax for a probability distribution

  # The optimizer is created after clear_session() so it belongs to the fresh session.
  model.compile(optimizer=make_optimizer(),
                loss='sparse_categorical_crossentropy',  # the quantity the network minimises
                metrics=['accuracy'])  # what to track

  h = model.fit(x_train, y_train, batch_size=16, verbose=0)

  # h.history maps metric name -> per-epoch values; h.epoch supplies the row index.
  dflist.append(pd.DataFrame(h.history, index=h.epoch))

In [0]:
# Join the per-optimizer histories column-wise, then label each column
# with the (optimizer label, metric) pair it belongs to.
historydf = pd.concat(objs=dflist, axis='columns')
metrics_reported = dflist[0].columns
idx = pd.MultiIndex.from_product(
    iterables=[optimizers, metrics_reported],
    names=['optimizers', 'metric'],
)
historydf.columns = idx

In [34]:
# Show the combined table: one row per epoch, columns grouped by (optimizers, metric).
historydf

optimizers,SGD(lr=0.01),SGD(lr=0.01),"SGD(lr=0.01, momentum=0.3)","SGD(lr=0.01, momentum=0.3)","SGD(lr=0.01, momentum=0.3, nesterov=True)","SGD(lr=0.01, momentum=0.3, nesterov=True)",Adam(lr=0.01),Adam(lr=0.01),Adagrad(lr=0.01),Adagrad(lr=0.01),RMSprop(lr=0.01),RMSprop(lr=0.01)
metric,acc,loss,acc,loss,acc,loss,acc,loss,acc,loss,acc,loss
0,0.800367,0.787923,0.832383,0.630575,0.84125,0.619871,0.91785,0.290038,0.92875,0.240809,0.91375,0.414608


## Initialization

In [0]:
dflist = []  # one training-history frame per kernel initializer

initializers = [
    'zeros',
    'uniform',
    'normal',
    'he_normal',
    'lecun_uniform',
]

for init in initializers:

  K.clear_session()  # start every run from clean Keras state

  # Same architecture as the other experiments; only the kernel initializer
  # of the three Dense layers changes between runs.
  model = tf.keras.models.Sequential([
      tf.keras.layers.Flatten(),  # 28x28 -> 784
      tf.keras.layers.Dense(128, kernel_initializer=init, activation=tf.nn.relu),
      tf.keras.layers.Dense(128, kernel_initializer=init, activation=tf.nn.relu),
      tf.keras.layers.Dense(10, kernel_initializer=init, activation=tf.nn.softmax),  # 10 classes
  ])

  model.compile(
      optimizer='rmsprop',  # optimizer built from its string name, i.e. with its default settings
      loss='sparse_categorical_crossentropy',  # the quantity the network minimises
      metrics=['accuracy'],  # tracked alongside the loss
  )

  h = model.fit(x_train, y_train, batch_size=16, verbose=0)

  # h.history maps metric name -> per-epoch values; h.epoch supplies the row index.
  dflist.append(pd.DataFrame(h.history, index=h.epoch))

In [0]:
# Join the per-initializer histories column-wise, then label each column
# with the (initializer, metric) pair it belongs to.
historydf = pd.concat(objs=dflist, axis='columns')
metrics_reported = dflist[0].columns
idx = pd.MultiIndex.from_product(
    iterables=[initializers, metrics_reported],
    names=['initializers', 'metric'],
)

historydf.columns = idx

In [37]:
# Show the combined table: one row per epoch, columns grouped by (initializers, metric).
historydf

initializers,zeros,zeros,uniform,uniform,normal,normal,he_normal,he_normal,lecun_uniform,lecun_uniform
metric,acc,loss,acc,loss,acc,loss,acc,loss,acc,loss
0,0.111767,2.301561,0.910033,0.302547,0.919583,0.274092,0.92925,0.24182,0.923467,0.252066
