Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [13]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from tensorflow import keras

In [94]:
from sklearn.metrics import classification_report

First reload the data we generated in `1_notmnist.ipynb`.

In [14]:
pickle_file = 'notMNIST.pickle'

# Read back the dictionary of arrays written by 1_notmnist.ipynb.
# NOTE(review): unpickling can execute arbitrary code, so only open a
# notMNIST.pickle that you generated yourself.
with open(pickle_file, 'rb') as f:
    save = pickle.load(f)

# Unpack the three splits into the names the rest of the notebook uses.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # drop the container so only the unpacked arrays stay referenced

print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [15]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
    """Flatten every image to one row and one-hot encode the class ids.

    Returns a ``(flat_images, one_hot_labels)`` pair, both float32:
    ``flat_images`` has ``image_size * image_size`` columns and
    ``one_hot_labels`` has ``num_labels`` columns.
    """
    pixels_per_image = image_size * image_size
    flat_images = dataset.reshape((-1, pixels_per_image)).astype(np.float32)
    # Broadcasting a column of class ids against the row 0..num_labels-1 marks
    # the matching position per sample, e.g. 1 -> [0.0, 1.0, 0.0, ...].
    class_row = np.arange(num_labels)
    one_hot_labels = (labels[:, None] == class_row).astype(np.float32)
    return flat_images, one_hot_labels
# Convert all three splits to flat float32 images and one-hot float32 labels.
# NOTE(review): the original names are overwritten in place, so this cell is
# not idempotent: running it a second time without reloading the pickle would
# feed already one-hot labels back into reformat(). Reload the data first.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
# Print the new shapes as a quick check of the conversion.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [16]:
def accuracy(predictions, labels):
    """Return the percentage of rows whose argmax matches between the two arrays.

    ``predictions`` and ``labels`` are 2-D (one row per sample); the class of a
    row is the index of its largest entry.
    """
    predicted_classes = np.argmax(predictions, axis=1)
    true_classes = np.argmax(labels, axis=1)
    hits = np.sum(predicted_classes == true_classes)
    return 100.0 * hits / predictions.shape[0]

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

In [115]:
def classification_print(y_true, y_pred):
    """Print a classification report for the given targets and predictions.

    Either argument may already be a vector of class ids; anything with more
    than one dimension (one-hot labels, per-class scores) is first collapsed
    to class ids with an argmax along axis 1.
    """
    true_ids = np.argmax(y_true, axis=1) if np.ndim(y_true) > 1 else y_true
    pred_ids = np.argmax(y_pred, axis=1) if np.ndim(y_pred) > 1 else y_pred
    print(classification_report(true_ids, pred_ids))

In [245]:
def train_mnist(X_train=train_dataset, y_train=train_labels,
                X_test=test_dataset, y_test=test_labels,
                learning_rate=.1,
                dropout=False,
                dropout_rate=.5,
                nn=True,
                multiple_nn=False,
                regularizer=0.,
                epochs=2,
                batch_size=128,
                optimizer=keras.optimizers.SGD,
                callback=None,
                validation_data=(valid_dataset, valid_labels)):
    """Build, train and evaluate a dense Keras classifier on flattened images.

    The model is always a ``Flatten`` layer followed by a 10-way softmax
    ``Dense`` layer; the flags below insert ReLU hidden layers (and optionally
    dropout) in between. After training, the test accuracy and a per-class
    classification report are printed.

    Args:
        X_train, y_train: Training inputs and one-hot labels. Like the other
            dataset defaults, these are bound to the notebook-level arrays that
            exist when this ``def`` is executed, so re-run this cell after
            reloading the data.
        X_test, y_test: Data used for the final evaluation and report.
        learning_rate: Forwarded to ``optimizer`` as ``learning_rate`` (the
            notebook passes both a float and a Keras schedule object).
        dropout: If true (and ``nn`` is true), add a ``Dropout`` layer after
            the 1024-unit hidden layer.
        dropout_rate: Rate given to that ``Dropout`` layer.
        nn: If false, no hidden layer is added (plain logistic regression);
            ``multiple_nn`` and ``dropout`` are then ignored.
        multiple_nn: If true (and ``nn`` is true), add two 100-unit ReLU layers
            before the 1024-unit layer.
        regularizer: L2 factor applied to the kernel of every ``Dense`` layer.
        epochs, batch_size: Forwarded to ``model.fit``.
        optimizer: Callable returning an optimizer when called with
            ``learning_rate=...`` (for example a Keras optimizer class).
        callback: Optional list of Keras callbacks for ``model.fit``; ``None``
            means no callbacks.
        validation_data: Forwarded to ``model.fit``.

    Returns:
        ``(model, history)``: the trained model and the object returned by
        ``model.fit``.
    """
    # A literal ``[]`` default would be a single list shared by every call;
    # resolve the "no callbacks" case per call instead.
    callbacks = [] if callback is None else callback

    def dense(units, activation):
        # Each Dense layer gets its own L2 kernel-regularizer instance.
        return keras.layers.Dense(
            units,
            activation=activation,
            kernel_regularizer=keras.regularizers.l2(regularizer))

    model = keras.models.Sequential()
    model.add(keras.layers.Flatten(input_shape=(784,)))
    if nn:
        if multiple_nn:
            model.add(dense(100, tf.nn.relu))
            model.add(dense(100, tf.nn.relu))
        model.add(dense(1024, tf.nn.relu))
        if dropout:
            model.add(keras.layers.Dropout(dropout_rate))
    model.add(dense(10, tf.nn.softmax))

    model.compile(optimizer=optimizer(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    history = model.fit(X_train, y_train,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_data=validation_data,
                        callbacks=callbacks)

    # evaluate() returns [loss, accuracy] because of metrics=['accuracy'] above.
    test_accuracy = model.evaluate(X_test, y_test, verbose=0)[1]
    print(f'Test accuracy: {test_accuracy*100:.2f}% with regularizer: {regularizer}')
    classification_print(y_test, model.predict(X_test))
    return model, history

In [246]:
# Baseline: softmax-only model (nn=False skips every hidden layer), default L2 factor 0.
# NOTE(review): `logisitc_history` looks like a typo for `logistic_history`; left unchanged here.
logistic_model, logisitc_history = train_mnist(nn=False)

Train on 200000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2
Test accuracy: 89.61% with regularizer: 0.0
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      1000
           1       0.88      0.90      0.89      1000
           2       0.90      0.93      0.92      1000
           3       0.93      0.91      0.92      1000
           4       0.90      0.86      0.88      1000
           5       0.87      0.93      0.90      1000
           6       0.90      0.89      0.89      1000
           7       0.92      0.89      0.90      1000
           8       0.87      0.84      0.86      1000
           9       0.87      0.90      0.88      1000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [214]:
# Show the layers and parameter counts of the current logistic model.
logistic_model.summary()

Model: "sequential_92"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_91 (Flatten)         (None, 784)               0         
_________________________________________________________________
dense_136 (Dense)            (None, 10)                7850      
Total params: 7,850
Trainable params: 7,850
Non-trainable params: 0
_________________________________________________________________


In [163]:
# Logistic regression again, now with an L2 factor of 0.01 on the softmax kernel.
logistic_model, logisitc_history = train_mnist(
    nn=False,
    regularizer=.01,
)

Train on 200000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2
Test accuracy: 88.85% with regularizer:0.01
              precision    recall  f1-score   support

           0       0.92      0.89      0.91      1000
           1       0.93      0.87      0.90      1000
           2       0.91      0.93      0.92      1000
           3       0.91      0.91      0.91      1000
           4       0.91      0.86      0.89      1000
           5       0.91      0.91      0.91      1000
           6       0.90      0.89      0.90      1000
           7       0.92      0.87      0.89      1000
           8       0.80      0.86      0.83      1000
           9       0.80      0.89      0.85      1000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [162]:
# Logistic regression with a smaller L2 factor of 0.001.
logistic_model, logisitc_history = train_mnist(
    nn=False,
    regularizer=.001,
)

Train on 200000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2
Test accuracy: 89.71% with regularizer:0.001
              precision    recall  f1-score   support

           0       0.92      0.91      0.91      1000
           1       0.92      0.89      0.90      1000
           2       0.89      0.93      0.91      1000
           3       0.91      0.93      0.92      1000
           4       0.90      0.87      0.88      1000
           5       0.92      0.92      0.92      1000
           6       0.91      0.89      0.90      1000
           7       0.90      0.89      0.90      1000
           8       0.84      0.85      0.85      1000
           9       0.86      0.90      0.88      1000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [164]:
# Logistic regression with a deliberately large L2 factor of 1.
logistic_model, logisitc_history = train_mnist(
    nn=False,
    regularizer=1,
)

Train on 200000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2
Test accuracy: 72.71% with regularizer:1
              precision    recall  f1-score   support

           0       0.94      0.68      0.79      1000
           1       0.79      0.74      0.76      1000
           2       0.55      0.89      0.68      1000
           3       0.89      0.80      0.84      1000
           4       0.87      0.66      0.75      1000
           5       0.93      0.76      0.84      1000
           6       0.96      0.36      0.52      1000
           7       0.92      0.66      0.77      1000
           8       0.69      0.83      0.75      1000
           9       0.45      0.89      0.60      1000

    accuracy                           0.73     10000
   macro avg       0.80      0.73      0.73     10000
weighted avg       0.80      0.73      0.73     10000



In [146]:
# Baseline network with all defaults: one 1024-unit ReLU hidden layer, no L2 penalty, no dropout.
nn_model, nn_history = train_mnist()

Train on 200000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2
Test accuracy: 90.85% with regularizer:0.0
              precision    recall  f1-score   support

           0       0.93      0.92      0.92      1000
           1       0.91      0.89      0.90      1000
           2       0.91      0.94      0.93      1000
           3       0.93      0.92      0.92      1000
           4       0.93      0.88      0.90      1000
           5       0.92      0.93      0.92      1000
           6       0.91      0.91      0.91      1000
           7       0.93      0.90      0.91      1000
           8       0.87      0.88      0.87      1000
           9       0.86      0.93      0.89      1000

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000



In [147]:
# Same one-hidden-layer network with an L2 factor of 0.01 on every Dense kernel.
nn_model, nn_history = train_mnist(regularizer=.01)

Train on 200000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2
Test accuracy: 89.37% with regularizer:0.01
              precision    recall  f1-score   support

           0       0.94      0.90      0.91      1000
           1       0.92      0.87      0.89      1000
           2       0.90      0.92      0.91      1000
           3       0.92      0.91      0.91      1000
           4       0.91      0.85      0.88      1000
           5       0.87      0.94      0.90      1000
           6       0.92      0.89      0.90      1000
           7       0.91      0.88      0.90      1000
           8       0.85      0.86      0.85      1000
           9       0.83      0.92      0.87      1000

    accuracy                           0.89     10000
   macro avg       0.90      0.89      0.89     10000
weighted avg       0.90      0.89      0.89     10000



In [148]:
# Same network with an L2 factor of 0.001.
nn_model, nn_history = train_mnist(regularizer=.001)

Train on 200000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2
Test accuracy: 90.80% with regularizer:0.001
              precision    recall  f1-score   support

           0       0.94      0.91      0.92      1000
           1       0.92      0.89      0.90      1000
           2       0.92      0.93      0.92      1000
           3       0.92      0.93      0.92      1000
           4       0.92      0.88      0.90      1000
           5       0.90      0.94      0.92      1000
           6       0.91      0.91      0.91      1000
           7       0.92      0.89      0.91      1000
           8       0.87      0.86      0.87      1000
           9       0.87      0.94      0.90      1000

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000



In [150]:
# Same network with an L2 factor of 0.0001.
nn_model, nn_history = train_mnist(regularizer=.0001)

Train on 200000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2
Test accuracy: 90.64% with regularizer:0.0001
              precision    recall  f1-score   support

           0       0.94      0.91      0.92      1000
           1       0.91      0.88      0.90      1000
           2       0.93      0.93      0.93      1000
           3       0.90      0.92      0.91      1000
           4       0.92      0.89      0.90      1000
           5       0.89      0.94      0.91      1000
           6       0.91      0.91      0.91      1000
           7       0.91      0.90      0.91      1000
           8       0.87      0.87      0.87      1000
           9       0.88      0.92      0.90      1000

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000



In [149]:
# Deliberately excessive L2 factor of 1; the report below shows accuracy
# collapsing to 10% with a single class being predicted.
nn_model, nn_history = train_mnist(regularizer=1)

Train on 200000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2
Test accuracy: 10.00% with regularizer:1
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1000
           1       0.00      0.00      0.00      1000
           2       0.00      0.00      0.00      1000
           3       0.00      0.00      0.00      1000
           4       0.10      1.00      0.18      1000
           5       0.00      0.00      0.00      1000
           6       0.00      0.00      0.00      1000
           7       0.00      0.00      0.00      1000
           8       0.00      0.00      0.00      1000
           9       0.00      0.00      0.00      1000

    accuracy                           0.10     10000
   macro avg       0.01      0.10      0.02     10000
weighted avg       0.01      0.10      0.02     10000



  'precision', 'predicted', average, warn_for)


---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

In [167]:
# Problem 2: fit the logistic model on only the first 1000 training samples.
log_model, _ = train_mnist(
    X_train=train_dataset[:1000, :],
    y_train=train_labels[:1000, :],
    nn=False,
    regularizer=.001,
)

Train on 1000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2
Test accuracy: 79.80% with regularizer:0.001
              precision    recall  f1-score   support

           0       0.91      0.78      0.84      1000
           1       0.88      0.75      0.81      1000
           2       0.82      0.80      0.81      1000
           3       0.83      0.83      0.83      1000
           4       0.77      0.71      0.74      1000
           5       0.68      0.91      0.78      1000
           6       0.81      0.82      0.81      1000
           7       0.90      0.75      0.82      1000
           8       0.80      0.79      0.80      1000
           9       0.68      0.83      0.75      1000

    accuracy                           0.80     10000
   macro avg       0.81      0.80      0.80     10000
weighted avg       0.81      0.80      0.80     10000



In [168]:
# Problem 2: fit the one-hidden-layer network on the same 1000-sample subset.
nn_model, _ = train_mnist(
    X_train=train_dataset[:1000, :],
    y_train=train_labels[:1000, :],
    regularizer=.001,
)

Train on 1000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2
Test accuracy: 82.98% with regularizer:0.001
              precision    recall  f1-score   support

           0       0.77      0.84      0.80      1000
           1       0.90      0.81      0.85      1000
           2       0.79      0.91      0.84      1000
           3       0.81      0.88      0.84      1000
           4       0.89      0.76      0.82      1000
           5       0.76      0.92      0.83      1000
           6       0.91      0.78      0.84      1000
           7       0.91      0.80      0.85      1000
           8       0.78      0.84      0.81      1000
           9       0.85      0.77      0.80      1000

    accuracy                           0.83     10000
   macro avg       0.84      0.83      0.83     10000
weighted avg       0.84      0.83      0.83     10000



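Test accuracy is lower than with the full training set (79.80% and 82.98% here versus 89.71% and 90.80% with the same regularizer) due to overfitting to the 1,000 training samples.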

---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

In [146]:
# No-dropout baseline network (all defaults), shown for comparison with the dropout runs below.
nn_model, nn_history = train_mnist()

Train on 200000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2
Test accuracy: 90.85% with regularizer:0.0
              precision    recall  f1-score   support

           0       0.93      0.92      0.92      1000
           1       0.91      0.89      0.90      1000
           2       0.91      0.94      0.93      1000
           3       0.93      0.92      0.92      1000
           4       0.93      0.88      0.90      1000
           5       0.92      0.93      0.92      1000
           6       0.91      0.91      0.91      1000
           7       0.93      0.90      0.91      1000
           8       0.87      0.88      0.87      1000
           9       0.86      0.93      0.89      1000

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000



In [170]:
# Same network with a Dropout layer (default rate 0.5) after the 1024-unit hidden layer.
nn_model, _ = train_mnist(dropout=True)

Train on 200000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2
Test accuracy: 93.73% with regularizer:0.0
              precision    recall  f1-score   support

           0       0.94      0.95      0.95      1000
           1       0.94      0.93      0.93      1000
           2       0.94      0.96      0.95      1000
           3       0.93      0.95      0.94      1000
           4       0.96      0.92      0.94      1000
           5       0.96      0.95      0.95      1000
           6       0.92      0.94      0.93      1000
           7       0.95      0.94      0.94      1000
           8       0.93      0.90      0.91      1000
           9       0.91      0.95      0.93      1000

    accuracy                           0.94     10000
   macro avg       0.94      0.94      0.94     10000
weighted avg       0.94      0.94      0.94     10000



In [171]:
# Dropout combined with an L2 factor of 0.001.
nn_model, _ = train_mnist(
    dropout=True,
    regularizer=.001,
)

Train on 200000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2
Test accuracy: 92.87% with regularizer:0.001
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      1000
           1       0.92      0.93      0.92      1000
           2       0.93      0.95      0.94      1000
           3       0.93      0.93      0.93      1000
           4       0.95      0.92      0.93      1000
           5       0.93      0.95      0.94      1000
           6       0.92      0.93      0.92      1000
           7       0.97      0.91      0.94      1000
           8       0.92      0.89      0.90      1000
           9       0.87      0.95      0.91      1000

    accuracy                           0.93     10000
   macro avg       0.93      0.93      0.93     10000
weighted avg       0.93      0.93      0.93     10000



Dropout did a great job of reducing overfitting: without L2 regularization, test accuracy rose from 90.85% to 93.73%.

---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---


In [174]:
# Deeper model: two extra 100-unit ReLU layers ahead of the 1024-unit layer, plus dropout.
nn_model, _ = train_mnist(
    multiple_nn=True,
    dropout=True,
)

Train on 200000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2
Test accuracy: 93.33% with regularizer: 0.0
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      1000
           1       0.94      0.92      0.93      1000
           2       0.94      0.94      0.94      1000
           3       0.93      0.95      0.94      1000
           4       0.93      0.93      0.93      1000
           5       0.93      0.95      0.94      1000
           6       0.91      0.94      0.92      1000
           7       0.95      0.93      0.94      1000
           8       0.92      0.91      0.91      1000
           9       0.94      0.93      0.93      1000

    accuracy                           0.93     10000
   macro avg       0.93      0.93      0.93     10000
weighted avg       0.93      0.93      0.93     10000



Let's try more epochs.

In [179]:
# Same deeper model with dropout, trained for 10 epochs instead of the default 2.
nn_model, _ = train_mnist(
    multiple_nn=True,
    dropout=True,
    epochs=10,
)

Train on 200000 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 95.34% with regularizer: 0.0
              precision    recall  f1-score   support

           0       0.95      0.98      0.96      1000
           1       0.96      0.94      0.95      1000
           2       0.96      0.96      0.96      1000
           3       0.95      0.96      0.96      1000
           4       0.95      0.95      0.95      1000
           5       0.97      0.95      0.96      1000
           6       0.93      0.95      0.94      1000
           7       0.98      0.94      0.96      1000
           8       0.94      0.93      0.93      1000
           9       0.94      0.96      0.95      1000

    accuracy                           0.95     10000
   macro avg       0.95      0.95      0.95     10000
weighted avg       0.95      0.95      0.95     10000



Good job. We have time.

Now try a decaying learning rate.

In [192]:
initial_learning_rate = .5
# Staircase exponential decay: start at 0.5 and multiply by 0.96 once every
# 10000 steps.
# NOTE(review): assuming one step per batch update, the default batch size of
# 128 on 200000 samples gives roughly 1560 steps per epoch, so the rate only
# drops about every 6 epochs. Confirm this is the intended decay speed.
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=10000,
    decay_rate=.96,
    staircase=True,
)

In [193]:
# Deeper model with dropout for 10 epochs, now driven by the decaying learning-rate schedule.
nn_model, _ = train_mnist(
    multiple_nn=True,
    dropout=True,
    epochs=10,
    learning_rate=lr_schedule,
)

Train on 200000 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 95.19% with regularizer: 0.0
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1000
           1       0.96      0.94      0.95      1000
           2       0.96      0.97      0.96      1000
           3       0.94      0.97      0.95      1000
           4       0.95      0.95      0.95      1000
           5       0.96      0.96      0.96      1000
           6       0.96      0.95      0.95      1000
           7       0.98      0.94      0.96      1000
           8       0.93      0.92      0.92      1000
           9       0.94      0.96      0.95      1000

    accuracy                           0.95     10000
   macro avg       0.95      0.95      0.95     10000
weighted avg       0.95      0.95      0.95     10000



That didn't change much (95.19% test accuracy versus 95.34% without decay).

Now try more epochs with callbacks.

In [230]:
# Both callbacks watch the validation accuracy, which this Keras version logs
# as 'val_acc'; a monitor name has to match a key produced from
# validation_data (e.g. val_loss or val_acc).
# NOTE(review): these objects are reused by several fit() calls. The
# "did not improve from 0.90270" lines logged on every epoch further down
# suggest ModelCheckpoint keeps its best value between runs. Confirm, and
# consider creating fresh callbacks per experiment.
# NOTE(review): load_weights_on_restart looks version-specific. TODO confirm
# it exists in the TensorFlow release this notebook targets.
callback = keras.callbacks.EarlyStopping(
    monitor='val_acc',
    mode='max',
    patience=2,
)
checkpoint = keras.callbacks.ModelCheckpoint(
    'checkpoint/nn_model.ckpt',
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    load_weights_on_restart=True,
)

In [249]:
# Allow up to 100 epochs and let the early-stopping and checkpoint callbacks
# decide when to stop and what to save.
model, _ = train_mnist(
    multiple_nn=True,
    dropout=True,
    epochs=100,
    learning_rate=lr_schedule,
    callback=[callback, checkpoint],
)

Train on 200000 samples, validate on 10000 samples
Epoch 1/100
Epoch 00001: val_acc did not improve from 0.90270
Epoch 2/100
Epoch 00002: val_acc did not improve from 0.90270
Epoch 3/100
Epoch 00003: val_acc did not improve from 0.90270
Epoch 4/100
Epoch 00004: val_acc did not improve from 0.90270
Test accuracy: 95.44% with regularizer: 0.0
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      1000
           1       0.97      0.95      0.96      1000
           2       0.96      0.96      0.96      1000
           3       0.95      0.96      0.96      1000
           4       0.96      0.95      0.96      1000
           5       0.98      0.95      0.97      1000
           6       0.95      0.95      0.95      1000
           7       0.98      0.94      0.96      1000
           8       0.91      0.95      0.93      1000
           9       0.95      0.95      0.95      1000

    accuracy                           0.95     10000
   macr

Looks like about 95% is the best we can get?

Try another optimizer.

In [252]:
# Keep training the model from the previous cell on the full training set,
# using fit()'s default epochs and batch size and the same two callbacks.
# NOTE(review): no new optimizer is configured here. fit() reuses whatever the
# model was compiled with, and the captured log shows the loss going to nan.
# Confirm this is what was intended.
model.fit(
    train_dataset,
    train_labels,
    validation_data=(valid_dataset, valid_labels),
    callbacks=[callback, checkpoint],
)

Train on 200000 samples, validate on 10000 samples
 13824/200000 [=>............................] - ETA: 28s - loss: nan - acc: 0.1427

KeyboardInterrupt: 

In [238]:
# Save the trained model to 'test.h5', then print its architecture.
model.save('test.h5')
model.summary()

Model: "sequential_97"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_96 (Flatten)         (None, 784)               0         
_________________________________________________________________
dense_153 (Dense)            (None, 100)               78500     
_________________________________________________________________
dense_154 (Dense)            (None, 100)               10100     
_________________________________________________________________
dense_155 (Dense)            (None, 1024)              103424    
_________________________________________________________________
dropout_15 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_156 (Dense)            (None, 10)                10250     
Total params: 202,274
Trainable params: 202,274
Non-trainable params: 0
_______________________________________________

Further training on the existing model.

In [242]:
# Load the model back from 'test.h5' into a separate object.
new_model = keras.models.load_model('test.h5')

In [235]:
# Sanity check: the reloaded model must reproduce the in-memory model's
# predictions on the training set; assert_allclose raises if they differ.
np.testing.assert_allclose(model.predict(train_dataset),
                          new_model.predict(train_dataset))

In [243]:
# Print the reloaded model's architecture for comparison with the summary of the saved model.
new_model.summary()

Model: "sequential_97"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_96 (Flatten)         (None, 784)               0         
_________________________________________________________________
dense_153 (Dense)            (None, 100)               78500     
_________________________________________________________________
dense_154 (Dense)            (None, 100)               10100     
_________________________________________________________________
dense_155 (Dense)            (None, 1024)              103424    
_________________________________________________________________
dropout_15 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_156 (Dense)            (None, 10)                10250     
Total params: 202,274
Trainable params: 202,274
Non-trainable params: 0
_______________________________________________

In [244]:
# Evaluate the reloaded model on the test set; the cell output is [loss, accuracy].
new_model.evaluate(test_dataset, test_labels)



[0.22576687259974715, 0.9542]