In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import tensorflow as tf
from tensorflow import keras

%load_ext autoreload
%autoreload 2

import main

In [103]:
# The ten notMNIST letter classes; a letter's position in the Series is used
# further down as its integer class id.
labels = pd.Series(list('ABCDEFGHIJ'))

# Root folders of the two image collections, each holding one sub-folder per letter.
test_path = '../blobs/notMNIST_small'
train_path = '../blobs/notMNIST_large'

Loading file names

In [104]:
from sklearn.model_selection import train_test_split

# Fixed seed so that the 10% training subsample and the val/test split are the
# same on every "Restart Kernel -> Run All".
SEED = 42

# Parallel lists: x_*_files[i] is an image path, y_*[i] is its integer class id.
x_train_files, y_train = [], []
x_val_files, y_val = [], []
x_test_files, y_test = [], []

for index, label in labels.items():
    train_dir = os.path.join(train_path, label)
    # os.listdir returns entries in arbitrary order; sorting makes the seeded
    # sample below pick the same files on every machine and every run.
    files = pd.Series(
        [os.path.join(train_dir, name) for name in sorted(os.listdir(train_dir))]
    )
    # NOTE: reducing the number of samples for faster processing
    files = files.sample(len(files) // 10, random_state=SEED)
    x_train_files.extend(files)
    y_train.extend([index] * len(files))

    # val and test must come from the same distribution, so both are carved
    # out of the small data set, half each.
    test_dir = os.path.join(test_path, label)
    test_files = [os.path.join(test_dir, name) for name in sorted(os.listdir(test_dir))]
    val, test = train_test_split(test_files, test_size=0.5, random_state=SEED)
    x_val_files.extend(val)
    y_val.extend([index] * len(val))
    x_test_files.extend(test)
    y_test.extend([index] * len(test))

# Every file must have exactly one label in each split.
print('# of train files:', len(x_train_files))
assert len(x_train_files) == len(y_train)
print('# of val files:', len(x_val_files))
assert len(x_val_files) == len(y_val)
print('# of test files:', len(x_test_files))
assert len(x_test_files) == len(y_test)

# of train files: 51696
# of val files: 9360
# of test files: 9364


Shuffling file names to avoid labels being consecutive

In [4]:
# Shuffle each split so that samples of one class are no longer adjacent.
# NOTE(review): the return value is ignored, so main.shuffle_in_unison
# presumably reorders both lists in place with the same permutation — confirm
# in main.py.
for split_files, split_labels in (
    (x_train_files, y_train),
    (x_val_files, y_val),
    (x_test_files, y_test),
):
    main.shuffle_in_unison(split_files, split_labels)

In [105]:
# Load every test image into one array and divide the pixel values by 255
# (assumes main.load_image yields 8-bit intensities — TODO confirm in main.py).
x_test = np.array(list(map(main.load_image, x_test_files))) / 255.0
# Labels as an ndarray, matching the image array.
y_test = np.array(y_test)
print('Test set:', x_test.shape)

Test set: (9364, 28, 28)


In [7]:
# Load every training image into one array and divide the pixel values by 255
# (assumes main.load_image yields 8-bit intensities — TODO confirm in main.py).
x_train = np.array(list(map(main.load_image, x_train_files))) / 255.0
# Labels as an ndarray, matching the image array.
y_train = np.array(y_train)
print(x_train.shape)

(51696, 28, 28)


In [106]:
# Load every validation image into one array and divide the pixel values by 255
# (assumes main.load_image yields 8-bit intensities — TODO confirm in main.py).
x_val = np.array(list(map(main.load_image, x_val_files))) / 255.0
# Labels as an ndarray, matching the image array.
y_val = np.array(y_val)
print('Validation set:', x_val.shape)

Validation set: (9360, 28, 28)


In [107]:
# Baseline fully connected classifier: the flattened image passes through two
# ReLU hidden layers (150 and 50 units) into 10 output units.
model = keras.Sequential()
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(150, activation='relu'))
model.add(keras.layers.Dense(50, activation='relu'))
# No softmax on the output: the loss below is configured with from_logits=True.
model.add(keras.layers.Dense(10))

# Integer class ids are used as targets, hence the sparse cross-entropy loss.
model.compile(
    optimizer='SGD',
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [89]:
# Keep the History object (as the later models do) instead of letting its repr
# leak into the cell output, and evaluate on the validation split after every
# epoch so that the train/validation gap (overfitting) is visible during training.
history = model.fit(x_train, y_train, epochs=30, validation_data=(x_val, y_val))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f464d128630>

In [90]:
# Loss (`score`) and accuracy of the baseline model on the held-out test split.
print('Test prediction:')
score, accuracy = model.evaluate(x=x_test, y=y_test)

Test prediction:


Logistic regression was at about 88% accuracy on 50k train samples. Here we got **92.5%** without any fine-tuning, although the model overfits.

Let's try **regularization**:

In [91]:
# Regularization hyper-parameters (tuned on the validation set). Each is named
# once so that a tuning run changes one line instead of several literals.
L2_FACTOR = 0.0001  # weight of the L2 penalty on every Dense kernel
DROP_RATE = 0.1     # fraction of activations dropped after each hidden layer

# Same architecture as the baseline, plus an L2 kernel penalty on all Dense
# layers and dropout after both hidden layers.
reg_model = keras.Sequential([
    keras.layers.Flatten(),
    keras.layers.Dense(150, activation='relu', kernel_regularizer=keras.regularizers.l2(L2_FACTOR)),
    keras.layers.Dropout(DROP_RATE),
    keras.layers.Dense(50, activation='relu', kernel_regularizer=keras.regularizers.l2(L2_FACTOR)),
    keras.layers.Dropout(DROP_RATE),
    # Raw logits: the loss below is configured with from_logits=True.
    keras.layers.Dense(10, kernel_regularizer=keras.regularizers.l2(L2_FACTOR))
])
reg_model.compile(
    optimizer='sgd',
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

In [92]:
# Train the regularized model; validation metrics are recorded every epoch so
# the effect of dropout / L2 on the train-validation gap can be read from
# reg_history.history.
reg_history = reg_model.fit(x_train, y_train, epochs=30, validation_data=(x_val, y_val))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [112]:
# The validation metrics are what the dropout rate and the L2 factor are tuned against.
print('Validation prediction:')
score, accuracy = reg_model.evaluate(x=x_val, y=y_val)

Validation prediction:


In [113]:
# Loss (`score`) and accuracy of the regularized model on the held-out test split.
print('Test prediction:')
score, accuracy = reg_model.evaluate(x=x_test, y=y_test)

Test prediction:


How about adding **learning rate decay**:

Using regularization increased the accuracy, though not by much, about **1%**. But now we can safely run even more epochs without the fear of overfitting.

In [114]:
# Same architecture as reg_model. clone_model creates new layers with freshly
# initialized weights, so this model is trained from scratch rather than
# continuing from reg_model's trained state.
adaptive_model = keras.models.clone_model(reg_model)

# Inverse-time learning-rate decay: lr(step) = 0.01 / (1 + 1e-5 * step).
# This is the schedule that SGD(decay=1e-5) applied with SGD's default rate of
# 0.01; it is spelled out as a schedule object because newer Keras optimizers
# no longer accept the `decay` argument.
lr_schedule = keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.01,
    decay_steps=1,
    decay_rate=1e-5,
)
# We can also try using other algorithms instead of decay in SGD
adaptive_model.compile(
    optimizer=keras.optimizers.SGD(learning_rate=lr_schedule),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

In [100]:
# Train the model with learning-rate decay; validation metrics are recorded
# every epoch so its curve can be compared with the other runs.
adaptive_history = adaptive_model.fit(x_train, y_train, epochs=30, validation_data=(x_val, y_val))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [110]:
# Loss (`score`) and accuracy of the learning-rate-decay model on the validation split.
print('Validation prediction:')
score, accuracy = adaptive_model.evaluate(x=x_val, y=y_val)

Validation prediction:


In [111]:
# Loss (`score`) and accuracy of the learning-rate-decay model on the held-out test split.
print('Test prediction:')
score, accuracy = adaptive_model.evaluate(x=x_test, y=y_test)

Test prediction:


Learning rate decay doesn't seem to help much here; the results are slightly worse. We could try training for more epochs.

What else we can **try**:

- load all the available train data
- train for more epochs
- try other optimization algorithms, e.g. Adam
- play with the number of layers and hidden neurons using validation set
- try one-hot output encoding
- change activation functions (unlikely)