In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import imageio
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
from datetime import datetime

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline


# GET timestamp to display
def display_time():
    t = datetime.now()
    return t.strftime("[ %I:%M:%S %p %D ]")

In [2]:
num_classes = 10
np.random.seed(133)

def maybe_extract(filename, force=False):
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
  if os.path.isdir(root) and not force:
    # You may override by setting force=True.
    print('%s already present - Skipping extraction of %s.' % (root, filename))
  else:
    print('Extracting data for %s. This may take a while. Please wait.' % root)
    tar = tarfile.open(filename)
    sys.stdout.flush()
    tar.extractall(data_root)
    tar.close()
  data_folders = [
    os.path.join(root, d) for d in sorted(os.listdir(root))
    if os.path.isdir(os.path.join(root, d))]
  if len(data_folders) != num_classes:
    raise Exception(
      'Expected %d folders, one per class. Found %d instead.' % (
        num_classes, len(data_folders)))
  print(data_folders)
  return data_folders
  
train_folders = maybe_extract('./notMNIST_large')
test_folders = maybe_extract('./notMNIST_small')

./notMNIST_large already present - Skipping extraction of ./notMNIST_large.
['./notMNIST_large/A', './notMNIST_large/B', './notMNIST_large/C', './notMNIST_large/D', './notMNIST_large/E', './notMNIST_large/F', './notMNIST_large/G', './notMNIST_large/H', './notMNIST_large/I', './notMNIST_large/J']
./notMNIST_small already present - Skipping extraction of ./notMNIST_small.
['./notMNIST_small/A', './notMNIST_small/B', './notMNIST_small/C', './notMNIST_small/D', './notMNIST_small/E', './notMNIST_small/F', './notMNIST_small/G', './notMNIST_small/H', './notMNIST_small/I', './notMNIST_small/J']


In [3]:
image_size = 28  # Pixel width and height.
pixel_depth = 255.0  # Number of levels per pixel.

def load_letter(folder, min_num_images):
  """Load the data for a single letter label."""
  image_files = os.listdir(folder)
  dataset = np.ndarray(shape=(len(image_files), image_size, image_size),
                         dtype=np.float32)
  print(folder)
  num_images = 0
  for image in image_files:
    image_file = os.path.join(folder, image)
    try:
      image_data = (imageio.imread(image_file).astype(float) - 
                    pixel_depth / 2) / pixel_depth
      if image_data.shape != (image_size, image_size):
        raise Exception('Unexpected image shape: %s' % str(image_data.shape))
      dataset[num_images, :, :] = image_data
      num_images = num_images + 1
    except (IOError, ValueError) as e:
      print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')
    
  dataset = dataset[0:num_images, :, :]
  if num_images < min_num_images:
    raise Exception('Many fewer images than expected: %d < %d' %
                    (num_images, min_num_images))
    
  print('Full dataset tensor:', dataset.shape)
  print('Mean:', np.mean(dataset))
  print('Standard deviation:', np.std(dataset))
  return dataset
        
def maybe_pickle(data_folders, min_num_images_per_class, force=False):
  dataset_names = []
  for folder in data_folders:
    set_filename = folder + '.pickle'
    dataset_names.append(set_filename)
    if os.path.exists(set_filename) and not force:
      # You may override by setting force=True.
      print('%s already present - Skipping pickling.' % set_filename)
    else:
      print('Pickling %s.' % set_filename)
      dataset = load_letter(folder, min_num_images_per_class)
      try:
        with open(set_filename, 'wb') as f:
          pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
      except Exception as e:
        print('Unable to save data to', set_filename, ':', e)
  
  return dataset_names

train_datasets = maybe_pickle(train_folders, 45000)
test_datasets = maybe_pickle(test_folders, 1800)

./notMNIST_large/A.pickle already present - Skipping pickling.
./notMNIST_large/B.pickle already present - Skipping pickling.
./notMNIST_large/C.pickle already present - Skipping pickling.
./notMNIST_large/D.pickle already present - Skipping pickling.
./notMNIST_large/E.pickle already present - Skipping pickling.
./notMNIST_large/F.pickle already present - Skipping pickling.
./notMNIST_large/G.pickle already present - Skipping pickling.
./notMNIST_large/H.pickle already present - Skipping pickling.
./notMNIST_large/I.pickle already present - Skipping pickling.
./notMNIST_large/J.pickle already present - Skipping pickling.
./notMNIST_small/A.pickle already present - Skipping pickling.
./notMNIST_small/B.pickle already present - Skipping pickling.
./notMNIST_small/C.pickle already present - Skipping pickling.
./notMNIST_small/D.pickle already present - Skipping pickling.
./notMNIST_small/E.pickle already present - Skipping pickling.
./notMNIST_small/F.pickle already present - Skipping pi

In [4]:
def make_arrays(nb_rows, img_size):
  if nb_rows:
    dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
    labels = np.ndarray(nb_rows, dtype=np.int32)
  else:
    dataset, labels = None, None
  return dataset, labels

def merge_datasets(pickle_files, train_size, valid_size=0):
  num_classes = len(pickle_files)
  valid_dataset, valid_labels = make_arrays(valid_size, image_size)
  train_dataset, train_labels = make_arrays(train_size, image_size)
  vsize_per_class = valid_size // num_classes
  tsize_per_class = train_size // num_classes
    
  start_v, start_t = 0, 0
  end_v, end_t = vsize_per_class, tsize_per_class
  end_l = vsize_per_class+tsize_per_class
  for label, pickle_file in enumerate(pickle_files):       
    try:
      with open(pickle_file, 'rb') as f:
        letter_set = pickle.load(f)
        # let's shuffle the letters to have random validation and training set
        np.random.shuffle(letter_set)
        if valid_dataset is not None:
          valid_letter = letter_set[:vsize_per_class, :, :]
          valid_dataset[start_v:end_v, :, :] = valid_letter
          valid_labels[start_v:end_v] = label
          start_v += vsize_per_class
          end_v += vsize_per_class
                    
        train_letter = letter_set[vsize_per_class:end_l, :, :]
        train_dataset[start_t:end_t, :, :] = train_letter
        train_labels[start_t:end_t] = label
        start_t += tsize_per_class
        end_t += tsize_per_class
    except Exception as e:
      print('Unable to process data from', pickle_file, ':', e)
      raise
    
  return valid_dataset, valid_labels, train_dataset, train_labels
            
            
train_size = 200000
valid_size = 10000
test_size = 10000

valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
  train_datasets, train_size, valid_size)
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)


train_dataset = train_dataset.reshape((train_dataset.shape[0], train_dataset.shape[1],
                       train_dataset.shape[2], 1))

test_dataset = test_dataset.reshape((test_dataset.shape[0], test_dataset.shape[1],
                       test_dataset.shape[2], 1))

valid_dataset = valid_dataset.reshape((valid_dataset.shape[0], valid_dataset.shape[1],
                       valid_dataset.shape[2], 1))


print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)

Training: (200000, 28, 28, 1) (200000,)
Validation: (10000, 28, 28, 1) (10000,)
Testing: (10000, 28, 28, 1) (10000,)


In [5]:
def randomize(dataset, labels):
  permutation = np.random.permutation(labels.shape[0])
  shuffled_dataset = dataset[permutation,:,:]
  shuffled_labels = labels[permutation]
  return shuffled_dataset, shuffled_labels
train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)

In [6]:
import tensorflow as tf

reset_graph()


conv1_fmaps = 32
conv1_ksize = 3
conv1_stride = 1
conv1_pad = "SAME"

conv2_fmaps = 64
conv2_ksize = 3
conv2_stride = 2
conv2_pad = "SAME"

pool3_fmaps = conv2_fmaps

n_fc1 = 4096
n_fc2 = 1024
n_outputs = 10


In [7]:
X = tf.placeholder(tf.float32, shape=[None, 28, 28, 1], name="X")
y = tf.placeholder(tf.int32, shape=[None], name="y")
training = tf.placeholder_with_default(False, shape=(), name='training')

In [8]:
with tf.name_scope("conv_layers"):
    conv1 = tf.layers.conv2d(X, filters=conv1_fmaps, 
                            kernel_size=conv1_ksize, 
                            strides=conv1_stride,
                            padding=conv1_pad,
                            activation=tf.nn.relu,
                            name="conv1")

    conv2 = tf.layers.conv2d(conv1, filters=conv2_fmaps, 
                            kernel_size=conv2_ksize, 
                            strides=conv2_stride,
                            padding=conv2_pad,
                            activation=tf.nn.relu,
                            name="conv2")

    pool3 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
    pool3_flat = tf.reshape(pool3, shape=[-1, pool3_fmaps * 7 * 7])

In [9]:
dropout_rate = 0.5  # == 1 - keep_prob
#X = tf.layers.dropout(pool3_flat, dropout_rate, training=training)

with tf.name_scope("fully_connected"):
    hidden1 = tf.layers.dense(pool3_flat, n_fc1, 
                             activation=tf.nn.relu, name="hidden1")
    hidden_dp = tf.layers.dropout(hidden1, dropout_rate, training=training)
    
    hidden2 = tf.layers.dense(hidden_dp, n_fc2, 
                             activation=tf.nn.relu, name="hidden2")
    hidden2_dp = tf.layers.dropout(hidden_dp, dropout_rate, training=training)
    
with tf.name_scope("output"):
    logits = tf.layers.dense(hidden2_dp, n_outputs, name="outputs")

In [10]:
learning_rate = 0.01
    
with tf.name_scope("train"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y)
    loss = tf.reduce_mean(xentropy)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

In [11]:
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

In [12]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [13]:
n_epochs = 10
batch_size = 100
train_size_batch = 200000


#extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        shuffled_indices = np.random.permutation(train_size)
        X_b_shuffled = train_dataset[shuffled_indices]
        y_shuffled = train_labels[shuffled_indices]
        
        for i in range(0, train_size_batch, batch_size):
            xi = X_b_shuffled[i:i+batch_size]
            yi = y_shuffled[i:i+batch_size]
            sess.run(training_op, feed_dict={X:xi, y:yi, training : True})
        
        acc_train = accuracy.eval(feed_dict={X:xi, y:yi})
        acc_test = accuracy.eval(feed_dict={X:test_dataset, y:test_labels})
        acc_valid = accuracy.eval(feed_dict={X:valid_dataset, y:valid_labels})
        print( display_time(), epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test, "Validate accuracy:", acc_valid)

    save_path = saver.save(sess, "./notMnist-4096-cnn-dp.ckpt")

0 Train accuracy: 0.85 Test accuracy: 0.8973
1 Train accuracy: 0.88 Test accuracy: 0.9111
2 Train accuracy: 0.88 Test accuracy: 0.9247
3 Train accuracy: 0.94 Test accuracy: 0.9328
4 Train accuracy: 0.93 Test accuracy: 0.938
5 Train accuracy: 0.81 Test accuracy: 0.9428
6 Train accuracy: 0.9 Test accuracy: 0.9476
7 Train accuracy: 0.91 Test accuracy: 0.9495
8 Train accuracy: 0.92 Test accuracy: 0.9521
9 Train accuracy: 0.87 Test accuracy: 0.953


In [18]:
n_epochs = 50
batch_size = 200
train_size_batch = 200000

#extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)


with tf.Session() as sess:
    saver.restore(sess, "./notMnist-4096-cnn-dp.ckpt") # or better, use save_path
    print(display_time(), "- START")
    for epoch in range(n_epochs):
        shuffled_indices = np.random.permutation(train_size)
        X_b_shuffled = train_dataset[shuffled_indices]
        y_shuffled = train_labels[shuffled_indices]
        
        for i in range(0, train_size_batch , batch_size):
            xi = X_b_shuffled[i:i+batch_size]
            yi = y_shuffled[i:i+batch_size]
            sess.run(training_op, feed_dict={X:xi, y:yi, training: True})
        
        acc_train = accuracy.eval(feed_dict={X:xi, y:yi})
        acc_test = accuracy.eval(feed_dict={X:test_dataset, y:test_labels})
        acc_valid = accuracy.eval(feed_dict={X:valid_dataset, y:valid_labels})
        print(display_time(), epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test, "Validate accuracy:", acc_valid)

    save_path = saver.save(sess, "./notMnist-4096-cnn-dp.ckpt")

INFO:tensorflow:Restoring parameters from ./notMnist-4096-cnn-dp.ckpt
[ 02:15:28 AM 01/06/18 ] - START
[ 02:25:52 AM 01/06/18 ] 0 Train accuracy: 1.0 Test accuracy: 0.9751 Validate accuracy: 0.9302
[ 02:36:17 AM 01/06/18 ] 1 Train accuracy: 1.0 Test accuracy: 0.9758 Validate accuracy: 0.9309
[ 02:46:41 AM 01/06/18 ] 2 Train accuracy: 0.995 Test accuracy: 0.9754 Validate accuracy: 0.9309
[ 02:57:03 AM 01/06/18 ] 3 Train accuracy: 0.995 Test accuracy: 0.9755 Validate accuracy: 0.9298
[ 03:07:29 AM 01/06/18 ] 4 Train accuracy: 0.99 Test accuracy: 0.9757 Validate accuracy: 0.9301
[ 03:17:53 AM 01/06/18 ] 5 Train accuracy: 0.995 Test accuracy: 0.9753 Validate accuracy: 0.9307
[ 03:28:17 AM 01/06/18 ] 6 Train accuracy: 1.0 Test accuracy: 0.9752 Validate accuracy: 0.9309
[ 03:38:41 AM 01/06/18 ] 7 Train accuracy: 0.985 Test accuracy: 0.9747 Validate accuracy: 0.9292
[ 03:49:05 AM 01/06/18 ] 8 Train accuracy: 1.0 Test accuracy: 0.9756 Validate accuracy: 0.9301
[ 03:59:30 AM 01/06/18 ] 9 Train 

In [19]:
with tf.Session() as sess:
    saver.restore(sess, "./notMnist-4096-cnn-dp.ckpt") # or better, use save_path
    Z = logits.eval(feed_dict={X: valid_dataset})
    y_pred = np.argmax(Z, axis=1)

INFO:tensorflow:Restoring parameters from ./notMnist-4096-cnn-dp.ckpt


In [20]:
from sklearn.metrics import accuracy_score

print("Accuracy : ", accuracy_score(y_pred, valid_labels))

Accuracy :  0.931


In [21]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_pred, valid_labels))

[[934   2   4   7   2  12   6   6   8   6]
 [  3 930   4  11  10   2   8   9  13   4]
 [  4   5 927   8  14   3  15   1   3   5]
 [ 11  16   2 942  10   3   2   4  10   9]
 [  3  11  15   5 925  12   3   6  10   6]
 [  6   8   2   2   6 933   5   6   5   5]
 [  7   8  26   4  13  13 937   6  11   8]
 [ 12   8   6   4   7   6   8 949   7   6]
 [ 12   3   9   9   4  12   9  10 912  30]
 [  8   9   5   8   9   4   7   3  21 921]]
