In [1]:
import os
try:
  import wget
except: 
  !pip install wget
  import wget
import tarfile


out_dir = 'data/not_mnist'
small_arhive = f'{out_dir}/notMNIST_small.tar.gz'
large_arhive = f'{out_dir}/notMNIST_large.tar.gz'
large_url = 'https://commondatastorage.googleapis.com/books1000/notMNIST_large.tar.gz'
small_url = 'https://commondatastorage.googleapis.com/books1000/notMNIST_small.tar.gz'

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp36-none-any.whl size=9682 sha256=e1bb4d197cf80413558740827db93207d076b62fc77d683ef4b62d042fab0fb2
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [2]:
if not os.path.exists(out_dir):
  os.makedirs(out_dir)

if not os.path.exists(small_arhive):
  print(f"Downloading {small_arhive}.")
  wget.download(small_url, small_arhive)
  print()
else:
  print(f"Skipping {small_arhive} download (already exists)")

if not os.path.exists(large_arhive):
  print(f"Downloading {large_arhive}.")
  wget.download(large_url, large_arhive)
  print()
else:
  print(f"Skipping {large_arhive} download (already exists)")

Downloading data/not_mnist/notMNIST_small.tar.gz.

Downloading data/not_mnist/notMNIST_large.tar.gz.



In [3]:
print(f"Extracting {small_arhive}")
with tarfile.open(small_arhive) as tar:
  tar.extractall(out_dir)

print(f"Extracting {large_arhive}")
with tarfile.open(large_arhive) as tar:
  tar.extractall(out_dir)

Extracting data/not_mnist/notMNIST_small.tar.gz
Extracting data/not_mnist/notMNIST_large.tar.gz


In [0]:
import numpy as np
from pathlib import Path
from PIL import Image

def remove_duplicates(img_train, labels_train, img_test):
    img_new, labels_new = [], []
    test_set = {e.tostring() for e in img_test}
    for i, (x, y) in enumerate(zip(img_train, labels_train)):
        if x.tostring() not in test_set:
            img_new.append(x)
            labels_new.append(y)

    print(f'Removed {img_train.shape[0] - len(img_new)} duplicated images')
    return np.array(img_new), np.array(labels_new)

def load_images(path, n):
    labels = ['I', 'G', 'A', 'F', 'H', 'J', 'C', 'D', 'E', 'B']

    x, y = [], []
    for i, l in enumerate(labels):
        d = Path(path) / l
        print(f'Loading {str(d)} ', end='')
        for j, f in zip(range(n), d.iterdir()):
            try:
                with Image.open(f) as img:
                    x.append(np.array(img))
                    y.append(i)
            except OSError:
                pass
            if j % 1000 == 0:
                print('.', end='', flush=True)
        print(flush=True)
    return np.array(labels), np.array(x), np.array(y)

def load_not_mnist_data(path='data/not_mnist/', use_cache=True):
    train_folder = Path(path) / 'notMNIST_large'
    test_folder = Path(path) / 'notMNIST_small'

    train_cache_file = Path(path) / 'train.npz'
    test_cache_file = Path(path) / 'test.npz'

    if train_cache_file.exists() and test_cache_file.exists() and use_cache:
        f = np.load(train_cache_file)
        labels, img_train, labels_train = [v for k, v in f.items()]
        f = np.load(test_cache_file)
        labels, img_test, labels_test = [v for k, v in f.items()]
        print('Loaded cached arrays')

    else:
        labels, img_train, labels_train = load_images(train_folder, 10000000)
        labels, img_test, labels_test = load_images(test_folder, 10000000)
        np.savez(train_cache_file, labels, img_train, labels_train)
        np.savez(test_cache_file, labels, img_test, labels_test)

    return labels, img_train, labels_train, img_test, labels_test

In [5]:
load_not_mnist_data()

Loading data/not_mnist/notMNIST_large/I .....................................................
Loading data/not_mnist/notMNIST_large/G .....................................................
Loading data/not_mnist/notMNIST_large/A .....................................................
Loading data/not_mnist/notMNIST_large/F .....................................................
Loading data/not_mnist/notMNIST_large/H .....................................................
Loading data/not_mnist/notMNIST_large/J .....................................................
Loading data/not_mnist/notMNIST_large/C .....................................................
Loading data/not_mnist/notMNIST_large/D .....................................................
Loading data/not_mnist/notMNIST_large/E .....................................................
Loading data/not_mnist/notMNIST_large/B .....................................................
Loading data/not_mnist/notMNIST_small/I ..
Loading data/not_

(array(['I', 'G', 'A', 'F', 'H', 'J', 'C', 'D', 'E', 'B'], dtype='<U1'),
 array([[[255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255],
         ...,
         [255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255]],
 
        [[  0,  94, 163, ..., 213, 212, 114],
         [  0, 101, 255, ..., 207, 255,  87],
         [  0,  21, 243, ..., 218, 247,  31],
         ...,
         [147, 255,  60, ..., 255,  13,   0],
         [205, 255, 249, ..., 255,  46,   0],
         [109, 111, 110, ..., 255, 140,   0]],
 
        [[247, 255, 255, ..., 255, 255, 249],
         [126, 146, 148, ..., 147, 146, 127],
         [  0,   0,   0, ...,   0,   0,   0],
         ...,
         [  0,   0,   0, ...,   0,   0,   0],
         [128, 148, 149, ..., 149, 148, 131],
         [249, 255, 255, ..., 255, 255, 246]],
 
        ...,
 
        [[129, 145, 159, ...,   0

In [6]:
labels, img_train, labels_train, img_test, labels_test = load_not_mnist_data()

Loaded cached arrays


In [7]:
img_train, labels_train = remove_duplicates(img_train, labels_train, img_test)

Removed 12213 duplicated images


In [0]:
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import train_test_split

def flatten(a):
    return a.reshape(a.shape[0], a.shape[1] * a.shape[2])

def load_data():
    labels, img_train, labels_train, img_test, labels_test = load_not_mnist_data()
    img_train, labels_train = remove_duplicates(img_train, labels_train, img_test)
    return labels, flatten(img_train), labels_train, flatten(img_test), labels_test

In [9]:

labels, img_train, labels_train, img_test, labels_test = load_not_mnist_data()
img_train, labels_train = remove_duplicates(img_train, labels_train, img_test)
img_train = np.reshape(img_train, img_train.shape + (1,))
img_test =  np.reshape(img_test, img_test.shape + (1,))   

Loaded cached arrays
Removed 12213 duplicated images


In [11]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models, Sequential
from keras.callbacks import EarlyStopping

results = {}
results.setdefault('val_acc', {})

model = Sequential([
            layers.Conv2D(16, 5, activation='relu', input_shape=(28, 28, 1)),
            layers.Conv2D(16, 5, activation='relu'),
            layers.Flatten(),
            layers.Dense(100, activation='relu'),
            layers.Dense(10, activation='softmax')
        ])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

callback = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=3)
model.fit(x=img_train, y=labels_train, epochs=10,
          validation_split=0.1)

print('\n# Evaluate')
result = model.evaluate(img_test, labels_test)
print(result)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

# Evaluate
[0.7155035734176636, 0.9296624660491943]


In [12]:
print(result)

[0.7155035734176636, 0.9296624660491943]


In [14]:
model2 = Sequential([
            layers.Conv2D(16, 5, activation='relu', input_shape=(28,28,1), padding='same'),
            layers.MaxPool2D(pool_size=(2, 2), padding='same'),
            layers.Flatten(),
            layers.Dense(100, activation='relu'),
            layers.Dense(10, activation='softmax')
        ])

model2.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

callback = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=3)
model2.fit(x=img_train, y=labels_train, epochs=10,
          validation_split=0.1)

print('\n# Evaluate')
result2 = model2.evaluate(img_test, labels_test)
print(result2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

# Evaluate
[0.3851199150085449, 0.9114505648612976]


In [15]:
model3 = Sequential([
            layers.Conv2D(6, 5, activation='tanh', input_shape=(28,28,1)),
            layers.AvgPool2D(pool_size=(2, 2), strides=(2, 2)),
            layers.Conv2D(16, 5, activation='tanh'),
            layers.AvgPool2D(pool_size=(2, 2), strides=(2, 2)),
            layers.Conv2D(120, 4, activation='tanh'),
            layers.Flatten(),
            layers.Dense(84, activation='tanh'),
            layers.Dense(10, activation='softmax')
        ])

model3.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model3.fit(x=img_train, y=labels_train, epochs=10,
          validation_split=0.1)

print('\n# Evaluate')
result3 = model3.evaluate(img_test, labels_test)
print(result3)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

# Evaluate
[0.23152212798595428, 0.9423733949661255]
