# What is this?

Because MNIST does not contain arithmetic symbols (X, /, -, +), I needed to make my own. I made images of the digits and symbols in 10 fonts, each 28x28 pixels. These files are contained in the folder 'base_images_artificial/'. This file takes those images, applies multiple distortions to them, and exports the results into the folder 'distorted_from_artificial/'.


In [39]:
import os
import string
import random
import math
from hashlib import sha1

import numpy as np

from scipy import ndimage as ndi
from scipy.misc import imsave

from skimage.filters import gabor, gaussian
from skimage.transform import resize, warp
from skimage.util import random_noise
from skimage.morphology import dilation, erosion, rectangle, diamond, disk
from skimage.transform._geometric import ProjectiveTransform



Random string generator for file names

In [33]:
def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    """Return a random string of `size` characters drawn from `chars`.

    By default the string is six characters long and uses upper-case ASCII
    letters and digits.
    """
    picked = []
    for _ in range(size):
        picked.append(random.choice(chars))
    return ''.join(picked)

In [34]:
def save_image(folder, image):
    """Write `image` as a JPEG into SAVED_IMAGE_DIR/<folder>/ under a fresh name.

    The file is named "<folder>_<random id>.jpg". Every id handed out is
    recorded in UNIQUE_IDS so that no two saved images share a name.
    """
    # Keep drawing ids until one turns up that has not been used yet.
    while True:
        suffix = id_generator()
        if suffix not in UNIQUE_IDS:
            break
    UNIQUE_IDS.add(suffix)

    filename = folder + "_" + suffix + '.jpg'
    imsave(os.path.join(SAVED_IMAGE_DIR, folder, filename), image)
    

In [35]:
# Folder the base images are read from (one sub-folder per entry of DIR_LIST).
BASE_IMAGE_DIR = 'base_images_artificial/'
# Folder the distorted images are written to (same sub-folder layout).
SAVED_IMAGE_DIR = 'distorted_from_artificial/'
# Sub-folder names: the ten digits followed by the four symbol classes.
DIR_LIST = [str(digit) for digit in range(10)] + ['X', 'plus', 'minus', 'div']
# Width and height, in pixels, that every image is resized to.
IMAGE_SIZE = 28
# File-name suffixes already handed out by save_image.
UNIQUE_IDS = set()
# Hashes of the images saved so far, used to skip exact duplicates.
UNIQUE_IMAGES = set()


Function for clearing out the folders where the new images will be saved

In [36]:
def clear_saved_filtered_images():
    """Delete previously generated images and reset the bookkeeping sets.

    For every class folder in DIR_LIST, the files directly inside
    SAVED_IMAGE_DIR/<folder> are removed. Class folders that do not exist are
    skipped, and sub-directories are left untouched. Afterwards UNIQUE_IDS and
    UNIQUE_IMAGES are emptied so a new run starts from a clean state.
    """
    for folder in DIR_LIST:
        # Build the path with os.path.join so it does not depend on
        # SAVED_IMAGE_DIR ending in a path separator.
        folder_path = os.path.join(SAVED_IMAGE_DIR, folder)
        if not os.path.isdir(folder_path):
            # Nothing to clear for this class.
            continue
        for entry in os.listdir(folder_path):
            entry_path = os.path.join(folder_path, entry)
            # os.remove cannot delete directories, so only remove files.
            if os.path.isfile(entry_path):
                os.remove(entry_path)

    # Reinitialize UNIQUE_IDS and UNIQUE_IMAGES
    UNIQUE_IDS.clear()
    UNIQUE_IMAGES.clear()
            

Apply filters

In [46]:
def _image_digest(image):
    """Return the SHA-1 hex digest of an image's raw pixel bytes."""
    # The digest *string* is what must be stored and compared: sha1 hash
    # objects never compare equal to one another, so a set of hash objects
    # cannot detect duplicates.
    return sha1(np.ascontiguousarray(image).tobytes()).hexdigest()


def _save_if_new(folder, image):
    """Save `image` under `folder` unless an identical image was already saved.

    Returns True when the image was written, False when it was a duplicate.
    """
    digest = _image_digest(image)
    if digest in UNIQUE_IMAGES:
        return False
    save_image(folder, image)
    UNIQUE_IMAGES.add(digest)
    return True


def _load_base_images(folder):
    """Read every readable image in BASE_IMAGE_DIR/<folder> as float32 arrays.

    Each image is loaded in greyscale and resized to IMAGE_SIZE x IMAGE_SIZE.
    Files that cannot be read are reported and skipped. A list is returned
    (rather than a pre-allocated array) so that any number of images per
    folder works and no uninitialised slots are ever processed.
    """
    folder_path = os.path.join(BASE_IMAGE_DIR, folder)
    images = []
    for name in os.listdir(folder_path):
        image_file = os.path.join(folder_path, name)
        try:
            # NOTE(review): ndi.imread has been deprecated since SciPy 1.0 and
            # is missing from recent releases - confirm the pinned SciPy version.
            image_data = ndi.imread(image_file, mode='L')
        except IOError as e:
            print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')
            continue

        image_data = resize(image_data, (IMAGE_SIZE, IMAGE_SIZE))
        # Sanity check on the resize result.
        assert image_data.shape == (IMAGE_SIZE, IMAGE_SIZE)
        images.append(image_data.astype(np.float32))
    return images


def apply_filters_and_save():
    """Generate distorted variants of every base image and save them.

    For each class folder in DIR_LIST, every base image is saved unchanged and
    then saved again after each of these distortions:

    * gaussian random noise,
    * translations by -2..2 pixels along each axis,
    * erosion with a 1x1 rectangle and with a disk of radius 1,
    * gaussian blur with sigma 0.1, 0.2, ..., 1.0.

    A distorted image is skipped when an image with exactly the same bytes has
    already been saved (tracked through SHA-1 digests in UNIQUE_IMAGES).
    """
    for folder in DIR_LIST:
        for image in _load_base_images(folder):
            # default image: always saved, and remembered for de-duplication
            save_image(folder, image)
            UNIQUE_IMAGES.add(_image_digest(image))

            # random_noise
            _save_if_new(folder, random_noise(image, 'gaussian'))

            # translate image slightly. inversion is for preventing 0's from
            # filling border
            for i in range(-2, 3):
                for j in range(-2, 3):
                    shift = ProjectiveTransform(
                        np.array([[1, 0, i], [0, 1, j], [0, 0, 1]]))
                    _save_if_new(folder, 1 - warp(1 - image, shift))

            # erosions (no dilation is applied here)
            for i in range(1, 2):
                # rectangle/squares; a 1x1 footprint presumably leaves the
                # image unchanged, in which case it is skipped as a duplicate
                _save_if_new(folder, erosion(image, rectangle(i, i)))

                # circles
                _save_if_new(folder, erosion(image, disk(i)))

            # gaussian filter
            for sig in range(1, 11, 1):
                _save_if_new(folder, gaussian(image, sig * .1))

            #image = gabor(image, 2, 1*math.pi/10)[1]

# Run the pipeline: first remove the output of any earlier run (which also
# resets UNIQUE_IDS / UNIQUE_IMAGES), then generate and save the distorted images.
clear_saved_filtered_images()
apply_filters_and_save()