In [1]:
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
from scipy import ndimage
from __future__ import print_function
from IPython.display import Image
from six.moves import cPickle as pickle
%matplotlib inline

In [2]:
# Resolve the three SVHN split directories ('train', 'test', 'extra') relative to
# the directory the notebook is started from.
cwd = os.getcwd()
train_folder, test_folder, extra_folder = (
    os.path.join(cwd, split_name) for split_name in ('train', 'test', 'extra')
)

In [3]:
def remove_anomaly_samples(in_data, max_class_length = 5):
    """
    Remove, in place, every sample whose 'label' sequence is longer than 'max_class_length'.

    Args:
        in_data: list of dicts, each with a 'label' entry that supports len().
        max_class_length: largest label count a sample may have and still be kept.

    Returns:
        The same list object that was passed in, with the anomalous samples removed.
    """
    print("\n Dataset size is %d before removing images greater than class length %d" % (len(in_data), max_class_length))

    # Build the list of survivors instead of deleting while iterating by index:
    # a 'del in_data[j]' inside a 'range(len(in_data))' loop shifts the following
    # element into slot j, which the loop then never examines, so two anomalies
    # in a row would leave the second one in the dataset.
    kept = []
    for j, sample in enumerate(in_data):
        class_size = len(sample['label'])
        if class_size > max_class_length:
            # j is the position of the sample in the list as it was passed in.
            print("\nAnomaly at index %d detected with class size %d" % (j, class_size))
        else:
            kept.append(sample)

    # Slice assignment mutates the caller's list, so callers that ignore the
    # return value still see the filtered data.
    in_data[:] = kept

    print("\nDataset resized to %d after removing images (greater than class length %d)" % (len(in_data), max_class_length))
    return in_data

In [4]:
import h5py

# DigitStructsWrapper is a wrapper around the h5py data in digitStruct.mat. It exposes
#    names: file names of the images
#    bboxes: bounding-box struct data for each digit in an image

class DigitStructsWrapper:
    """ Read-only wrapper around a 'digitStruct.mat' file opened with h5py.

    Attributes (all set in __init__):
        in_file: the open h5py.File handle.
        names: the 'digitStruct/name' dataset, or its [in_start:in_end] slice.
        bboxes: the 'digitStruct/bbox' dataset, or its [in_start:in_end] slice.
        collectionSize: number of entries in 'names'.

    The instance can be used as a context manager so the file is closed on exit.
    """
    def __init__(self, in_file, in_start = 0, in_end = 0):
        """ Open 'in_file' for reading.

        Args:
            in_file: path of the digitStruct.mat file.
            in_start, in_end: when in_end > 0 only entries [in_start:in_end] are
                exposed; otherwise the whole collection is used.
        """
        self.in_file = h5py.File(in_file, 'r')
        digit_struct = self.in_file['digitStruct']
        self.names = digit_struct['name'][in_start:in_end] if in_end > 0 else digit_struct['name']
        self.bboxes = digit_struct['bbox'][in_start:in_end] if in_end > 0 else digit_struct['bbox']
        self.collectionSize = len(self.names)
        print("\n%s file contains %d entries" % (in_file, self.collectionSize))

    def close(self):
        """ Close the underlying file handle; the wrapper must not be used afterwards. """
        self.in_file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def getAllNumbersRestructured(self): 
        """ Method return a restructured version of the dataset (one object per digit in 'boxes').
            Images with more labels than remove_anomaly_samples allows are dropped first.
            Returns a list of dicts:
              'name' : filename of the image
              'boxes' : list of dicts - one per digit
                  'label' : 1 to 9 corresponding digits. 10 for digit '0'.
                  'left', 'top' : position of bounding box
                  'width', 'height' : dimension of bounding box
        """ 
        allImagesData = self.getAllNumbersStructure()
        if allImagesData:  # nothing to show (and [0] would raise) for an empty collection
            print("\nSample image object structure before transforming: ", allImagesData[0])
        allImagesData = remove_anomaly_samples(allImagesData)

        fields = ('height', 'label', 'left', 'top', 'width')
        result = []
        for imgData in allImagesData:
            # Transpose the per-field lists into one dict per digit.
            metadatas = [
                dict((field, imgData[field][i]) for field in fields)
                for i in range(len(imgData['height']))
            ]
            result.append({ 'boxes':metadatas, 'name':imgData["name"] })

        if result:
            print("\nSample image object structure after transforming: ", result[0])

        return result

    def bboxHelper(self, keys_):
        """ Method handles the difference when there is exactly one bbox or an array of bboxes due to multi digit image.

            With more than one entry, each entry is used as a key into the file and the
            scalar stored there is read; with exactly one entry the scalar is read directly.
            Returns a list with one value per digit.
        """
        # 'ds[()]' replaces the 'ds.value' property, which h5py 3.0 removed.
        # Read the dataset once instead of once per element.
        contents = keys_[()]
        if (len(keys_) > 1):
            val = [self.in_file[contents[j].item()][()][0][0] for j in range(len(keys_))]
        else:
            val = [contents[0][0]]
        return val

    def getNumberStructure(self,n):
        """ Method returns the bbox for all digits and name together for an image. """
        s = self.getBbox(n)
        s['name']=self.getName(n)
        return s

    def getAllNumbersStructure(self):
        """ Method returns an array containing position & label info about every image. """
        return [self.getNumberStructure(i) for i in range(self.collectionSize)]

    def getName(self, n):
        """ Method returns the filename of the image. chr function converts from ASCII to char. """
        # 'ds[()]' replaces the 'ds.value' property, which h5py 3.0 removed.
        return ''.join([chr(c[0]) for c in self.in_file[self.names[n][0]][()]])
    
    def getBbox(self, n):
        """ Method returns bboxes for all digits in an image as a dict. """
        bb = self.bboxes[n].item()
        group = self.in_file[bb]
        bbox = {}
        for field in ('height', 'left', 'top', 'width', 'label'):
            bbox[field] = self.bboxHelper(group[field])
        return bbox

    

In [5]:
# Parse the annotations of the 'train' split: build the path to its digitStruct.mat,
# wrap the file, and restructure the bounding boxes into one dict per image.
train_digitStruct = os.path.join(train_folder, 'digitStruct.mat')
dsf_obj = DigitStructsWrapper(train_digitStruct)
train_data = dsf_obj.getAllNumbersRestructured()


/home/ammu/Documents/SVHN/train/digitStruct.mat file contains 33402 entries

Sample image object structure before transforming:  {'name': '1.png', 'top': [77.0, 81.0], 'label': [1.0, 9.0], 'width': [81.0, 96.0], 'height': [219.0, 219.0], 'left': [246.0, 323.0]}

Dataset size is 33402 before removing images greater than class length 5

Anomaly at index 29929 detected with class size 6

Dataset resized to 33401 after removing images (greater than class length 5)

Sample image object structure after transforming:  {'boxes': [{'width': 81.0, 'top': 77.0, 'label': 1.0, 'left': 246.0, 'height': 219.0}, {'width': 96.0, 'top': 81.0, 'label': 9.0, 'left': 323.0, 'height': 219.0}], 'name': '1.png'}


In [6]:
# Parse the annotations of the 'test' split the same way as the training split.
# dsf_obj is rebound here, so from this cell on it refers to the test-split wrapper.
test_digitStruct = os.path.join(test_folder, 'digitStruct.mat')
dsf_obj = DigitStructsWrapper(test_digitStruct)
test_data = dsf_obj.getAllNumbersRestructured()


/home/ammu/Documents/SVHN/test/digitStruct.mat file contains 13068 entries

Sample image object structure before transforming:  {'name': '1.png', 'top': [7.0], 'label': [5.0], 'width': [19.0], 'height': [30.0], 'left': [43.0]}

Dataset size is 13068 before removing images greater than class length 5

Dataset resized to 13068 after removing images (greater than class length 5)

Sample image object structure after transforming:  {'boxes': [{'width': 19.0, 'top': 7.0, 'label': 5.0, 'left': 43.0, 'height': 30.0}], 'name': '1.png'}


In [7]:
# Parse the annotations of the 'extra' split the same way as the other splits.
# dsf_obj is rebound here, so from this cell on it refers to the extra-split wrapper.
extra_digitStruct = os.path.join(extra_folder, 'digitStruct.mat')
dsf_obj = DigitStructsWrapper(extra_digitStruct)
extra_data = dsf_obj.getAllNumbersRestructured()


/home/ammu/Documents/SVHN/extra/digitStruct.mat file contains 202353 entries

Sample image object structure before transforming:  {'name': '1.png', 'top': [70.0, 41.0, 23.0], 'label': [4.0, 7.0, 8.0], 'width': [38.0, 36.0, 47.0], 'height': [56.0, 56.0, 56.0], 'left': [24.0, 55.0, 79.0]}

Dataset size is 202353 before removing images greater than class length 5

Dataset resized to 202353 after removing images (greater than class length 5)

Sample image object structure after transforming:  {'boxes': [{'width': 38.0, 'top': 70.0, 'label': 4.0, 'left': 24.0, 'height': 56.0}, {'width': 36.0, 'top': 41.0, 'label': 7.0, 'left': 55.0, 'height': 56.0}, {'width': 47.0, 'top': 23.0, 'label': 8.0, 'left': 79.0, 'height': 56.0}], 'name': '1.png'}


In [8]:
from PIL import Image

def print_data_stats(data, folder):
    """ Gives basic stats about the image datasets.

    Prints the max, min and mean image width and height, followed by the indices
    (positions in 'data') of the images that attain the max and min values.

    Args:
        data: sequence of dicts, each with a 'name' entry holding an image filename.
        folder: directory that contains those image files.

    Returns:
        None; everything is printed.
    """
    if len(data) == 0:
        # np.amax / np.amin raise ValueError on an empty array, so report and stop.
        print("folder", folder, "has no images to compute stats for\n*********\n")
        return

    # Column 0 holds widths, column 1 heights; every row is filled in the loop below.
    data_imgSize = np.empty([len(data), 2])

    for i, entry in enumerate(data):
        filepath = os.path.join(folder, entry['name'])
        # 'Image' is PIL's Image module (imported just above this function in the file).
        # The 'with' block closes each file as soon as its size has been read.
        with Image.open(filepath) as img:
            data_imgSize[i, :] = img.size

    widths, heights = data_imgSize[:, 0], data_imgSize[:, 1]

    max_w, max_h = np.amax(widths), np.amax(heights)
    min_w, min_h = np.amin(widths), np.amin(heights)
    mean_w, mean_h = np.mean(widths), np.mean(heights)
    print("folder", folder, "has max width", max_w, "and max height", max_h)
    print("folder", folder, "has min width", min_w, "and min height", min_h)
    print("folder", folder, "has mean width", mean_w, "and mean height", mean_h, "\n")

    max_w_i, max_h_i = np.where(widths == max_w), np.where(heights == max_h)
    print("folder", folder, "has max width indicies at:", max_w_i)
    print("folder", folder, "has max height indicies at:", max_h_i, "\n")

    min_w_i, min_h_i = np.where(widths == min_w), np.where(heights == min_h)
    print("folder", folder, "has min width indicies at:", min_w_i)
    print("folder", folder, "has min height indicies at:", min_h_i, "\n*********\n")

In [9]:
# Print image-size statistics for each of the three splits. Every call opens each
# image of that split once to read its dimensions.
print_data_stats(train_data, train_folder)
print_data_stats(test_data, test_folder)
print_data_stats(extra_data, extra_folder)

folder /home/ammu/Documents/SVHN/train has max width 876.0 and max height 501.0
folder /home/ammu/Documents/SVHN/train has min width 25.0 and min height 12.0
folder /home/ammu/Documents/SVHN/train has mean width 128.286338732 and mean height 57.2139456902 

folder /home/ammu/Documents/SVHN/train has max width indicies at: (array([  410,  4163, 15855, 30483]),)
folder /home/ammu/Documents/SVHN/train has max height indicies at: (array([15855]),) 

folder /home/ammu/Documents/SVHN/train has min width indicies at: (array([9747]),)
folder /home/ammu/Documents/SVHN/train has min height indicies at: (array([ 1813,  2291,  4829,  5691,  9488,  9747,  9831, 10175, 10938,
       14902, 16284, 20314, 20775, 21544, 22330, 24015, 25438, 26047,
       26345, 27062, 27160, 27593, 27959, 29526, 29701, 30064, 30089,
       30462, 30947, 32339, 32351, 32539, 32567, 33141, 33180, 33202]),) 
*********

folder /home/ammu/Documents/SVHN/test has max width 1083.0 and max height 516.0
folder /home/ammu/Docume

In [10]:
max_size_img = 32 #every image in the dataset will be resized to 32x32

def prepare_images(cases, folder):
    """ Crop every image to the region enclosing all its digits, then resize, grayscale and normalise it.

    Args:
        cases: list of dicts with 'name' (image filename) and 'boxes' (list of dicts
            with 'label', 'top', 'left', 'height' and 'width'), one box per digit.
        folder: directory that contains the image files.

    Returns:
        images_ready: float32 array of shape (len(cases), max_size_img, max_size_img, 1).
        proper_numbers: int array of shape (len(cases), 6). Column 0 is the number of
            digits; the following columns hold the digits in order (label 10 is stored
            as 0) and unused slots keep the filler value 10.
        files: list of the image filenames, in the same order as the rows above.
    """
    print("Processing begins from %s folder" % folder)
    
    images_ready = np.ndarray([len(cases),max_size_img,max_size_img,1], dtype='float32')
    proper_numbers = np.ones([len(cases),6], dtype=int) * 10
    files = []

    for i in range(len(cases)):
        filename = cases[i]['name']
        filepath = os.path.join(folder, filename)
        boxes = cases[i]['boxes']
        number_length = len(boxes)
        files.append(filename)
        
        # at index 0 length of a label is stored. e.g: 5 -> 1; 234-> 3, 34567 -> 5 etc
        proper_numbers[i,0] = number_length
        
        for j in range(number_length):
            label = boxes[j]['label']
            # here we use j+1, since first entry is used by label length.
            # Replacing 10 with 0.. this is an important note!
            proper_numbers[i,j+1] = 0 if label == 10 else label

        top = np.array([box['top'] for box in boxes], dtype='float32')
        left = np.array([box['left'] for box in boxes], dtype='float32')
        height = np.array([box['height'] for box in boxes], dtype='float32')
        width = np.array([box['width'] for box in boxes], dtype='float32')
        
        # Extent of the region spanned by the digit boxes.
        minimum_image_top = np.amin(top)
        minimum_image_left = np.amin(left)
        height_of_image = np.amax(top) + height[np.argmax(top)] - minimum_image_top
        width_of_image = np.amax(left) + width[np.argmax(left)] - minimum_image_left

        # 'Image' is PIL's Image module. The 'with' block closes the file once the
        # resized copy has been produced.
        with Image.open(filepath) as image:
            # Grow the region by 10% on every side...
            image_left = np.floor(minimum_image_left - 0.1 * width_of_image)
            image_top = np.floor(minimum_image_top - 0.1 * height_of_image)
            image_right = np.amin([np.ceil(image_left + 1.2 * width_of_image), image.size[0]])
            image_bottom = np.amin([np.ceil(image_top + 1.2 * height_of_image), image.size[1]])
            # ...and keep the left/top edges inside the picture as well, the same way
            # right/bottom are limited to the image size; a negative crop origin
            # would pad the crop with black pixels.
            image_left = max(image_left, 0)
            image_top = max(image_top, 0)

            crop_box = (int(image_left), int(image_top), int(image_right), int(image_bottom))
            # convert('RGB') guarantees the three channels the grayscale weights below
            # expect. Image.LANCZOS is the filter formerly also named Image.ANTIALIAS,
            # an alias that Pillow 10 removed.
            patch = image.convert('RGB').crop(crop_box).resize((max_size_img, max_size_img), Image.LANCZOS) # Resize image to 32x32

        gray = np.dot(np.array(patch, dtype='float32'), [[0.2989],[0.5870],[0.1140]]) # Convert image to grayscale using a known technique

        # Per-image normalisation; a (near-)constant image keeps std = 1 to avoid dividing by ~0.
        mean = np.mean(gray, dtype='float32')
        std = np.std(gray, dtype='float32', ddof=1)
        if std < 0.0001: 
            std = 1.0
        images_ready[i,:,:] = (gray - mean) / std
        
    print("Processing from %s folder completed. Dataset is cropped, resized and grayscaled." % folder)
    
    return images_ready, proper_numbers, files

In [11]:
# Turn the parsed training annotations into image and label arrays. The third
# return value (the filename list) is not needed for this split and is discarded.
# NOTE(review): this rebinds train_data from the parsed annotation list to the image
# array, so re-running this cell without re-running the parsing cell would feed the
# array back into prepare_images — confirm whether a separate name is wanted.
train_data, train_labels, _ = prepare_images(train_data, train_folder)
print("train_data shape:", train_data.shape)
print("train_labels shape:", train_labels.shape)

Starting the processing of images from /home/ammu/Documents/SVHN/train folder for convnet...
Processing of images from /home/ammu/Documents/SVHN/train folder completed. Images have been cropped, resized and grayscaled.
train_data shape: (33401, 32, 32, 1)
train_labels shape: (33401, 6)


In [12]:
# Turn the parsed test annotations into image and label arrays; the filenames are
# kept for this split.
# NOTE(review): this rebinds test_data from the parsed annotation list to the image
# array, so re-running this cell without re-running the parsing cell would feed the
# array back into prepare_images — confirm whether a separate name is wanted.
test_data, test_labels, test_filenames = prepare_images(test_data, test_folder)
print("test_data shape:", test_data.shape)
print("test_labels shape:", test_labels.shape)

Starting the processing of images from /home/ammu/Documents/SVHN/test folder for convnet...
Processing of images from /home/ammu/Documents/SVHN/test folder completed. Images have been cropped, resized and grayscaled.
test_data shape: (13068, 32, 32, 1)
test_labels shape: (13068, 6)


In [13]:
# Turn the parsed 'extra' annotations into image and label arrays. The third
# return value (the filename list) is not needed for this split and is discarded.
# NOTE(review): this rebinds extra_data from the parsed annotation list to the image
# array, so re-running this cell without re-running the parsing cell would feed the
# array back into prepare_images — confirm whether a separate name is wanted.
extra_data, extra_labels, _ = prepare_images(extra_data, extra_folder)
print("extra_data shape:", extra_data.shape)
print("extra_labels shape:", extra_labels.shape)

Starting the processing of images from /home/ammu/Documents/SVHN/extra folder for convnet...
Processing of images from /home/ammu/Documents/SVHN/extra folder completed. Images have been cropped, resized and grayscaled.
extra_data shape: (202353, 32, 32, 1)
extra_labels shape: (202353, 6)


In [14]:
from sklearn.utils import shuffle

# Number of leading samples of the extra dataset that are moved into the training set.
n_extra_for_train = 50000
# Fixed seed so the shuffled order (and anything saved from it) is the same on every run.
shuffle_seed = 42

# Here more samples (50k) from extra dataset are added to training set.
train_data_temp = np.concatenate((train_data, extra_data[:n_extra_for_train, :, :, :]))
# Then remove those samples from extra. Slicing off the leading rows selects the same
# rows as np.delete(..., np.arange(n)) without building an intermediate copy; shuffle()
# below returns new arrays anyway.
extra_data_temp = extra_data[n_extra_for_train:]

train_labels_temp = np.concatenate((train_labels, extra_labels[:n_extra_for_train]))
extra_labels_temp = extra_labels[n_extra_for_train:]

# And then all data within each dataset is shuffled. Arrays passed to the same
# shuffle() call are permuted consistently, so images stay aligned with their labels.
train_data_temp, train_labels_temp = shuffle(train_data_temp, train_labels_temp, random_state=shuffle_seed)
extra_data_temp, extra_labels_temp = shuffle(extra_data_temp, extra_labels_temp, random_state=shuffle_seed)
test_data_temp, test_labels_temp, test_filenames_temp = shuffle(test_data, test_labels, test_filenames, random_state=shuffle_seed)

In [15]:
# Sanity check: within each split the image array and the label array should have
# the same first dimension (number of samples).
print("Train shapes:", train_data_temp.shape, train_labels_temp.shape)
print("Extra shapes:", extra_data_temp.shape, extra_labels_temp.shape)
print("Test shapes:", test_data_temp.shape, test_labels_temp.shape)

Train shapes: (83401, 32, 32, 1) (83401, 6)
Extra shapes: (152353, 32, 32, 1) (152353, 6)
Test shapes: (13068, 32, 32, 1) (13068, 6)


In [16]:
pickle_file = 'SVHN.pickle'

# Bundle all prepared arrays into one dict and write it to a single pickle file.
try:
    save = {
        'train_data': train_data_temp,
        'train_labels': train_labels_temp,
        'test_data': test_data_temp,
        'test_labels': test_labels_temp,
        'test_filenames': test_filenames_temp,
        'valid_data': extra_data_temp, # The rest of extra data will be used 
        'valid_labels': extra_labels_temp # for validation during model training
        }
    # The 'with' block closes the file even when pickle.dump raises, so a failed
    # write does not leave an open handle behind.
    with open(pickle_file, 'wb') as f:
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    # Report which file failed, then re-raise so the failure is not hidden.
    print('Unable to save data to', pickle_file, ':', e)
    raise

In [17]:
# Report the on-disk size of the pickle file. 1073741824 is 2**30, so the second
# figure is the size in GiB rounded to two decimals.
# NOTE(review): the message says "Compressed", but the file is written with a plain
# open() + pickle.dump in this notebook — confirm whether the wording is intended.
statinfo = os.stat(pickle_file)
print('Compressed pickle size is', statinfo.st_size, 'bytes or', round(float(statinfo.st_size)/1073741824,2), 'GBs')

Compressed pickle size is 1031316145 bytes or 0.96 GBs
