In [None]:
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range
import pickle
import scipy.io
import idx2numpy
import random
from matplotlib import pyplot as plt

In [None]:
pickle_file = 'notMNIST.pickle'

# NOTE(review): pickle.load can run arbitrary code from the file it reads;
# only point this at a pickle you produced or otherwise trust.
with open(pickle_file, 'rb') as f:
    save = pickle.load(f)

# Pull each (dataset, labels) pair out of the loaded dict.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']

del save  # hint to help gc free up memory

print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

In [None]:
def plot_img(image, title):
    """Show ``image`` with ``plt.imshow`` under the heading ``title``.

    Draws on pyplot's current figure and then calls ``plt.show()``.
    Returns nothing.
    """
    plt.imshow(image)
    plt.title(title)
    plt.show()

In [None]:
def conv_to_greyscale(image):
    """Collapse an image to one channel by averaging each pixel's values.

    Parameters
    ----------
    image : array-like
        At least two-dimensional. The first two axes are rows and columns;
        everything past them is treated as the pixel's channel values.
        A 2-D input comes back unchanged apart from being a float copy.

    Returns
    -------
    numpy.ndarray
        ``float64`` array of shape ``(rows, cols)`` holding the unweighted
        mean of every pixel.

    Raises
    ------
    IndexError
        If ``image`` has fewer than two dimensions.
    """
    image = np.asarray(image)
    rows, cols = image.shape[0], image.shape[1]

    # reshape(..., -1) cannot infer the last axis when there are no pixels,
    # so an image without rows or columns is answered directly.
    if rows == 0 or cols == 0:
        return np.zeros((rows, cols))

    # Flatten all trailing axes into one "channel" axis and average it in a
    # single vectorised call instead of visiting each pixel from Python.
    # This is a plain mean; luminance weights (0.299, 0.587, 0.114) are not
    # applied.
    return image.reshape(rows, cols, -1).mean(axis=-1, dtype=np.float64)

In [None]:
# 32x32 three-channel image with every value set to 1.0 (float).
white_image_old = np.ones((32, 32, 3))

In [None]:
# 28x28 single-channel image filled with integer zeros.
white_image = np.zeros((28, 28), dtype=int)

In [None]:
# The name `white` is not defined anywhere in this notebook, so this cell
# raised NameError on a fresh kernel.
# NOTE(review): white_image_old (the three-channel image) is assumed to be the
# intended input, since it is the one a greyscale conversion changes - confirm.
plot_img(white_image_old, 'white')
plot_img(conv_to_greyscale(white_image_old), 'white')

In [None]:
# Read the IDX image/label files into numpy arrays: training pair first,
# then the test ("t10k") pair.
train_images, train_label = (
    idx2numpy.convert_from_file(path)
    for path in ('train-images-idx3-ubyte', 'train-labels-idx1-ubyte')
)

test_images, test_label = (
    idx2numpy.convert_from_file(path)
    for path in ('t10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte')
)

In [None]:
# Report the shape of each loaded array, one per line.
print(
    train_images.shape,
    train_label.shape,
    test_images.shape,
    test_label.shape,
    sep='\n'
)

In [None]:
# Bundle images under 'X' and labels under 'y' for each split.
train = {'X': train_images, 'y': train_label}
test = {'X': test_images, 'y': test_label}

In [None]:
# Write each split's dict to its own MATLAB .mat file.
scipy.io.savemat(file_name='mnist_digits_train_28x28.mat', mdict=train)
scipy.io.savemat(file_name='mnist_digits_test_28x28.mat', mdict=test)

In [None]:
def random_insert_seq(lst, seq):
    """Insert the items of ``seq`` into ``lst`` at random positions, in place.

    The existing items of ``lst`` keep their relative order. Each item of
    ``seq`` is given a distinct random slot in the combined list, so the items
    of ``seq`` may end up in any order relative to one another.

    Parameters
    ----------
    lst : list
        List to modify; its contents are replaced via slice assignment.
    seq : sequence
        Items to insert; must support ``len``.

    Returns
    -------
    None
    """
    total = len(lst) + len(seq)
    # `range` rather than `xrange`: `xrange` does not exist on Python 3.
    insert_locations = random.sample(range(total), len(seq))
    inserts = dict(zip(insert_locations, seq))
    # Named `remaining` so the builtin `input` is not shadowed.
    remaining = iter(lst)
    lst[:] = [inserts[pos] if pos in inserts else next(remaining)
              for pos in range(total)]

In [None]:
# Blank images are created with the same per-image shape as the input images.
def randomize_inputs(X_training, Y_labelling, no_of_white_images):
    """Mix blank (all-zero) images into a labelled image set and shuffle it.

    Parameters
    ----------
    X_training : numpy.ndarray
        Images stacked along axis 0, shape ``(n, ...)`` with at least two
        dimensions.
    Y_labelling : sequence
        One label per image; the first ``n`` entries are converted with
        ``int``. Labels are expected to be digits 0-9 (checked below).
    no_of_white_images : int
        Number ``k`` of blank images to add. Blank images get ``None`` as
        their label.

    Returns
    -------
    X_out : numpy.ndarray
        Shape ``(n + k, ...)`` with the dtype of ``X_training``, shuffled
        along axis 0.
    Y_out : numpy.ndarray
        Object array of length ``n + k`` aligned with ``X_out``: Python ints
        for real images, ``None`` for blank ones.

    Raises
    ------
    AssertionError
        If the number of entries whose label is not a digit 0-9 differs
        from ``k``.

    Notes
    -----
    Draws from NumPy's global random state (``np.random``).
    """
    w = X_training.shape[1]
    n = X_training.shape[0]
    k = no_of_white_images
    print ('w : ', w, ' n : ', n, ' k : ', k)

    # Keep images numeric: blank images are zeros of the same shape and dtype,
    # appended after the real ones.
    blank_images = np.zeros((k,) + X_training.shape[1:], dtype=X_training.dtype)
    X_all = np.concatenate((X_training, blank_images))

    # Labels live in their own object array so that None can mark a blank
    # image without forcing the image data into an object array as well.
    Y_all = np.full(n + k, None, dtype=object)
    Y_all[:n] = [int(Y_labelling[i]) for i in range(n)]

    # One permutation applied to both arrays keeps images and labels paired.
    order = np.random.permutation(n + k)
    X_out = X_all[order]
    Y_out = Y_all[order]

    assert (X_out.shape[0] == n + k and Y_out.shape[0] == n + k)

    # Sanity check: exactly the k blank images may lack a 0-9 label.
    valid_digits = set(range(10))
    count = sum(1 for label in Y_out if label not in valid_digits)
    assert (count == k)

    return X_out, Y_out
    

In [None]:
def stack_matrices_horizontally(list_of_matrices):
    """Join a sequence of arrays side by side using ``numpy.hstack``."""
    combined = np.hstack(list_of_matrices)
    return combined

In [None]:
def stack_labels_horizontally(list_of_outputs):
    """Concatenate the digit labels in ``list_of_outputs`` into one string.

    Entries that are not equal to one of the digits 0-9 (for example ``None``)
    are skipped. Leading zeros are kept because the result is a string.

    Returns
    -------
    str or int
        The digits joined in order, e.g. ``'005'``; the integer ``-1`` when
        no entry was a valid digit.
    """
    valid_digits = tuple(range(10))

    joined = ''.join(
        str(int(label)) for label in list_of_outputs if label in valid_digits
    )

    # An empty string means nothing qualified; signal that with -1.
    return joined if joined else -1

In [None]:
def random_list_length_generator(start_pt, end_pt, part_length, list_length) :
    """Return ``list_length`` lists of ``part_length`` distinct random integers.

    Each inner list is an independent ``random.sample`` from
    ``range(start_pt, end_pt)``, so values are unique within a list but may
    repeat across lists.

    Raises
    ------
    ValueError
        From ``random.sample`` when ``part_length`` exceeds the number of
        integers in the range.
    """
    population = range(start_pt, end_pt)
    return [random.sample(population, part_length) for _ in range(list_length)]

In [None]:
def multiple_img_dataset_generator(X_dataset, Y_dataset, no_of_iamges_to_combine, length_of_new_dataset) :
    """Build a dataset of wide images by placing random images side by side.

    For each new sample, ``no_of_iamges_to_combine`` distinct indices are drawn
    at random, the matching images are stacked horizontally and their labels
    are concatenated into one string.

    Parameters
    ----------
    X_dataset : numpy.ndarray
        Source images, shape ``(n, height, width)``.
    Y_dataset : numpy.ndarray
        Source labels, length ``n``. Entries that are not digits 0-9 are left
        out of the combined label by ``stack_labels_horizontally``.
    no_of_iamges_to_combine : int
        Number of images joined into each new sample.
    length_of_new_dataset : int
        Number of samples to generate.

    Returns
    -------
    X_new : numpy.ndarray
        Float array of shape ``(length_of_new_dataset, height, c * width)``.
    Y_new : numpy.ndarray
        Fixed-width byte-string labels, one per sample. A label is shorter
        than ``c`` when some chosen images had no digit label, and ``-1`` is
        stored when none of them had one.

    Raises
    ------
    AssertionError
        If images and labels differ in length.
    ValueError
        From ``random.sample`` when more images are requested per sample than
        the dataset contains.
    """
    assert (X_dataset.shape[0] == Y_dataset.shape[0])

    n = X_dataset.shape[0]
    height = X_dataset.shape[1]
    width = X_dataset.shape[2]    # taken separately so images need not be square
    c = no_of_iamges_to_combine
    k = length_of_new_dataset

    rand_list = random_list_length_generator(0, n, c, k)

    # Labels are stored as strings so leading zeros (005, 0670) survive.
    # The width grows with c so labels longer than six digits are not
    # truncated; it never drops below the previous fixed width of six.
    X_new = np.zeros((k, height, c * width))
    Y_new = np.zeros((k), dtype='S%d' % max(6, c))

    for i, rand_index in enumerate(rand_list):
        chosen_images = [X_dataset[r] for r in rand_index]
        chosen_labels = [Y_dataset[r] for r in rand_index]

        stacked_images = stack_matrices_horizontally(chosen_images)
        stacked_labels = stack_labels_horizontally(chosen_labels)

        # Occasional progress sample.
        if (i % 5000 == 0) :
            print (stacked_labels)

        X_new[i] = stacked_images
        Y_new[i] = stacked_labels

    print('Matrix Conversion Successful')

    return X_new, Y_new
    

In [None]:
# Add 20000 blank images to the training digits and shuffle the combined set.
X_train_new, Y_train_new = randomize_inputs(
    X_training=train_images,
    Y_labelling=train_label,
    no_of_white_images=20000
)

In [None]:
# Generate 50000 training samples, each made of 5 images placed side by side.
X_train_multi, Y_train_multi = multiple_img_dataset_generator(
    X_train_new, Y_train_new, 5, length_of_new_dataset=50000
)

In [None]:
# Add 2500 blank images to the test digits and shuffle the combined set.
X_test_new, Y_test_new = randomize_inputs(
    X_training=test_images,
    Y_labelling=test_label,
    no_of_white_images=2500
)

In [None]:
# Generate 8000 test samples, each made of 5 images placed side by side.
X_test_multi, Y_test_multi = multiple_img_dataset_generator(
    X_test_new, Y_test_new, 5, length_of_new_dataset=8000
)

In [None]:
# Show the (images, labels) shapes of every generated dataset, one per line.
print('\n'.join(
    '{} {}'.format(images.shape, labels.shape)
    for images, labels in (
        (X_train_new, Y_train_new),
        (X_train_multi, Y_train_multi),
        (X_test_new, Y_test_new),
        (X_test_multi, Y_test_multi),
    )
))

In [None]:
# Bundle the multi-image samples under 'X' and their labels under 'y'.
train_multi = {'X': X_train_multi, 'y': Y_train_multi}
test_multi = {'X': X_test_multi, 'y': Y_test_multi}

In [None]:
# Save the multi-image dicts. Previously `train` / `test` (the single-image
# data) were written under these file names, so the generated multi-image
# datasets were never stored.
scipy.io.savemat('mnist_multi_digits_train_28x140.mat', train_multi)
scipy.io.savemat('mnist_multi_digits_test_28x140.mat', test_multi)