In [2]:
from datetime import datetime
import os
import numpy as np
import imageio
import scipy.io

In [17]:
# Class labels handled by this notebook: the ten letters A through J.
chars = list('ABCDEFGHIJ')

In [16]:
# Root directories of the two image sets; each is read as <root>/<letter>/...
small_folder, large_folder = './notMNIST_small', './notMNIST_large'

In [38]:
# .mat files holding the flattened images exactly as read from disk.
small_orig_mat, large_orig_mat = './notMNIST_small_orig.mat', './notMNIST_large_orig.mat'
# Presumably the destinations for the de-duplicated data; verify, since the
# visible cells never reference these two names.
small_uniq_mat, large_uniq_mat = './notMNIST_small_uniq.mat', './notMNIST_large_uniq.mat'

In [28]:
def folder_to_mat(folder, output_file, classes=None):
    """Flatten every PNG found under ``folder/<label>`` and save them to a .mat file.

    For each class label the directory ``folder/<label>`` is walked
    recursively. Every file whose name ends in ``.png`` is decoded with
    ``imageio.imread`` and flattened to a 1-D vector; the vectors of one label
    are stacked into a single array (one row per image) and stored in the
    output file under that label.

    Parameters
    ----------
    folder : str
        Root directory containing one sub-directory per class label.
    output_file : str
        Path of the .mat file written with ``scipy.io.savemat``.
    classes : iterable of str, optional
        Labels (sub-directory names) to read. When omitted, the notebook-level
        ``chars`` list is used.

    Returns
    -------
    None
        The arrays are only written to ``output_file``. Progress, unreadable
        files and the elapsed reading time are printed.
    """
    if classes is None:
        classes = chars

    data = {}
    start = datetime.now()
    for ch in classes:
        char_images = []
        char_img_path = f'{folder}/{ch}'

        print(f'Reading "{ch}"')

        for root, dirs, files in os.walk(char_img_path):
            # Sorting the directory list in place steers os.walk, and sorting
            # the file names fixes the row order of the saved array, so the
            # result no longer depends on file-system enumeration order.
            dirs.sort()
            for file in sorted(files):
                # Check the extension only: a substring test would also try to
                # decode names such as "glyph.png.bak".
                if not file.endswith('.png'):
                    continue
                file_path = os.path.join(root, file)
                try:
                    img = imageio.imread(file_path)
                    char_images.append(img.reshape(-1))
                except (OSError, ValueError):
                    # Undecodable image: report it and carry on with the rest.
                    print('Bad file:', file_path)
        # NOTE(review): assumes every image of a label has the same number of
        # pixels so the rows stack into a regular 2-D array; confirm for new data.
        data[ch] = np.array(char_images)
    end = datetime.now()
    # The reported duration covers reading only; saving happens afterwards.
    time_diff = (end - start).total_seconds()
    print(f'{end.strftime("%H:%M:%S")} it took: {time_diff}\'s')
    scipy.io.savemat(output_file, data)

In [29]:
# Convert the small image set into its raw .mat dump.
folder_to_mat(folder=small_folder, output_file=small_orig_mat)

Reading "A"
Bad file: ./notMNIST_small/A/RGVtb2NyYXRpY2FCb2xkT2xkc3R5bGUgQm9sZC50dGY=.png
Reading "B"
Reading "C"
Reading "D"
Reading "E"
Reading "F"
Bad file: ./notMNIST_small/F/Q3Jvc3NvdmVyIEJvbGRPYmxpcXVlLnR0Zg==.png
Reading "G"
Reading "H"
Reading "I"
Reading "J"
21:15:42 it took: 7.583626's


In [30]:
# Convert the large image set into its raw .mat dump.
folder_to_mat(folder=large_folder, output_file=large_orig_mat)

Reading "A"
Bad file: ./notMNIST_large/A/RnJlaWdodERpc3BCb29rSXRhbGljLnR0Zg==.png
Bad file: ./notMNIST_large/A/Um9tYW5hIEJvbGQucGZi.png
Bad file: ./notMNIST_large/A/SG90IE11c3RhcmQgQlROIFBvc3Rlci50dGY=.png
Reading "B"
Bad file: ./notMNIST_large/B/TmlraXNFRi1TZW1pQm9sZEl0YWxpYy5vdGY=.png
Reading "C"
Reading "D"
Bad file: ./notMNIST_large/D/VHJhbnNpdCBCb2xkLnR0Zg==.png
Reading "E"
Reading "F"
Reading "G"
Reading "H"
Reading "I"
Reading "J"
21:22:58 it took: 395.27648's


In [48]:
def remove_duplicates(data, classes=None):
    """Return a copy of ``data`` in which each class array has no repeated rows.

    Parameters
    ----------
    data : mapping of str to array
        Per-label arrays with one sample per row. Keys that are not listed in
        ``classes`` are ignored and do not appear in the result.
    classes : iterable of str, optional
        Labels to process. When omitted, the notebook-level ``chars`` list is
        used.

    Returns
    -------
    dict
        Maps each processed label to ``np.unique(data[label], axis=0)``.
        ``np.unique`` returns its result sorted, so the original row order is
        not kept. ``data`` itself is left unmodified.

    Raises
    ------
    KeyError
        If a requested label is missing from ``data``.
    """
    if classes is None:
        classes = chars

    uniq_data = {}
    total_start = datetime.now()
    for ch in classes:
        start = datetime.now()
        print('------')
        print(f'{start.strftime("%H:%M:%S")}: Uniq: "{ch}"')
        original_rows = data[ch]
        # axis=0 makes np.unique compare whole rows instead of single values.
        unique_rows = np.unique(original_rows, axis=0)
        uniq_data[ch] = unique_rows
        end = datetime.now()
        time_diff = (end - start).total_seconds()
        removed = len(original_rows) - len(unique_rows)
        print(f'{end.strftime("%H:%M:%S")}: "{ch}" - diff: {removed}\t took: {time_diff}')

    total_end = datetime.now()
    total_time_diff = (total_end - total_start).total_seconds()
    print(f'\n{total_end.strftime("%H:%M:%S")} it took: {total_time_diff}\'s')
    return uniq_data

In [49]:
# Load the raw small set back from the .mat file written earlier.
small_orig_data = scipy.io.loadmat(file_name=small_orig_mat)

In [50]:
# De-duplicate every letter of the small set (logs per-letter counts and timings).
small_uniq_data = remove_duplicates(data=small_orig_data)

------
21:33:17: Uniq: "A"
21:33:17: "A" - diff: 24	 took: 0.015218
------
21:33:17: Uniq: "B"
21:33:17: "B" - diff: 20	 took: 0.013867
------
21:33:17: Uniq: "C"
21:33:17: "C" - diff: 24	 took: 0.013742
------
21:33:17: Uniq: "D"
21:33:17: "D" - diff: 26	 took: 0.012222
------
21:33:17: Uniq: "E"
21:33:17: "E" - diff: 26	 took: 0.013193
------
21:33:17: Uniq: "F"
21:33:17: "F" - diff: 22	 took: 0.011418
------
21:33:17: Uniq: "G"
21:33:17: "G" - diff: 21	 took: 0.012018
------
21:33:17: Uniq: "H"
21:33:17: "H" - diff: 26	 took: 0.009388
------
21:33:17: Uniq: "I"
21:33:17: "I" - diff: 275	 took: 0.03847
------
21:33:17: Uniq: "J"
21:33:17: "J" - diff: 22	 took: 0.014507


In [121]:
# Build the de-duplicated small set here so the dict saved by the following
# cell is populated; an empty dict would produce an empty .mat file.
uniq_data = remove_duplicates(small_orig_data)


------
19:05:02: Uniq: "A"
19:05:02: "A" - diff: 24	 took: 0.014771
------
19:05:02: Uniq: "B"
19:05:02: "B" - diff: 20	 took: 0.017863
------
19:05:02: Uniq: "C"
19:05:02: "C" - diff: 24	 took: 0.016728
------
19:05:02: Uniq: "D"
19:05:02: "D" - diff: 26	 took: 0.01262
------
19:05:02: Uniq: "E"
19:05:02: "E" - diff: 26	 took: 0.013535
------
19:05:02: Uniq: "F"
19:05:02: "F" - diff: 22	 took: 0.023571
------
19:05:02: Uniq: "G"
19:05:02: "G" - diff: 21	 took: 0.025934
------
19:05:02: Uniq: "H"
19:05:02: "H" - diff: 26	 took: 0.020415
------
19:05:02: Uniq: "I"
19:05:02: "I" - diff: 275	 took: 0.042453
------
19:05:02: Uniq: "J"
19:05:02: "J" - diff: 22	 took: 0.017596


In [122]:
# Persist the de-duplicated arrays held in `uniq_data`.
# NOTE(review): `small_uniq_mat` is declared in an earlier cell but this cell
# writes to a literal path; confirm which destination is intended.
scipy.io.savemat(file_name='./uniq_images.mat', mdict=uniq_data)