In [2]:
from datetime import datetime
import os
import numpy as np
import imageio
import scipy.io

In [17]:
# Class labels of the dataset; each one is also the name of an image sub-folder.
chars = list('ABCDEFGHIJ')

In [16]:
# Root folders of the two image sets; folder_to_mat reads one sub-folder per
# character (e.g. './notMNIST_small/A') beneath each of them.
small_folder = './notMNIST_small'
large_folder = './notMNIST_large'

In [134]:
# Output .mat files, one per processing stage:
#   *_orig  - pixels as read from the image folders
#   *_uniq  - after dropping duplicate rows within each character
#   *_clean - large set after removing rows that also occur in the small set
small_orig_mat = './notMNIST_small_orig.mat'
large_orig_mat = './notMNIST_large_orig.mat'
small_uniq_mat = './notMNIST_small_uniq.mat'
large_uniq_mat = './notMNIST_large_uniq.mat'
large_clean_mat = './notMNIST_large_clean.mat'

In [28]:
def folder_to_mat(folder, output_file, labels=None):
    """Read every PNG under ``folder/<label>`` and save the pixels to a .mat file.

    For each label the sub-folder of the same name is walked recursively,
    every readable ``*.png`` image is flattened to a 1-D vector, and the
    vectors are stacked into one array per label. The resulting
    ``{label: array}`` mapping is written with ``scipy.io.savemat``.

    Parameters
    ----------
    folder : str
        Root directory containing one sub-folder per label.
    output_file : str
        Path of the .mat file to write.
    labels : iterable of str, optional
        Sub-folder names to read. Defaults to the notebook-level ``chars``
        list, which keeps existing two-argument calls working unchanged.

    Notes
    -----
    Images that raise ``OSError`` or ``ValueError`` while being read are
    reported on stdout and skipped instead of aborting the conversion.
    """
    if labels is None:
        labels = chars

    data = {}
    start = datetime.now()
    for ch in labels:
        char_images = []
        char_img_path = f'{folder}/{ch}'

        print(f'Reading "{ch}"')

        for root, _dirs, files in os.walk(char_img_path):
            for file in files:
                # Match on the extension only: a substring test would also
                # pick up names such as "x.png.bak" or "notes.png.txt".
                if not file.endswith('.png'):
                    continue
                file_path = os.path.join(root, file)
                try:
                    img = imageio.imread(file_path)
                    char_images.append(img.reshape(-1))
                except (OSError, ValueError):
                    print('Bad file:', file_path)
        data[ch] = np.array(char_images)
    end = datetime.now()
    time_diff = (end - start).total_seconds()
    print(f'{end.strftime("%H:%M:%S")} it took: {time_diff}\'s')
    scipy.io.savemat(output_file, data)

In [29]:
# Convert the small image folder into a single .mat file.
folder_to_mat(small_folder, small_orig_mat)

Reading "A"
Bad file: ./notMNIST_small/A/RGVtb2NyYXRpY2FCb2xkT2xkc3R5bGUgQm9sZC50dGY=.png
Reading "B"
Reading "C"
Reading "D"
Reading "E"
Reading "F"
Bad file: ./notMNIST_small/F/Q3Jvc3NvdmVyIEJvbGRPYmxpcXVlLnR0Zg==.png
Reading "G"
Reading "H"
Reading "I"
Reading "J"
21:15:42 it took: 7.583626's


In [30]:
# Convert the large image folder into a single .mat file.
folder_to_mat(large_folder, large_orig_mat)

Reading "A"
Bad file: ./notMNIST_large/A/RnJlaWdodERpc3BCb29rSXRhbGljLnR0Zg==.png
Bad file: ./notMNIST_large/A/Um9tYW5hIEJvbGQucGZi.png
Bad file: ./notMNIST_large/A/SG90IE11c3RhcmQgQlROIFBvc3Rlci50dGY=.png
Reading "B"
Bad file: ./notMNIST_large/B/TmlraXNFRi1TZW1pQm9sZEl0YWxpYy5vdGY=.png
Reading "C"
Reading "D"
Bad file: ./notMNIST_large/D/VHJhbnNpdCBCb2xkLnR0Zg==.png
Reading "E"
Reading "F"
Reading "G"
Reading "H"
Reading "I"
Reading "J"
21:22:58 it took: 395.27648's


In [51]:
def remove_duplicates(data, labels=None):
    """Drop duplicate rows from every per-label array in ``data``.

    Parameters
    ----------
    data : mapping of str -> np.ndarray
        One array per label; rows are compared with ``np.unique(axis=0)``.
    labels : iterable of str, optional
        Keys of ``data`` to process. Defaults to the notebook-level ``chars``
        list, which keeps existing one-argument calls working unchanged.

    Returns
    -------
    dict
        New ``{label: array}`` mapping holding only the distinct rows of each
        input array. ``data`` itself is not modified.

    Progress and timing for each label are printed to stdout.
    """
    if labels is None:
        labels = chars

    uniq_data = {}
    total_start = datetime.now()
    for ch in labels:
        start = datetime.now()
        print('------')
        print(f'{start.strftime("%H:%M:%S")}: Uniq: "{ch}"')
        rows = data[ch]
        uniq_data[ch] = np.unique(rows, axis=0)
        end = datetime.now()
        time_diff = (end - start).total_seconds()
        # "diff" is the number of rows that were dropped as duplicates.
        print(f'{end.strftime("%H:%M:%S")}: "{ch}" - diff: {len(rows) - len(uniq_data[ch])}\t took: {time_diff}')

    total_end = datetime.now()
    total_time_diff = (total_end - total_start).total_seconds()
    print(f'\n{total_end.strftime("%H:%M:%S")} it took: {total_time_diff}\'s')
    return uniq_data

In [52]:
# Load the small set back from the .mat file written by folder_to_mat.
small_orig_data = scipy.io.loadmat(small_orig_mat)

In [53]:
# Drop duplicate rows within each character of the small set.
small_uniq_data = remove_duplicates(small_orig_data)

------
21:35:22: Uniq: "A"
21:35:22: "A" - diff: 24	 took: 0.015762
------
21:35:22: Uniq: "B"
21:35:22: "B" - diff: 20	 took: 0.012835
------
21:35:22: Uniq: "C"
21:35:22: "C" - diff: 24	 took: 0.01398
------
21:35:22: Uniq: "D"
21:35:22: "D" - diff: 26	 took: 0.01098
------
21:35:22: Uniq: "E"
21:35:22: "E" - diff: 26	 took: 0.014779
------
21:35:22: Uniq: "F"
21:35:22: "F" - diff: 22	 took: 0.013029
------
21:35:22: Uniq: "G"
21:35:22: "G" - diff: 21	 took: 0.011523
------
21:35:22: Uniq: "H"
21:35:22: "H" - diff: 26	 took: 0.010218
------
21:35:22: Uniq: "I"
21:35:22: "I" - diff: 275	 took: 0.043348
------
21:35:22: Uniq: "J"
21:35:22: "J" - diff: 22	 took: 0.015949

21:35:22 it took: 0.163384's


In [54]:
# Persist the de-duplicated small set.
scipy.io.savemat(small_uniq_mat, small_uniq_data)

In [55]:
# Load the large set back from the .mat file written by folder_to_mat.
large_orig_data = scipy.io.loadmat(large_orig_mat)

In [56]:
# Drop duplicate rows within each character of the large set.
large_uniq_data = remove_duplicates(large_orig_data)

------
21:39:22: Uniq: "A"
21:39:22: "A" - diff: 5807	 took: 0.795365
------
21:39:22: Uniq: "B"
21:39:23: "B" - diff: 5630	 took: 0.558981
------
21:39:23: Uniq: "C"
21:39:24: "C" - diff: 6258	 took: 0.618774
------
21:39:24: Uniq: "D"
21:39:24: "D" - diff: 6180	 took: 0.569035
------
21:39:24: Uniq: "E"
21:39:25: "E" - diff: 5958	 took: 0.577547
------
21:39:25: Uniq: "F"
21:39:25: "F" - diff: 6068	 took: 0.579709
------
21:39:25: Uniq: "G"
21:39:26: "G" - diff: 5822	 took: 0.604438
------
21:39:26: Uniq: "H"
21:39:27: "H" - diff: 6730	 took: 0.641705
------
21:39:27: Uniq: "I"
21:39:28: "I" - diff: 11739	 took: 1.915712
------
21:39:28: Uniq: "J"
21:39:30: "J" - diff: 6253	 took: 1.193399

21:39:30 it took: 8.056471's


In [57]:
# Persist the de-duplicated large set.
scipy.io.savemat(large_uniq_mat, large_uniq_data)

In [123]:
def setdiff2d(a_arr, b_arr):
    """Return the rows of ``a_arr`` that do not occur in ``b_arr``.

    Every row is reinterpreted as one structured scalar (one field per
    column) so that ``np.setdiff1d`` can compare whole rows at once.

    Parameters
    ----------
    a_arr, b_arr : np.ndarray
        2-D arrays. They are expected to share dtype and column count, and
        the rows of each are assumed to be unique already, because the
        difference is computed with ``assume_unique=True``.

    Returns
    -------
    np.ndarray
        2-D array with ``a_arr.shape[1]`` columns and the dtype of ``a_arr``,
        containing the rows of ``a_arr`` that are absent from ``b_arr``.
    """
    # Viewing rows as a wider structured dtype needs a contiguous last axis;
    # column-sliced or transposed inputs would otherwise raise ValueError.
    # Arrays that are already C-contiguous pass through without a copy.
    a_arr = np.ascontiguousarray(a_arr)
    b_arr = np.ascontiguousarray(b_arr)

    n_cols = a_arr.shape[1]
    a_rows = a_arr.view([('', a_arr.dtype)] * n_cols)
    b_rows = b_arr.view([('', b_arr.dtype)] * b_arr.shape[1])
    kept = np.setdiff1d(a_rows, b_rows, assume_unique=True)
    # Undo the structured view: back to plain scalars, one row per record.
    return kept.view(a_arr.dtype).reshape(-1, n_cols)

In [131]:
def remove_intersections(remove_from, source, labels=None):
    """Remove from ``remove_from`` every row that also occurs in ``source``.

    The comparison is done independently for each label via ``setdiff2d``.

    Parameters
    ----------
    remove_from : mapping of str -> np.ndarray
        Per-label arrays to be filtered.
    source : mapping of str -> np.ndarray
        Per-label arrays whose rows must not remain in the result.
    labels : iterable of str, optional
        Keys to process. Defaults to the notebook-level ``chars`` list,
        which keeps existing two-argument calls working unchanged.

    Returns
    -------
    dict
        New ``{label: array}`` mapping with the overlapping rows removed.
        Neither input mapping is modified.

    Progress and timing for each label are printed to stdout.
    """
    if labels is None:
        labels = chars

    clean_data = {}
    total_start = datetime.now()
    for ch in labels:
        start = datetime.now()
        print('------')
        # NOTE: the "Uniq" label is kept as-is so re-runs match the recorded
        # output; here it marks the start of the per-label set difference.
        print(f'{start.strftime("%H:%M:%S")}: Uniq: "{ch}"')
        candidates = remove_from[ch]
        clean_data[ch] = setdiff2d(candidates, source[ch])
        end = datetime.now()
        time_diff = (end - start).total_seconds()
        # "diff" is the number of rows removed because they occur in source.
        print(f'{end.strftime("%H:%M:%S")}: "{ch}" - diff: {len(candidates) - len(clean_data[ch])}\t took: {time_diff}')

    total_end = datetime.now()
    total_time_diff = (total_end - total_start).total_seconds()
    print(f'\n{total_end.strftime("%H:%M:%S")} it took: {total_time_diff}\'s')
    return clean_data

In [132]:
# Remove from the large set every row that also occurs in the small set (per character).
large_clean_data = remove_intersections(large_uniq_data, small_uniq_data)

------
23:01:41: Uniq: "A"
23:01:43: "A" - diff: 450	 took: 1.252596
------
23:01:43: Uniq: "B"
23:01:44: "B" - diff: 429	 took: 1.290194
------
23:01:44: Uniq: "C"
23:01:45: "C" - diff: 469	 took: 1.186779
------
23:01:45: Uniq: "D"
23:01:47: "D" - diff: 467	 took: 1.322485
------
23:01:47: Uniq: "E"
23:01:48: "E" - diff: 457	 took: 1.415042
------
23:01:48: Uniq: "F"
23:01:49: "F" - diff: 461	 took: 1.234455
------
23:01:49: Uniq: "G"
23:01:50: "G" - diff: 444	 took: 1.269038
------
23:01:50: Uniq: "H"
23:01:51: "H" - diff: 505	 took: 1.023126
------
23:01:51: Uniq: "I"
23:01:52: "I" - diff: 441	 took: 0.945542
------
23:01:52: Uniq: "J"
23:01:54: "J" - diff: 465	 took: 1.23689

23:01:54 it took: 12.17776's


In [135]:
# Persist the large set with the small-set overlaps removed.
scipy.io.savemat(large_clean_mat, large_clean_data)