In [1]:
import os
import numpy as np
import cv2
import time
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Number of image slots to preallocate. The loading loop writes one row per
# ".png" file found under input_dir, so this must be at least that many.
image_count = 814255

# Root folder that is walked for ".png" files; the folder names under it
# are mapped to character labels.
input_dir = "./sd19_merge"
# Folder for the generated .npy files.
output_dir = "./data/"

In [3]:
# Maps a class folder name to the character it holds. Folder names are
# "i_" followed by the character's hexadecimal ASCII code; a name with two
# codes (e.g. "i_4a_6a") covers the upper- and lower-case forms of a letter
# and is labelled with the upper-case one.
class_dict = {
    # Merged-case and single-case folders listed first
    "i_4a_6a": "J", "i_4b_6b": "K", "i_4c_6c": "L", "i_4d_6d": "M",
    "i_4e": "N", "i_4f_6f": "O", "i_5a_7a": "Z", "i_6e": "n",
    # Digits
    "i_30": "0", "i_31": "1", "i_32": "2", "i_33": "3", "i_34": "4",
    "i_35": "5", "i_36": "6", "i_37": "7", "i_38": "8", "i_39": "9",
    # Upper-case letters A-I
    "i_41": "A", "i_42": "B", "i_43_63": "C", "i_44": "D", "i_45": "E",
    "i_46": "F", "i_47": "G", "i_48": "H", "i_49_69": "I",
    # Upper-case letters P-Y
    "i_50_70": "P", "i_51": "Q", "i_52": "R", "i_53_73": "S", "i_54": "T",
    "i_55_75": "U", "i_56_76": "V", "i_57_77": "W", "i_58_78": "X",
    "i_59_79": "Y",
    # Lower-case letters that have their own folder
    "i_61": "a", "i_62": "b", "i_64": "d", "i_65": "e", "i_66": "f",
    "i_67": "g", "i_68": "h", "i_71": "q", "i_72": "r", "i_74": "t",
}

def char_class(str, mapping=None):
    """Return the character label for a directory path inside the dataset.

    The path is split on both "\\" and "/" so the lookup works regardless of
    which separator the operating system produced, and the first path
    component that is a known class folder decides the label. This avoids
    depending on the class folder sitting at a fixed position in the path.

    Args:
        str: Directory path such as the ``root`` value yielded by os.walk.
            (The name shadows the builtin; it is kept so existing callers
            are unaffected.)
        mapping: Optional folder-name -> label dict. Defaults to the
            module-level ``class_dict``.

    Returns:
        The label stored in the mapping for the matching folder name.

    Raises:
        KeyError: If no component of the path is a key of the mapping.
    """
    if mapping is None:
        mapping = class_dict
    # Normalise Windows separators so one split handles both conventions.
    for part in str.replace("\\", "/").split("/"):
        if part in mapping:
            return mapping[part]
    raise KeyError("No known class folder in path: " + str)

In [4]:
# One (32, 32, 1) slot per expected image. float64 is numpy's default for
# np.zeros, so this buffer takes image_count * 32 * 32 * 8 bytes.
images = np.zeros((image_count, 32, 32, 1), dtype=np.float64)
# One label per image. dtype=str on an empty array yields a 1-character
# string dtype, which is enough for the single-character class labels.
im_class = np.empty((image_count, 1), dtype=str)

def get_array(im_path):
    """Load one image file and convert it into a (32, 32, 1) float array.

    The image is read as grayscale, cropped to rows/columns 32..95,
    resized to 32x32, inverted and divided by 255.

    Args:
        im_path: Path of the image file to read.

    Returns:
        Array of shape (32, 32, 1) holding the scaled, inverted pixels.

    Raises:
        OSError: If OpenCV cannot read or decode the file.
    """
    img = cv2.imread(im_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with a clear message rather than on the slice below.
        raise OSError("Could not read image file " + im_path)
    # Cut out empty space
    # NOTE(review): assumes the glyph lies in the central 64x64 window,
    # i.e. source images of about 128x128 pixels - confirm against the data.
    img = img[32:64 + 32, 32:64 + 32]
    # Downscale to 32
    img = cv2.resize(img, (32,32))
    # Invert and normalise
    img = cv2.bitwise_not(img)
    img = img/255
    # Keras requires 4D
    img = np.expand_dims(img, axis=2)
    return img

In [5]:
# Fill `images` / `im_class` with every ".png" file found under input_dir.
# index_count is the next free row and, after the loop, the number of rows
# that were successfully filled.
index_count = 0
ts = time.time()

for root, dirs, files in os.walk(input_dir):
    # os.walk yields entries in arbitrary filesystem order; sorting in place
    # makes the traversal, and so the row order of the arrays, reproducible.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith(".png"):
            continue
        im_path = os.path.join(root, file)
        try:
            images[index_count] = get_array(im_path)
            im_class[index_count] = char_class(root)
        except Exception as exc:
            # Best effort: report the file and the reason, then carry on.
            # Catching Exception (not a bare except) lets an interrupt still
            # stop this long-running loop.
            print("Error occured when processing file " + im_path + ": " + repr(exc))
            continue
        # Advance only after both image and label were stored, so a failed
        # file leaves its row to be overwritten by the next one.
        index_count += 1
        if index_count % 5000 == 0:
            # Reported time covers the last 5000 images only; the timer is
            # reset after every progress line.
            print("Completed " + str(index_count) + " of " + str(image_count) + ", time taken: " + str(time.time() - ts) + " seconds")
            ts = time.time()

print("Completed " + str(index_count) + " of " + str(image_count) + ", time taken: " + str(time.time() - ts) + " seconds")

if index_count < image_count:
    # Rows from index_count onwards were never written and hold no real data.
    print("Warning: only " + str(index_count) + " of " + str(image_count) + " rows were filled; the remaining rows contain no image data")

Completed 5000 of 814255, time taken: 5.937984943389893 seconds
Completed 10000 of 814255, time taken: 6.007254600524902 seconds
Completed 15000 of 814255, time taken: 5.9209418296813965 seconds
Completed 20000 of 814255, time taken: 6.346008777618408 seconds
Completed 25000 of 814255, time taken: 6.583653211593628 seconds
Completed 30000 of 814255, time taken: 5.7525551319122314 seconds
Completed 35000 of 814255, time taken: 5.2165398597717285 seconds
Completed 40000 of 814255, time taken: 5.7761452198028564 seconds
Completed 45000 of 814255, time taken: 5.166958808898926 seconds
Completed 50000 of 814255, time taken: 5.44326376914978 seconds
Completed 55000 of 814255, time taken: 5.843563795089722 seconds
Completed 60000 of 814255, time taken: 5.516719102859497 seconds
Completed 65000 of 814255, time taken: 5.036326169967651 seconds
Completed 70000 of 814255, time taken: 5.579354286193848 seconds
Completed 75000 of 814255, time taken: 5.587031841278076 seconds
Completed 80000 of 8142

In [6]:
# Write the label and image arrays to output_dir as .npy files.
ts = time.time()

# np.save does not create missing folders, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

# Save only the rows the loading loop actually filled; anything beyond
# index_count is unused preallocated space, not data.
np.save(os.path.join(output_dir, "img_class"), im_class[:index_count])
np.save(os.path.join(output_dir, "images"), images[:index_count])

print("Completed, time taken: " + str(time.time() - ts) + " seconds")

Completed, time taken: 19.05471634864807 seconds
