# Training/validation dataset preprocessing

For training the CNN model, three datasets were combined:

1. MNIST dataset (28x28 images of the digits 0 to 9, with each digit scaled to fit a 20x20 box in the centre)
2. Kaggle Handwritten math symbol and digit dataset: used for all the symbols; its digits were also merged with the MNIST digits (https://www.kaggle.com/clarencezhao/handwritten-math-symbol-dataset)
3. HASYv2 dataset: because the second dataset contains only a small number of forward slashes (division symbols), the "backslashes" from HASYv2 were taken, mirrored and processed to act as forward slashes - but even so there are only ~430 forward slashes in the entire dataset! (https://www.kaggle.com/guru001/hasyv2)

The code below was used for processing and merging the input datasets into one. The result follows the MNIST format: a white character on a black background, scaled to fit a 20x20 box and padded to 28x28.

In [2]:
import cv2
import os
import numpy as np
# Presumably the folder holding the extracted Kaggle symbol images (inferred
# from the path). NOTE: this name is reassigned to working_dir2 before the
# conversion loop further down uses it.
working_dir = '/home/cec/Downloads/handwritten_math_symbols_kaggle/extracted_images'
# Project folder: the cells below read the 'backslash' images from here and
# write the converted images into '<name>_converted' sub-folders of it.
working_dir2 = '/home/cec/Documents/photomath'

In [23]:
# Mirror every backslash image so it can be used as a forward slash
# (division symbol).
# NOTE: the files are overwritten in place, so running this cell a second
# time flips them back into backslashes.
backslash_dir = os.path.join(working_dir2, 'backslash')
for image_path in os.listdir(backslash_dir):
    # Match on the actual extension; a substring test would also accept
    # names such as 'x.png.bak'.
    if not image_path.endswith('.png'):
        continue
    full_path = os.path.join(backslash_dir, image_path)
    # Flag 0: read the image in grayscale
    img = cv2.imread(full_path, 0)
    if img is None:
        # cv2.imread returns None instead of raising when a file cannot be
        # read or decoded; skip it rather than crash inside cv2.flip.
        print('Skipping unreadable image: ' + full_path)
        continue
    # Flip backslash horizontally so it becomes a forward slash
    img = cv2.flip(img, 1)
    cv2.imwrite(full_path, img)
    #cv2.imshow('img', img)
    #cv2.waitKey(0)
    #cv2.destroyAllWindows()

In [24]:
# Image preprocessing

def resizeAndPad(img, size, padColor=0):
    """Scale *img* to fit inside *size* and pad it to exactly that size.

    The aspect ratio of the image is preserved: the image is scaled until
    its limiting side fills the target, then a constant-colour border is
    added on the other axis so that the result is exactly ``size``. When the
    leftover padding is odd, the extra pixel goes to the bottom/right.

    Args:
        img: Image array whose first two dimensions are (height, width).
        size: Target ``(height, width)`` in pixels.
        padColor: Border colour. A single value is repeated three times
            when *img* has a third (channel) dimension.

    Returns:
        The scaled and padded image.

    Raises:
        ValueError: If *img* has a zero-sized side or *size* is not positive.
    """
    h, w = img.shape[:2]
    sh, sw = size

    # Fail early with a clear message instead of a ZeroDivisionError below
    # or an opaque OpenCV assertion.
    if h <= 0 or w <= 0:
        raise ValueError('cannot resize an empty image (shape %r)' % (img.shape,))
    if sh <= 0 or sw <= 0:
        raise ValueError('target size must be positive, got %r' % (size,))

    # interpolation method
    if h > sh or w > sw:  # shrinking image
        interp = cv2.INTER_AREA
    else:  # stretching image
        interp = cv2.INTER_CUBIC

    # aspect ratio of image
    aspect = float(w) / h

    # Decide which side limits the scale by comparing the image aspect with
    # the TARGET aspect (w/h >= sw/sh, cross-multiplied to stay in integers).
    # Looking at the image aspect alone yields negative padding for
    # non-square targets and stretches square images.
    if w * sh >= h * sw:  # width is the limiting side
        new_w = sw
        # Clamp to at least 1 px so very elongated images do not round to 0
        new_h = min(sh, max(1, int(np.round(new_w / aspect))))
    else:  # height is the limiting side
        new_h = sh
        new_w = min(sw, max(1, int(np.round(new_h * aspect))))

    # Split the leftover space; the odd pixel goes to the bottom/right
    pad_top = (sh - new_h) // 2
    pad_bot = (sh - new_h) - pad_top
    pad_left = (sw - new_w) // 2
    pad_right = (sw - new_w) - pad_left

    # set pad color: color image but only one color provided
    if len(img.shape) == 3 and not isinstance(padColor, (list, tuple, np.ndarray)):
        padColor = [padColor] * 3

    # scale and pad
    scaled_img = cv2.resize(img, (new_w, new_h), interpolation=interp)
    scaled_img = cv2.copyMakeBorder(scaled_img, pad_top, pad_bot, pad_left, pad_right, borderType=cv2.BORDER_CONSTANT, value=padColor)

    return scaled_img

# Symbols from Kaggle Handwritten Mathematical Symbols Dataset (45x45) need to
# match MNIST dataset digits (28x28)
working_dir = working_dir2

# Kernel for dilation. It never changes, so it is built once instead of
# once per image.
kernel = np.ones((5, 5), np.uint8)

for filename in os.listdir(working_dir):
    # Only the listed class folders are converted
    #if filename in ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']:
    if filename not in ['backslash']:
        continue

    source_dir = os.path.join(working_dir, filename)
    # Converted images go to a sibling '<class>_converted' folder, derived
    # from working_dir rather than from a second hard-coded copy of the path
    converted_dir = os.path.join(working_dir, filename + '_converted')
    os.makedirs(converted_dir, exist_ok=True)

    for image_path in os.listdir(source_dir):
        # Match on the actual extension, not on a substring of the name
        if not image_path.endswith(('.jpg', '.png')):
            continue

        # Read image in grayscale
        source_path = os.path.join(source_dir, image_path)
        image = cv2.imread(source_path, 0)
        if image is None:
            # cv2.imread returns None instead of raising when a file cannot
            # be read or decoded
            print('Skipping unreadable image: ' + source_path)
            continue

        # Invert black and white colors
        image = cv2.bitwise_not(image)

        # Make image binary
        image = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY)[1]

        # Dilating images for symbols to be thicker
        # because it seems that MNIST dataset has thicker
        # numbers than are symbols in the Kaggle dataset
        # So we are going to try to make these symbols
        # be thicker to match the MNIST dataset "thickness"
        image = cv2.dilate(image, kernel, iterations=1)

        # Resize to 20x20 preserving aspect ratio
        image = resizeAndPad(image, (20, 20))

        # Pad the image so it ends up being 28x28
        # (just like MNIST dataset images are); the border is black,
        # stated explicitly instead of relying on the default value
        image = cv2.copyMakeBorder(image, 4, 4, 4, 4, borderType=cv2.BORDER_CONSTANT, value=0)

        #cv2.imshow('img', image)
        #cv2.waitKey(0)
        #cv2.destroyAllWindows()

        # Save the modified image as '<name>_mnist<ext>'. splitext keeps
        # names that contain extra dots intact, unlike split('.')
        stem, extension = os.path.splitext(image_path)
        cv2.imwrite(os.path.join(converted_dir, stem + '_mnist' + extension), image)


  if len(img.shape) is 3 and not isinstance(padColor, (list, tuple, np.ndarray)): # color image but only one color provided
