This notebook is to pre process the datasets we have. all data is in the form of images(.tiff, .jpeg, .png, .pdf and more). 
This is not part of our service or end result and after we have clean data we will not need it anymore because of that its a jupyter notebook for simplicity.

First we import all necessary modules:

In [25]:
import os
import shutil
import cv2 as cv
import numpy as np
import tqdm
from typing import List
from PIL import Image, UnidentifiedImageError

Next we will set global paths to each folder of data we need to pre process:

In [26]:
preProcessFolder = os.getcwd()

arabHistorical = preProcessFolder + '/raw-data/arab'
arabModern = preProcessFolder + '/raw-data/modernArab'
englishModern = preProcessFolder + '/raw-data/modernEnglish'
hebrewCairoGeniza = preProcessFolder + '/raw-data/hebrewCairoGeniza'
hebrewMiddleAges = preProcessFolder + '/raw-data/hebrewMiddleAges'
hebrewModern = preProcessFolder + '/raw-data/modernHebrew'
latinHistorical = preProcessFolder + '/raw-data/latin'

Next we declare few helper functions:

In [27]:
def cropImage(img:Image.Image, start_x:int, start_y:int, width:int, height:int) -> Image.Image:
    """
    Method to crop image and save cropped image

    Args:
        img (Image.Image): the image
        start_x (int): where to start crop by x axis - must be positive
        start_y (int): where to start crop by y axis - must be positive
        width (int): width of crop from start_x - must be positive
        height (int): height of crop from start_y - must be positive

    Raises:
        ValueError: if one of the parameters are negative or image crop is not in image range
    """
    
    # test image crop is in image range
    imgWidth, imgHeight = img.size

    if start_x < 0 or start_y < 0 or width < 0  or height < 0:
        raise ValueError("start_x, start_y, width, height must be positive")
    if start_x + width > imgWidth:
        raise ValueError(f"{start_x + width} is bigger than image width")
    if start_y + height > imgHeight:
        raise ValueError(f"{start_y + height} is bigger than image height")

    # crop image
    return img.crop((start_x, start_y, start_x + width, start_y + height))

In [28]:
def rotateImage(img:Image.Image, angle:int) -> Image.Image:
    """
    Method to rotate image and save rotateed image

    Args:
        img (Image.Image): the image
        angle (int): the angle to rotate - must be positive

    Raises:
        ValueError: if angle is negative
    """        
    # test angel is positive
    if angle < 0:
        raise ValueError("angel must be positive")
        
    # rotate
    return img.rotate(angle)

In [29]:
def saveImageAsType(imgPath:str, img:Image.Image, newType:str) -> None:
    """
    Method to save image with newType

    Args:
        imgPath (str): the path to save image
        img (Image.Image): the image
        newType (str): the new type - should be without '.' for example JPEG

    Raises:
        OSError: If the file could not be written. The file may have been created, and may contain partial data.
        ValueError: If the output format could not be determined from the file name. Use the format option to solve this.
        FileNotFoundError: [description]
    """

    # convert image to new type
    imgName:str = ''
    imgType:str = ''
    imgName, imgType = os.path.splitext(imgPath)
    img.thumbnail(img.size)
    img.save(imgName + '.' + newType, newType, quality=100)

In [30]:
def replaceColorRangeInImage(imgPath:str, lowerLimit:List[int], upperLimit:List[int], newColor:List[int]) -> Image.Image:
    """
    Method to swap HSV color range in image to new HSV color works on same image

    Args:
        imgPath (str): the path to image
        lowerLimit (List[int]): an HSV list to represent for lower limit of color to swap for example [21, 39, 64]
        upperLimit (List[int]): an HSV list to represent for upper limit of color to swap for example [40, 255, 255]
        newColor (List[int]): an HSV list to represent the new color to swap to for example [255, 255, 255]
    """

    # transform BGR image to HSV
    image = cv.imread(imgPath) 
    hsv=cv.cvtColor(image, cv.COLOR_BGR2HSV)
        
    # find witch pixels to replace in image
    mask = cv.inRange(hsv, np.array(lowerLimit), np.array(upperLimit))
    image[mask>0] = newColor

    # replace wanted color range to new color
    cv.imwrite(imgPath, image)

Next we will create a folder tree for our cleaned data:

In [31]:
# empty old clean data
if os.path.exists("clean-data"):
    shutil.rmtree("clean-data")

# create clean data folders
os.mkdir(preProcessFolder + "/clean-data")
os.mkdir(preProcessFolder + "/clean-data/arab")
os.mkdir(preProcessFolder + "/clean-data/modern-arab")
os.mkdir(preProcessFolder + "/clean-data/modern-english")
os.mkdir(preProcessFolder + "/clean-data/hebrew-cairo-geniza")
os.mkdir(preProcessFolder + "/clean-data/hebrew-middle-ages")
os.mkdir(preProcessFolder + "/clean-data/modern-hebrew")
os.mkdir(preProcessFolder + "/clean-data/latin")

Lets start to preprocess with modern english data. Images need to be croped:

In [32]:
# clean each image in modern english folder
for imgFile in tqdm(os.listdir(englishModern)):
    imgName:str
    imgType:str
    imgName, imgType = os.path.splitext(imgFile)
                
    # full path of image
    imgFullPath:str = englishModern + "/" + imgFile
                
    # path to save finished processed image
    imgNewFullPath:str = preProcessFolder + "/clean-data/modern-english/" + imgName + '.png'

    # open image
    img: Image.Image = Image.open(imgFullPath)

    # crop image
    img = cropImage(img, 250, 700, 2220, 2100)

    # save image as png
    saveImageAsType(imgNewFullPath, img, 'png')

NameError: name 'tqdm' is not defined

Next we will pre process modern hebrew data.
Images need to be rotated, croped, remove from yellow lines and change type to png:


In [33]:
# clean each image in modern hebrew folder
for imgFile in tqdm(os.listdir(hebrewModern)):
    imgName:str
    imgType:str
    imgName, imgType = os.path.splitext(imgFile)
                
    # full path of image
    imgFullPath:str = hebrewModern + "/" + imgFile
                
    # path to save finished processed image
    imgNewFullPath:str = preProcessFolder + "/clean-data/modern-hebrew/" + imgName + '.png'

    # open image
    img: Image.Image = Image.open(imgFullPath)
        
    # rotate image
    img = rotateImage(img, 180)

    # crop image
    img = cropImage(img, 360, 1990, 4000, 2800)

    # save image as png
    saveImageAsType(imgNewFullPath, img, 'png')

    # remove yellow pixels save in pre processed folder
    replaceColorRangeInImage(imgNewFullPath, [21, 39, 64], [40, 255, 255], [255,255,255])


NameError: name 'tqdm' is not defined

Next we will pre process modern arac data.
Images need a change of type to png:

In [34]:
# clean each image in modern arab folder
for imgFile in tqdm(os.listdir(arabModern)):
    imgName:str
    imgType:str
    imgName, imgType = os.path.splitext(imgFile)
                
    # full path of image
    imgFullPath:str = arabModern + "/" + imgFile
                
    # path to save finished processed image
    imgNewFullPath:str = preProcessFolder + "/clean-data/modern-arab/" + imgName + '.png'

    # open image
    img: Image.Image = Image.open(imgFullPath)
        
    # save image as png
    saveImageAsType(imgNewFullPath, img, 'png')

    # remove yellow pixels save in pre processed folder
    replaceColorRangeInImage(imgNewFullPath, [21, 39, 64], [40, 255, 255], [255,255,255])

NameError: name 'tqdm' is not defined