# Historical style generator
##### Students – Leor Ariel Rose, Yahav Bar David
##### Academic advisor - Dr. Irina Rabaev

This notebook contains our preprocessing pipeline for the modern Hebrew data.

First we will import all necessary libraries



In [None]:
import os
import shutil
import cv2 as cv
import numpy as np
import PIL
from tqdm import tqdm
from typing import List
from google.colab import drive
from google.colab.patches import cv2_imshow
from IPython.display import clear_output

Next, let's mount our Google Drive, which holds our data and the folders we save results to.

In [None]:
# Mount Google Drive at /content/drive
drive.mount('/content/drive')

Mounted at /content/drive


Next we will define our paths.

In [None]:
# Every folder used by this notebook lives under one project directory on the mounted drive.
_final_project_dir = "/content/drive/My Drive/final_project"

# raw input pages
hebrew_modern_data = _final_project_dir + "/raw_data/modern_hebrew"

# preprocessed output: whole documents and per-document sentence crops
hebrew_clean_modern_data_documents, hebrew_clean_modern_data_sentences = (
    _final_project_dir + "/clean_data/modern_hebrew/" + leaf
    for leaf in ("documents", "sentences")
)

Now we will delete the documents produced by previous preprocessing runs.

In [None]:
# Empty old clean data: start from an empty folder so that results of a
# previous run never mix with the results of this one.
if os.path.exists(hebrew_clean_modern_data_documents):
    shutil.rmtree(hebrew_clean_modern_data_documents)

# makedirs instead of mkdir: mkdir raises FileNotFoundError when the parent
# folders do not exist yet (e.g. the first run on a fresh drive).
os.makedirs(hebrew_clean_modern_data_documents, exist_ok=True)

Next we will preprocess the modern Hebrew data to create document images.
For documents, each image needs to be rotated, cropped, cleaned of the yellow lines and converted to PNG.

In [None]:
# Files that could not be read, cropped or written; they are listed after the loop.
problamatic_files: List[str] = []

# Crop window in pixels, applied after the rotation. It is the same for every
# image, so it is defined once instead of on every iteration.
start_x: int = 360
start_y: int = 1990
width: int = 4000
height: int = 2800

# range of yellow color in HSV
lower_yellow = np.array([20, 10, 10])
upper_yellow = np.array([40, 255, 255])

# List the raw folder once; sorting makes the processing order the same on
# every run, and the same list is reused for the summary below.
raw_files: List[str] = sorted(os.listdir(hebrew_modern_data))

# clean each image in modern hebrew folder
for img_file in tqdm(raw_files, position=0, leave=True):
  img_name, img_type = os.path.splitext(img_file)

  # full path of image
  img_full_path: str = os.path.join(hebrew_modern_data, img_file)

  # path to save finished processed image
  img_new_full_path: str = os.path.join(hebrew_clean_modern_data_documents, img_name + '.png')

  # step 1 - open image (cv.imread gives None when the file cannot be read)
  img = cv.imread(img_full_path)
  if img is None:
    problamatic_files.append(img_full_path)
    continue

  # step 2 - rotate image
  img = cv.rotate(img, rotateCode=cv.ROTATE_180)

  # step 3 - crop image
  img = img[start_y:start_y + height, start_x:start_x + width]
  if img.size == 0:
    # The image is smaller than the crop offset, so nothing is left to process.
    # Report it instead of letting the color conversion below fail mid-run.
    problamatic_files.append(img_full_path)
    continue

  # step 4 - remove yellow pixels
  # Convert BGR image to HSV and threshold it to get only yellow colors
  hsv = cv.cvtColor(img, cv.COLOR_BGR2HSV)
  mask = cv.inRange(hsv, lower_yellow, upper_yellow)

  # Paint every masked (yellow) pixel white on all channels. Same result as
  # OR-ing with a masked white background, without two full-size temporaries.
  img[mask > 0] = 255

  # step 5 - save image as grayscale png; imwrite reports failure by returning False
  if not cv.imwrite(img_new_full_path, cv.cvtColor(img, cv.COLOR_BGR2GRAY)):
    problamatic_files.append(img_full_path)


print(f"{len(problamatic_files)} out of {len(raw_files)} file couldnt be preprocessed. files are:")
for file_name in problamatic_files:
  print(file_name)

100%|██████████| 466/466 [18:48<00:00,  2.42s/it]

71 out of 466 file couldnt be preprocessed. files are:
/content/drive/My Drive/final_project/raw_data/modern_hebrew/BRN3C2AF4AEB56C_0000000016.tif
/content/drive/My Drive/final_project/raw_data/modern_hebrew/BRN3C2AF4AEB56C_0000000017.tif
/content/drive/My Drive/final_project/raw_data/modern_hebrew/BRN3C2AF4AEB56C_0000000018.tif
/content/drive/My Drive/final_project/raw_data/modern_hebrew/BRN3C2AF4AEB56C_0000000019.tif
/content/drive/My Drive/final_project/raw_data/modern_hebrew/BRN3C2AF4AEB56C_0000000021.tif
/content/drive/My Drive/final_project/raw_data/modern_hebrew/BRN3C2AF4AEB56C_0000000020.tif
/content/drive/My Drive/final_project/raw_data/modern_hebrew/BRN3C2AF4AEB56C_0000000022.tif
/content/drive/My Drive/final_project/raw_data/modern_hebrew/BRN3C2AF4AEB56C_0000000023.tif
/content/drive/My Drive/final_project/raw_data/modern_hebrew/BRN3C2AF4AEB56C_0000000085.tif
/content/drive/My Drive/final_project/raw_data/modern_hebrew/BRN3C2AF4AEB56C_0000000080.tif
/content/drive/My Drive/f




Now we will delete the sentences produced by previous preprocessing runs.

In [None]:
# Start from an empty sentences folder so that crops of a previous run never
# mix with the crops of this one.
if os.path.exists(hebrew_clean_modern_data_sentences):
    shutil.rmtree(hebrew_clean_modern_data_sentences)

# makedirs instead of mkdir: mkdir raises FileNotFoundError when the parent
# folders do not exist yet (e.g. the first run on a fresh drive).
os.makedirs(hebrew_clean_modern_data_sentences, exist_ok=True)

Next we will preprocess the modern Hebrew data to create sentence images.
For sentences, each image needs to be rotated, cropped into sentences along the yellow lines, cleaned of the yellow lines and converted to PNG. Because the crop start position differs between images, it must be entered manually for each image (466 images in total):

In [None]:
def _ask_int(prompt: str) -> int:
  """Ask the user for a whole number, asking again until the input parses.

  A typo is reported and re-prompted instead of raising ValueError, so one
  bad keystroke does not abort the interactive loop over all images.
  """
  while True:
    try:
      return int(input(prompt))
    except ValueError:
      print("invalid number, please try again")


def _cut_sentence_strips(page: np.ndarray, first_row: int, strip_height: int) -> List[np.ndarray]:
  """Cut `page` into consecutive full-height horizontal strips that contain text.

  Strips are `strip_height` rows tall and start at `first_row`. A strip whose
  mean pixel value is above 253 is treated as blank and left out.
  """
  strips: List[np.ndarray] = []
  row = first_row
  # `<=` so that a last strip ending exactly on the bottom row is still kept
  while row + strip_height <= page.shape[0]:
    strip = page[row:row + strip_height, :]
    # test if image contains text
    if np.mean(strip) <= 253:
      strips.append(strip)
    row += strip_height
  return strips


# Files that could not be read, cropped or written; they are listed after the loop.
problamatic_files: List[str] = []

# Crop window in pixels, applied after the rotation (same for every image).
start_x: int = 360
start_y: int = 1990
width: int = 4000
height: int = 2800

# height in pixels of one sentence strip
gap: int = 230

# range of yellow color in HSV
lower_yellow = np.array([20, 10, 10])
upper_yellow = np.array([40, 255, 255])

# List the raw folder once; sorting makes the order the user is asked in the
# same on every run, and the same list is reused for the summary below.
raw_files: List[str] = sorted(os.listdir(hebrew_modern_data))

# create sentences from each image in modern hebrew folder
for img_file in tqdm(raw_files, position=0, leave=True):
  img_name, img_type = os.path.splitext(img_file)

  # full path of image
  img_full_path: str = os.path.join(hebrew_modern_data, img_file)

  # step 1 - open image (cv.imread gives None when the file cannot be read)
  img = cv.imread(img_full_path)
  if img is None:
    problamatic_files.append(img_full_path)
    continue

  # step 2 - rotate image
  img = cv.rotate(img, rotateCode=cv.ROTATE_180)

  # step 3 - crop image
  img = img[start_y:start_y + height, start_x:start_x + width]
  if img.size == 0:
    # The image is smaller than the crop offset, so nothing is left to process.
    problamatic_files.append(img_full_path)
    continue

  # get cropped image dimensions
  img_height, img_width, _ = img.shape

  # step 4 - remove yellow pixels
  # Convert BGR image to HSV and threshold it to get only yellow colors
  hsv = cv.cvtColor(img, cv.COLOR_BGR2HSV)
  mask = cv.inRange(hsv, lower_yellow, upper_yellow)

  # Paint every masked (yellow) pixel white on all channels.
  img[mask > 0] = 255

  # step 5 - crop lines
  # show image to user to help guess the start_y position
  cv2_imshow(img)

  crop_list: List[np.ndarray] = []
  satisfied: bool = False
  # repeat until the user accepts the crops of the current guess
  while not satisfied:
    # guess start_y
    y_start = _ask_int("stary_y: ")
    if y_start < 0:
      print("start_y must not be negative")
      continue

    # Build the list from scratch for every guess, so strips of a rejected
    # guess are never saved together with the accepted ones.
    crop_list = _cut_sentence_strips(img, y_start, gap)

    # show croped images for user to evaluate, separated by "next"
    for temp_img in crop_list:
      cv2_imshow(cv.cvtColor(temp_img, cv.COLOR_BGR2GRAY))
      print("next")

    # ask user if he is satisfied or to try again (0 = try again, anything else = save)
    satisfied = _ask_int("save (1=yes, 0=try again): ") != 0

  # create folder for sentences (exist_ok: do not fail if it is already there)
  sentences_dir: str = os.path.join(hebrew_clean_modern_data_sentences, img_name)
  os.makedirs(sentences_dir, exist_ok=True)

  # save each crop as grayscale png in the image's folder
  all_saved: bool = True
  for index, temp_img in enumerate(crop_list):
    crop_path = os.path.join(sentences_dir, f"{index}.png")
    # imwrite reports failure by returning False
    if not cv.imwrite(crop_path, cv.cvtColor(temp_img, cv.COLOR_BGR2GRAY)):
      all_saved = False
  if not all_saved:
    problamatic_files.append(img_full_path)

  # clear output for next image
  clear_output(wait=True)


print(f"{len(problamatic_files)} out of {len(raw_files)} file couldnt be preprocessed. files are:")
for file_name in problamatic_files:
  print(file_name)

100%|██████████| 130/130 [1:04:57<00:00, 29.98s/it]

1 out of 466 file couldnt be preprocessed. files are:
/content/drive/My Drive/final_project/raw_data/modern_hebrew/BRN3C2AF4AEB56C_0000000015.tif



