# Import libraries

In [4]:
import tesserocr
import img2pdf
import cv2
import os
import numpy as np
from PIL import Image
from pdf2image import convert_from_path
from tesserocr import PyTessBaseAPI, RIL, PSM, OEM # to call different API functions of tesseract

In [5]:
# Import Pre-processing functions

import sys
sys.path.insert(0, './../Pre-Processing')

from pdf_parser_engine import get_text
from whitespace_margin_remover import get_roi
from skew_correction_engine import rotate_image
from adaptive_binarisation_engine import get_binarised_image
from box_detection_engine import get_boxes

# Load the File

In [3]:
# INPUT SELECTION: parent directory + file name of the document to process

file_parent_dir = "./../../res/data/AICST-dataset/MF/test_files/"

# Exactly one `file_name` assignment should be active at a time; the
# commented-out lines are alternative inputs that can be swapped in.
file_name = "117438-4312_327220171005155635-Page(1).pdf"
#file_name = "117540-232-67215820_160620171011155112-Page(1).pdf"
#file_name = "117568-MAWB014-45783043_333320171013030906-Page(1).pdf"
#file_name = "117586-[Untitled]_303520171013155622-Page(2).pdf"
#file_name = "118354-JFK4825_303820171123032706-Page(2).pdf"

# Absolute path of the selected document (resolved against the current
# working directory, because `file_parent_dir` is relative).
file_path = os.path.abspath(os.path.join(file_parent_dir, file_name))


# Check File Type - PDF or any other type

In [4]:
# SPLIT THE FILE NAME ONCE --> (name without extension, extension incl. the dot)

file_name_witout_ext, file_ext = os.path.splitext(file_name)

# Check PDF Type - Searchable (OCR not required) OR Non-Searchable (OCR required)

In [5]:
# DETECT WHETHER THE PDF ALREADY CARRIES A TEXT LAYER
#
# get_text() output equal to the literal string "b''" is treated as
# "no embedded text". Any other output is printed as the extracted text.
# For now the OCR engine runs on all PDFs irrespective of whether they are
# searchable or not, which is why `do_ocr` ends up True on every path.

do_ocr = True
if file_ext == '.pdf':
    pdfOutput = get_text(file_path)
    if pdfOutput == "b''":
        print("Non Searchable PDF!")
    else:
        print(pdfOutput)
        do_ocr = True

Non Searchable PDF!


# Pre-Processing

In [6]:
# Render every page of the document to a JPEG and hand each page to the box
# detector.
#
# Output layout below <file_parent_dir>/<file name without extension>/ :
#   page<N>/page<N>.jpg   - page N rendered by pdf2image at 500 DPI
#   page<N>_blobs/        - output directory passed to get_boxes() for page N
#
# Names left behind for the following cells:
#   dataset_file_name - root output directory for this document
#   num_of_pages      - number of rendered pages + 1 (it starts at 1)
#   num_of_blobs      - value returned by get_boxes() for the LAST page only
if do_ocr:

    num_of_pages = 1
    num_of_blobs = 1

    # os.path.join keeps this correct whether or not file_parent_dir ends
    # with a path separator.
    dataset_file_name = os.path.join(file_parent_dir, file_name_witout_ext)

    # exist_ok=True replaces the racy "check, then create" pattern.
    try:
        os.makedirs(dataset_file_name, exist_ok=True)
    except OSError:
        print("Can not create directory in dataset directory!")

    # Anything that is not a PDF is first wrapped into a single PDF next to
    # the output directory so that the same rendering path can be used.
    if file_ext != '.pdf':
        with open(dataset_file_name + '.pdf', 'wb') as f:
            print(file_path)
            f.write(img2pdf.convert(file_path))

    # From here on file_path is the (relative) path of the PDF to render.
    file_path = dataset_file_name + '.pdf'

    pages = convert_from_path(file_path, 500)
    page_path = ''

    for page in pages:

        # The page directory is derived inside the loop so that page N is
        # stored in page<N>/ rather than every page landing in page1/.
        page_name_dir = dataset_file_name + '/page' + str(num_of_pages)

        try:
            os.makedirs(page_name_dir, exist_ok=True)
        except OSError:
            print("Can not create directory in page name directory!")

        page_jpg_path = page_name_dir + '/page' + str(num_of_pages) + '.jpg'
        page.save(page_jpg_path, 'JPEG')

        # NOTE(review): page_image is not used by any cell visible here; it is
        # kept because later cells of the notebook may read it.
        page_image = cv2.imread(page_jpg_path, cv2.IMREAD_GRAYSCALE)

        page_path = dataset_file_name + '/page' + str(num_of_pages) + '_blobs/'

        try:
            os.makedirs(page_path, exist_ok=True)
        except OSError:
            print("Can not create result directory!")

        # Presumably get_boxes() writes blob_<K>.jpg files into page_path and
        # returns how many it wrote (that is how the following cells consume
        # the result) - confirm against box_detection_engine.
        num_of_blobs = get_boxes(page_jpg_path, page_path, True)

        num_of_pages = num_of_pages + 1


In [7]:
# Show the value get_boxes() returned for the last page that was processed.
print(num_of_blobs)

34


# Optical Character Recognition (OCR) using Tesseract engine

In [8]:
# Run Tesseract on every blob image and store the recognised text.
#
# For page N, the text of   <dataset>/page<N>_blobs/blob_<K>.jpg
# is written (UTF-8) to     <dataset>/page<N>_blobs_ocr/blob_<K>_ocr.txt
#
# NOTE(review): `num_of_blobs` is one number shared by all pages (the value
# assigned for the last page during pre-processing). A page with fewer blobs
# therefore has missing blob_<K>.jpg files; those are reported and skipped
# instead of aborting the whole run. A page with MORE blobs than the last one
# is still only processed up to `num_of_blobs`.
for page_num in range(1, num_of_pages):

    ocr_results_path = dataset_file_name + '/page' + str(page_num) + '_blobs_ocr/'

    # exist_ok=True replaces the racy "check, then create" pattern.
    try:
        os.makedirs(ocr_results_path, exist_ok=True)
    except OSError:
        print("Can not create ocr results directory!")

    for blob_num in range(1, num_of_blobs + 1):

        blob_img_path = dataset_file_name + '/page' + str(page_num) + '_blobs/blob_' + str(blob_num) + '.jpg'

        if not os.path.isfile(blob_img_path):
            print("Blob image not found, skipping: " + blob_img_path)
            continue

        text = tesserocr.file_to_text(blob_img_path)

        # `with` guarantees the handle is closed even if write() raises;
        # plain 'w' is enough because the file is never read back here.
        with open(ocr_results_path + 'blob_' + str(blob_num) + '_ocr.txt', 'w', encoding='utf8') as f:
            f.write(text)

# Page Segmentation using Tesseract engine

In [9]:
#   pagesegmode values are:
#   0 = Orientation and script detection (OSD) only.
#   1 = Automatic page segmentation with OSD.
#   2 = Automatic page segmentation, but no OSD, or OCR
#   3 = Fully automatic page segmentation, but no OSD. (Default)
#   4 = Assume a single column of text of variable sizes.
#   5 = Assume a single uniform block of vertically aligned text.
#   6 = Assume a single uniform block of text.
#   7 = Treat the image as a single text line.
#   8 = Treat the image as a single word.
#   9 = Treat the image as a single word in a circle.
#   10 = Treat the image as a single character.
#
# No page segmentation mode is passed below, so the engine default applies.
#
# Cut every blob image into its text lines. For page N / blob K, each text
# line reported by Tesseract is cropped out of the grayscale blob image and
# written to
#   <dataset>/page<N>_blobs/blob_segments/blob_<K>/segment_<i>.jpg
#
# The API object is used as a context manager so the Tesseract handle is
# released when the cell finishes, including when an exception is raised.
with PyTessBaseAPI() as page_api:

    for page_num in range(1, num_of_pages):

        for blob_num in range(1, num_of_blobs + 1):

            blob_img_path = dataset_file_name + '/page' + str(page_num) + '_blobs/blob_' + str(blob_num) + '.jpg'

            # `num_of_blobs` is shared by all pages, so a page may have fewer
            # blob files than the range suggests; skip those instead of failing.
            if not os.path.isfile(blob_img_path):
                print("Blob image not found, skipping: " + blob_img_path)
                continue

            block_dir_path = dataset_file_name + '/page' + str(page_num) + '_blobs/blob_segments/blob_' + str(blob_num) + '/'

            try:
                os.makedirs(block_dir_path, exist_ok=True)
            except OSError:
                print("Can not create blob segment directory!")

            page_api.SetImageFile(blob_img_path)

            # One entry per detected text line: (PIL image, box dict, _, _).
            blocks = page_api.GetComponentImages(RIL.TEXTLINE, True)

            blob_image = cv2.imread(blob_img_path, cv2.IMREAD_GRAYSCALE)

            # cv2.imread returns None (it does not raise) when it cannot decode a file.
            if blob_image is None:
                print("Can not read blob image, skipping: " + blob_img_path)
                continue

            # NOTE(review): this is an alias of blob_image, not a copy, and it
            # is not used in the cells visible here; kept for later cells.
            blob_image_draw = blob_image

            for i, (im, block, _, _) in enumerate(blocks):
                block_x, block_y, block_w, block_h = block['x'], block['y'], block['w'], block['h']
                # Rows are indexed by y, columns by x.
                block_image = blob_image[block_y:block_y + block_h, block_x:block_x + block_w]
                cv2.imwrite(block_dir_path + 'segment_' + str(i) + '.jpg', block_image)


In [11]:
# Page Segmentation test
#
# Runs Tesseract text-line segmentation on one rendered page and prints, for
# every detected line, its bounding box, the mean confidence and the text
# recognised inside that box.
#
# Both the image and the API are context managers, so they are released even
# if recognition raises part-way through.
with Image.open('./../../res/data/AICST-dataset/MF/test_files/118354-JFK4825_303820171123032706-Page(2)/page1/page1.jpg') as image:
    with PyTessBaseAPI() as api:
        api.SetImage(image)
        boxes = api.GetComponentImages(RIL.TEXTLINE, True)
        print("Found " + str(len(boxes)) + " textline image components.")
        for i, (im, box, _, _) in enumerate(boxes):
            # im is a PIL image object
            # box is a dict with x, y, w and h keys
            # Restrict recognition to the current text line only.
            api.SetRectangle(box['x'], box['y'], box['w'], box['h'])
            ocrResult = api.GetUTF8Text()
            conf = api.MeanTextConf()
            print("Box[{}]: x={}, y={}, w={}, h={}, confidence: {}\n text: {}".format(
                i, box['x'], box['y'], box['w'], box['h'], conf, ocrResult))

Found 60 textline image components.
Box[0]: x=377, y=14, w=3645, h=178, confidence: 79
 text: 724|JFK| |||||||||||||||||||||||||||||||

Box[1]: x=400, y=228, w=2040, h=55, confidence: 83
 text: Shipper's Name and Address Shipper's Account Number Not Negotiable

Box[2]: x=376, y=281, w=2206, h=117, confidence: 88
 text: JFK4825 Air Waybill

 

Box[3]: x=374, y=459, w=2269, h=63, confidence: 88
 text: Concordia International Forwarding Corp. ||ssuedby SWISS

Box[4]: x=378, y=547, w=791, h=49, confidence: 90
 text: 155-37 145th AVENUE

Box[5]: x=376, y=635, w=3117, h=60, confidence: 95
 text:  

Box[6]: x=2198, y=686, w=1342, h=41, confidence: 85
 text: Copies 1,2 and 3 of this Air Waybill are originals and have the same validy

Box[7]: x=400, y=737, w=3592, h=63, confidence: 87
 text: Consignee's Name and Address Consignee's AceountNumber It is agreed that the goods described herein are accepted in apparent good order and condition

Box[8]: x=2174, y=810, w=1815, h=41, confidence: 88
 te