# Import libraries

In [1]:
import tesserocr
import tempfile
import img2pdf
import cv2
import os
import numpy as np
from PIL import Image
from pdf2image import convert_from_path
from tesserocr import PyTessBaseAPI, RIL, PSM, OEM # to call different API functions of tesseract

In [2]:
# Import Pre-processing functions

import sys
# Put the sibling Pre-Processing directory at the front of the module search
# path. The path is relative, so it is resolved against the kernel's current
# working directory - run the notebook from its own folder.
sys.path.insert(0, '../Pre-Processing')

# Project-local helpers; expected to be importable through the path entry above.
from pdf_parser_engine import get_text
from whitespace_margin_remover import get_roi
from skew_correction_engine import rotate_image
from adaptive_binarisation_engine import get_binarised_image
from box_detection_engine import get_boxes

# Load the File

In [3]:
# INPUT DOCUMENT: PARENT DIRECTORY + FILE NAME

# Directory (relative to the notebook) that holds the sample documents.
file_parent_dir = "./../../res/data/AICST-dataset/MF/test_files/"

# Alternative sample documents - uncomment exactly one assignment to switch.
#file_name = "117438-4312_327220171005155635-Page(1).pdf"
#file_name = "117540-232-67215820_160620171011155112-Page(1).pdf"
#file_name = "117568-MAWB014-45783043_333320171013030906-Page(1).pdf"
#file_name = "117586-[Untitled]_303520171013155622-Page(2).pdf"
file_name = "118354-JFK4825_303820171123032706-Page(2).pdf"

# Absolute, normalised path of the selected document.
file_path = os.path.abspath(os.path.join(file_parent_dir, file_name))


#  Check File Type - PDF or any other type

In [4]:
# SPLIT THE FILE NAME INTO STEM AND EXTENSION
# (the extension keeps its leading dot, e.g. '.pdf')

file_name_witout_ext, file_ext = os.path.splitext(file_name)

#  Check PDF Type - Searchable (OCR not required) OR Non-Searchable (OCR required)

In [5]:
# CHECK IF SEARCHABLE PDF OR NOT

#print(file_path)

# For now, the OCR engine runs on all PDFs irrespective of whether they are
# searchable or not, so do_ocr stays True on every path below.
do_ocr = True
if file_ext == '.pdf':
    pdfOutput = get_text(file_path)
    # get_text's return type is not visible from this notebook. Besides the
    # stringified empty-bytes form "b''" that was checked originally, also
    # treat real empty bytes, an empty string and None as "no embedded text".
    if pdfOutput is None or pdfOutput in ("b''", b"", ""):
        print("Non Searchable PDF!")
    else:
        print(pdfOutput)

b'Other Charges\nRequested Flight/Date\nTax\nTotal Collect\nNot Negotiable\nIssuing Carrier\'s Agent Name and City\nDeclared Value for Customs\nDeclared Value for Carriage\nCharge\nValuation Charge\nExecuted on (date)\nINSURANCE - If carrier offers insurance and such insurance is\nrequested in accordance with the conditions thereof, indicate amount\nto be insured in figures in box marked "Amount of Insutance".\nTotal Other Charges Due Agent\nPrepaid\nCC Charges in Dest. Currency\nCurrency Conversion Basis\nTotal Prepaid\nCollect\nSignature of Shipper or its Agent\nat (place)\nTotal Other Charges Due Carrier\nOriginal Shippers Information\nConsignee\'s Account Number\nAir Waybill\nCopies 1,2 and 3 of this Air Waybill are originals and have the same validy\nShipper\'s Name and Address\nShipper\'s Account Number\nIssued by\nConsignee\'s Name and Address\nAgent\'s IATA Code\nAccount No.\nAirport of Departure (Addr of First Carrier) and Requesting Routing\nReference Number\nCurrency\nWT/VAL

# Pre-Processing

In [6]:
def _ensure_dir(dir_path, failure_message):
    """Create dir_path (including parents) if it does not exist yet.

    An OSError is reported by printing failure_message and is not re-raised,
    which keeps the notebook's existing print-and-continue behaviour.
    """
    try:
        # exist_ok=True replaces the racy "exists() then makedirs()" sequence.
        os.makedirs(dir_path, exist_ok=True)
    except OSError:
        print(failure_message)


# Resolution (DPI) used when rasterising PDF pages.
PAGE_RENDER_DPI = 500

# Defined unconditionally so the later cells, which loop over
# range(1, num_of_pages), simply do nothing instead of raising NameError
# if OCR is ever skipped.
dataset_file_name = file_parent_dir + file_name_witout_ext
num_of_pages = 1
num_of_blobs = 1

if do_ocr:

    _ensure_dir(dataset_file_name, "Can not create directory in dataset directory!")

    # Non-PDF inputs (images) are first wrapped into a PDF next to the dataset
    # directory so that every input goes through the same page pipeline.
    if file_ext != '.pdf':
        with open(dataset_file_name + '.pdf', 'wb') as f:
            print(file_path)
            f.write(img2pdf.convert(file_path))

    # From here on file_path points at the PDF that is actually rasterised.
    file_path = dataset_file_name + '.pdf'

    pages = convert_from_path(file_path, dpi=PAGE_RENDER_DPI)
    page_path = ''

    for page_number, page in enumerate(pages, start=1):

        # One directory per page. Previously this path was computed once
        # before the loop, so every page image ended up under 'page1'.
        page_name_dir = dataset_file_name + '/page' + str(page_number)
        _ensure_dir(page_name_dir, "Can not create directory in page name directory!")

        page_jpg_path = page_name_dir + '/page' + str(page_number) + '.jpg'
        page.save(page_jpg_path, 'JPEG')

        # Not used by the visible cells; kept in case later cells rely on it.
        page_image = cv2.imread(page_jpg_path, cv2.IMREAD_GRAYSCALE)

        # Directory that get_boxes is given for this page's blob images.
        page_path = dataset_file_name + '/page' + str(page_number) + '_blobs/'
        _ensure_dir(page_path, "Can not create result directory!")

        # NOTE(review): num_of_blobs is overwritten on every iteration, so after
        # the loop it only reflects the last page; the later cells apply that
        # single count to all pages.
        num_of_blobs = get_boxes(page_jpg_path, page_path, True)

        # The later cells iterate range(1, num_of_pages), i.e. this ends at
        # (number of pages + 1).
        num_of_pages = page_number + 1


In [7]:
# num_of_blobs is assigned inside the per-page loop of the pre-processing cell,
# so the value shown here is the one from the last page processed.
print(num_of_blobs)

39


# Optical Character Recognition (OCR) using Tesseract engine

In [8]:
# Run Tesseract on every blob image of every page and store the recognised
# text as <page>_blobs_ocr/blob_<n>_ocr.txt.
for page_num in range(1, num_of_pages):

    page_blobs_dir = dataset_file_name + '/page' + str(page_num) + '_blobs/'
    ocr_results_path = dataset_file_name + '/page' + str(page_num) + '_blobs_ocr/'

    try:
        # exist_ok=True replaces the racy "exists() then makedirs()" sequence.
        os.makedirs(ocr_results_path, exist_ok=True)
    except OSError:
        print("Can not create ocr results directory!")

    for blob_num in range(1, num_of_blobs + 1):

        blob_img_path = page_blobs_dir + 'blob_' + str(blob_num) + '.jpg'

        # The same num_of_blobs is applied to every page, so a page can have
        # fewer blob images than the range covers; skip those instead of
        # aborting the whole run.
        if not os.path.exists(blob_img_path):
            print("Blob image not found, skipping: " + blob_img_path)
            continue

        text = tesserocr.file_to_text(blob_img_path)

        ocr_text_path = ocr_results_path + 'blob_' + str(blob_num) + '_ocr.txt'
        # The context manager closes the file even if the write fails.
        with open(ocr_text_path, 'w', encoding='utf8') as f:
            f.write(text)

# Page Segmentation

In [9]:
# Split every blob image into text-line segments and save each one as
# <page>_blobs/blob_segments/blob_<n>/segment_<i>.jpg.
#
# The API object is used as a context manager so the underlying Tesseract
# instance is released once segmentation finishes (or fails).
with PyTessBaseAPI() as page_api:

    for page_num in range(1, num_of_pages):

        page_blobs_dir = dataset_file_name + '/page' + str(page_num) + '_blobs/'

        for blob_num in range(1, num_of_blobs + 1):

            blob_img_path = page_blobs_dir + 'blob_' + str(blob_num) + '.jpg'

            # The same num_of_blobs is applied to every page, so a page can
            # have fewer blob images than the range covers; skip those
            # instead of aborting the whole run.
            if not os.path.exists(blob_img_path):
                print("Blob image not found, skipping: " + blob_img_path)
                continue

            block_dir_path = page_blobs_dir + 'blob_segments/blob_' + str(blob_num) + '/'

            try:
                # exist_ok=True replaces the racy "exists() then makedirs()".
                os.makedirs(block_dir_path, exist_ok=True)
            except OSError:
                print("Can not create blob segment directory!")

            page_api.SetImageFile(blob_img_path)

            # Each entry is unpacked below as a 4-tuple whose second item is a
            # dict with the bounding-box keys 'x', 'y', 'w' and 'h'.
            blocks = page_api.GetComponentImages(RIL.TEXTLINE, True)

            blob_image = cv2.imread(blob_img_path, cv2.IMREAD_GRAYSCALE)

            # Alias not used by the visible cells; kept in case later cells
            # rely on it.
            blob_image_draw = blob_image

            for i, (_, block, _, _) in enumerate(blocks):
                block_x, block_y, block_w, block_h = block['x'], block['y'], block['w'], block['h']
                # Crop the text line out of the grayscale blob (rows = y, cols = x).
                block_image = blob_image[block_y:block_y+block_h, block_x:block_x+block_w]
                cv2.imwrite(block_dir_path + 'segment_' + str(i) + '.jpg', block_image)
