# In [1]:
#######
# This script takes the IBM DeepSearch parsed JSON and converts it into text.
# The text is extracted at page level; it can be customized to get the full document text as well
# by changing how the per-page text is assembled at the end of json_to_plain_text.
# Also, we observed that one file was not parsed properly by IBM DeepSearch because it is a scanned document:
# IBM DeepSearch only found the picture elements in it.
# The file is GDC-submission_German-Youth-IGF.
# For such files, tesseract and pdf2image are used to extract the text.
#####


import json
import os
import shutil
from pdf2image import convert_from_path
#installed poppler using conda install -c conda-forge poppler
from PIL import Image
import pytesseract
# Placeholder value: replace it with the path to the local tesseract executable
# before running the OCR fallback (ocr_to_text).
pytesseract.pytesseract.tesseract_cmd = r'set path to tesseract executable'

from pydantic import BaseModel
class DeepSearchData(BaseModel):
    '''
    Pydantic wrapper around one decoded IBM DeepSearch JSON document.

    The script builds it as DeepSearchData(data=<decoded JSON>) and
    json_to_plain_text reads the payload back through the ``data`` attribute.
    '''

    # Decoded JSON document; json_to_plain_text reads its 'main-text' list and
    # the containers named by the '$ref' entries (for example 'tables').
    data: dict
        
def ocr_to_text(pdf_file_path, text_output_directory=None, dpi=350,
                poppler_path=r'set path to poppler'):
    '''
    Runs OCR on a PDF and writes one plain-text file per page.

    Every page is rendered with pdf2image and passed to Tesseract (English);
    the recognised text of page N is written to "Page_N.txt".

    param: pdf_file_path - path of the PDF to run OCR on
    param: text_output_directory - directory that receives the Page_N.txt
           files (created if missing). When omitted, the module-level
           ``unparsed_doc`` set by the script's main section is used, so
           existing one-argument calls keep working.
    param: dpi - resolution passed to pdf2image when rendering the pages
    param: poppler_path - poppler location passed to pdf2image
    '''
    if text_output_directory is None:
        # Backward compatibility: the script's main section publishes the
        # target folder through this global instead of passing it in.
        text_output_directory = unparsed_doc

    pages = convert_from_path(pdf_file_path, dpi, poppler_path=poppler_path)

    # Create the output directory if it doesn't exist
    os.makedirs(text_output_directory, exist_ok=True)

    # The rendered pages are handed to Tesseract in memory. A shared scratch
    # folder such as "./images" would pull unrelated images lying in it into
    # the output and would be deleted wholesale afterwards.
    for page_number, page_image in enumerate(pages, start=1):
        text = pytesseract.image_to_string(page_image, lang='eng')

        text_file_path = os.path.join(text_output_directory, f"Page_{page_number}.txt")
        with open(text_file_path, "w", encoding="utf-8") as text_file:
            text_file.write(text)


def _item_to_text(value):
    '''
    Renders one collected page item as plain text.

    Strings are returned unchanged, so quotes, backslashes and newlines are
    not turned into repr escape sequences. Table cell lists keep the
    historical "'a', 'b'" rendering (the list repr without its brackets).
    '''
    if isinstance(value, str):
        return value
    if isinstance(value, list):
        return str(value)[1:-1]
    return "" if value is None else str(value)


def json_to_plain_text(raw_doc: "DeepSearchData"):
    '''
    Converts input json extracted from IBM deepsearch to plain text.

    param: raw_doc - object whose ``data`` attribute holds the json extracted
           from the IBM deepsearch PDF parsing operation
    return: list of strings, one per page that produced text, in the order
            the pages are first met in 'main-text'. Every item of a page is
            preceded by a newline. Pages that yield no text (for example
            picture-only pages) are left out, so the position in the list is
            not necessarily the page number.
    raises: ValueError when a table carries no page information and no page
            has been seen before it
    '''
    referenced_types = ("footnote", "page-footer", "page-header", "figure",
                        "table", "equation", "reference")

    pages = {}
    page_num = None  # page of the most recently processed item

    for section in raw_doc.data['main-text']:
        if "$ref" not in section:
            # Inline item: text and page information live on the section.
            page_num = section['prov'][0]['page']
            pages.setdefault(page_num, []).append(section['text'])
            continue

        if section["type"] not in referenced_types:
            continue

        # '$ref' has the form "#/<container>/<index>".
        ref_parts = section['$ref'].split('/')
        item_number = int(ref_parts[2])
        item_container = ref_parts[1]
        item = raw_doc.data[item_container][item_number]

        if item_container == "tables":
            # File the table under its own page. Only when the table has no
            # page information is the page of the preceding item reused.
            prov = item.get('prov') or []
            if prov:
                page_num = prov[0]['page']
            elif page_num is None:
                raise ValueError(
                    f"cannot determine the page of {section['$ref']}"
                )
            # The text is the last element of every cell record.
            table_items = [cell[-1] for cell in item['cells']['data']]
            pages.setdefault(page_num, []).append(table_items)
        else:
            page_num = item['prov'][0]['page']
            if item['type'] == 'picture':
                continue
            pages.setdefault(page_num, []).append(item['text'])

    return [
        "".join("\n" + _item_to_text(value) for value in page_items)
        for page_items in pages.values()
    ]


if __name__ == "__main__":
    # Input folder with the DeepSearch JSON results and output folder that
    # receives one sub-folder of Page_N.txt files per document.
    result_dir = "./results/"
    text_dir = "./text/"

    os.makedirs(text_dir, exist_ok=True)

    for file in os.listdir(result_dir):
        if not file.endswith(".json"):
            continue

        filepath = os.path.join(result_dir, file)
        # The context manager closes the handle; JSON is decoded as UTF-8
        # instead of the platform default encoding.
        with open(filepath, encoding="utf-8") as json_file:
            raw_doc = DeepSearchData(data=json.load(json_file))
        plain_text_pages = json_to_plain_text(raw_doc)

        # splitext only drops the ".json" suffix, so dots inside the document
        # name survive ("report.v2.json" -> "report.v2") and the folder name
        # still matches the PDF looked up below.
        text_file = os.path.splitext(file)[0]
        GDC_text_path = os.path.join(text_dir, text_file)
        # Create the directory if it doesn't exist
        os.makedirs(GDC_text_path, exist_ok=True)

        for i, page in enumerate(plain_text_pages, start=1):
            with open(os.path.join(GDC_text_path, f"Page_{i}.txt"), "w", encoding="utf-8") as t:
                t.write(page)

    # Documents whose folder stayed empty produced no text from the JSON;
    # run OCR on the matching PDF instead.
    for dirs in os.listdir(text_dir):
        candidate_dir = os.path.join(text_dir, dirs)
        # Stray files in text_dir are skipped: os.listdir on a file raises.
        if not os.path.isdir(candidate_dir) or os.listdir(candidate_dir):
            continue

        # ocr_to_text picks up its output folder from this module-level name.
        unparsed_doc = candidate_dir
        unparsed_actual_path = f"./data/{dirs}" + ".pdf"
        ocr_to_text(unparsed_actual_path)