In [20]:
import os
import pickle
import pandas as pd
from tqdm import tqdm

from doctr.models import ocr_predictor
from doctr.io import DocumentFile

In [22]:
OCR_DATA_PATH  = './../../data/ocr/docbank/images/'
TXT_DATA_PATH  = './../../results/ocr/linknet_master/'
image_data_dir = os.listdir(OCR_DATA_PATH)

# model = ocr_predictor(det_arch='linknet_resnet18', reco_arch='master', pretrained=False)
model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)

In [43]:
with open('./../../results/ocr/dbnet_vgg16/results.pkl', 'rb') as f:
    results = pickle.load(f)

In [41]:
results = {}

for image_file in tqdm(image_data_dir):
    doc = DocumentFile.from_images(OCR_DATA_PATH + image_file)
    result = model(doc)
    results[image_file] = result
#     break
    
# with open(TXT_DATA_PATH + 'results.pkl', 'wb') as outp:  # Overwrites any existing file.
#     pickle.dump(results, outp, pickle.HIGHEST_PROTOCOL)

  9%|████████████▍                                                                                                                              | 9/101 [00:57<09:43,  6.34s/it]


KeyboardInterrupt: 

In [50]:
sentences = []
for image,result in tqdm(results.items()):
    dim = tuple(reversed(result.pages[0].dimensions))
    predictions = []
    block_id = 0
    for block in result.pages[0].blocks:
        sentence = []
        line_id = 0
        for line in block.lines: 
            for word in line.words:
                sentence.append(word.value)
        sentences.append(' '.join(sentence))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 3510.14it/s]


In [56]:
sentences[136]

'Yukawa and Scalar Theories'

In [53]:
import xml.etree.ElementTree as ET

def doctr_to_hocr(doc, image_path, page_num=1):
    # Create an XML element for the HTML page
    page = ET.Element('html')
    page.set('xmlns', 'http://www.w3.org/1999/xhtml')

    # Create an XML element for the page body
    body = ET.SubElement(page, 'body')

    # Add the OCR results to the page body
    for block in doc.blocks:
        block_elem = ET.SubElement(body, 'div')
        block_elem.set('class', 'ocr_carea')
        for line in block.lines:
            line_elem = ET.SubElement(block_elem, 'span')
            line_elem.set('class', 'ocr_line')
            for word in line.words:
                word_elem = ET.SubElement(line_elem, 'span')
                word_elem.set('class', 'ocrx_word')
                word_elem.text = word.value

    # Add the image to the page
    image_elem = ET.SubElement(body, 'img')
    image_elem.set('src', image_path)
    image_elem.set('alt', '')
    image_elem.set('class', 'ocr_page')
    image_elem.set('title', 'image ' + image_path + '; page ' + str(page_num))
#     image_elem.set('title', 'image {}; page {}', format(image_path, str(page_num)))

    # Serialize the XML element to a string and return it
    hocr = ET.tostring(page, encoding='unicode')
    return hocr


def doctr_to_hocr_with_bbox(doc, image_path, iou_thresh=0.5):
    # Initialize the HOCR string with the HTML header and the OCR page element
    hocr = '''<!DOCTYPE html>
<html>
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name='ocr-system' content='doctr' />
</head>
<body>
<div class='ocr_page' id='page_1' title='image "{0}"'>
'''.format(image_path)

    # Add the OCR line and word elements to the HOCR string
    for page_num, page in enumerate(doc.pages):
        # Initialize the text and bbox lists for the page
        text_list = []
        bbox_list = []
        
        for block in page.blocks:
            # Initialize the text and bbox lists for the block
            block_text_list = []
            block_bbox_list = []
            
            for line in block.lines:
                # Initialize the text and bbox lists for the line
                line_text_list = []
                line_bbox_list = []
                
                for word in line.words:
                    # Add the text and bbox of the word to the line lists
                    line_text_list.append(word.value)
                    line_bbox_list.append(word.geometry.bbox)
                    
                # Combine the text and bbox lists for the line into a single string
                line_text = ' '.join(line_text_list)
                line_bbox = ' ; '.join(['{} {} {} {}'.format(*bbox) for bbox in line_bbox_list])

                # Add the OCR line element to the HOCR string
                line_elem = '<span class="ocr_line" title="bbox {}" >{}</span>'.format(line_bbox, line_text)
                block_text_list.append(line_text)
                block_bbox_list.append(line_bbox)
                hocr += line_elem + '\n'
            
            # Combine the text and bbox lists for the block into a single string
            block_text = ' '.join(block_text_list)
            block_bbox = ' ; '.join(['{} {} {} {}'.format(*bbox) for bbox in block_bbox_list])

            # Add the OCR block element to the HOCR string
            block_elem = '<div class="ocr_carea" title="bbox {}">{}</div>'.format(block_bbox, block_text)
            hocr += block_elem + '\n'
            
            # Add the text and bbox of the block to the page lists
            text_list.append(block_text)
            bbox_list.append(block_bbox)

        # Combine the text and bbox lists for the page into a single string
        page_text = ' '.join(text_list)
        page_bbox = ' ; '.join(['{} {} {} {}'.format(*bbox) for bbox in bbox_list])
        
        # Add the OCR page element to the HOCR string
        image_elem = '<img class="ocr_page" src="{}" title="image {}; page {}" alt="" />'.format(image_path, image_path, page_num+1)
        page_elem = '<div class="ocr_page" id="page_{}" title="image {}; bbox {}" >{}{}</div>'.format(
            page_num+1, image_path, page_bbox, image_elem, page_text)
        hocr += page_elem + '\n'

    # Add the HTML footer to the HOCR string
    hocr += '</div>\n</body>\n</html>'

    return hocr



In [57]:
result

Document(
  (pages): [Page(
    dimensions=(2200, 1700)
    (blocks): [
      Block(
        (lines): [Line(
          (words): [
            Word(value='VI.', confidence=0.99),
            Word(value='CONCLUSION', confidence=0.99),
          ]
        )]
        (artefacts): []
      ),
      Block(
        (lines): [
          Line(
            (words): [
              Word(value='In', confidence=1.0),
              Word(value='this', confidence=1.0),
              Word(value='paper,', confidence=1.0),
              Word(value='a', confidence=1.0),
              Word(value='new', confidence=1.0),
              Word(value='approach', confidence=1.0),
              Word(value='to', confidence=1.0),
              Word(value='generate', confidence=1.0),
              Word(value='algorithms', confidence=0.99),
              Word(value='with', confidence=0.53),
            ]
          ),
          Line(
            (words): [
              Word(value='chaotic', confidence=1.0),
           

In [54]:
hocr = doctr_to_hocr_with_bbox(result, image_file, 0.5)

AttributeError: 'tuple' object has no attribute 'bbox'

In [46]:
def save_hocr(hocr, output_path):
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(hocr)

from bs4 import BeautifulSoup

def hocr_to_html(hocr):
    # Parse the HOCR string as HTML using BeautifulSoup
    soup = BeautifulSoup(hocr, 'html.parser')

    # Remove HOCR-specific metadata from the HTML
    for tag in soup(['ocrx_word', 'ocr_line', 'ocr_carea', 'ocr_par', 'ocr_page']):
        tag.unwrap()

    # Return the cleaned HTML
    return str(soup)

In [47]:
html = hocr_to_html(hocr)

In [48]:
save_hocr(html, 'sample.html')

In [15]:
with open('../results_mobilenet.pkl', 'rb') as f:
    results = pickle.load(f)

total = {}
for image,result in tqdm(results.items()):
    dim = tuple(reversed(result.pages[0].dimensions))
    predictions = []
    block_id = 0
    for block in result.pages[0].blocks:
        line_id = 0
        for line in block.lines:
            for word in line.words:
                values = []
                geo = word.geometry
                a = list(int(a*b) for a,b in zip(geo[0],dim))
                b = list(int(a*b) for a,b in zip(geo[1],dim))
                values.append(block_id)
                values.append(line_id)
                values.append(word.confidence)
                values.append(a[0])
                values.append(a[1])
                values.append(b[0])
                values.append(b[1])
                values.append(word.value)
                predictions.append(values)
            line_id += 1
        block_id += 1
    total[image] = predictions
    
    
for image,result in tqdm(total.items()):
    name = image[:len(image) - 4]
    df = pd.DataFrame(result, columns = ['block', 'line', 'confidence', 'X1', 'Y1', 'X2', 'Y2', 'token'])
    df.to_csv(TXT_DATA_PATH + name + '.txt', sep=' ',index=False)

100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 1163.41it/s]
100%|███████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 558.33it/s]
