In [1]:
import os
from utilities import (extract_image_from_pdf, extract_xml_from_digital_pdf,
                       create_directory, read_directory_files, get_subdirectories,
                       get_string_xmltree, get_xmltree, get_specific_tags, get_page_texts_ordered,
                       get_page_text_element_attrib, get_ngram
                      )
from xml_document_info import (get_xml_info, get_xml_image_info)

from box_horizontal_operations import (merge_horizontal_blocks)
from box_vertical_operations import (merge_vertical_blocks)

import pandas as pd
from PIL import Image, ImageDraw, ImageFont
import copy

from ocr_text_utilities import extract_text_from_image

## Draw the boxes on document pages

In [2]:
def draw_box(filepath, desired_width, desired_height, df, color="green", save=False):
    image  = Image.open(filepath)
    image  = image.resize((desired_width, desired_height))
    draw   = ImageDraw.Draw(image)
    
    for index, row in df.iterrows():
        left   = int(row['text_left'])
        right  = int(row['text_width'] + left)
        top    = int(row['text_top'])
        bottom = int(row["text_height"] + top)
        
        draw.rectangle(((left, top), (right,bottom)), outline=color)
    save_filepath = os.path.join(os.path.dirname(filepath), 'processed_' + os.path.basename(filepath))
    if save:
        image.save(save_filepath)
    
    return image

def show_df(df):
    return df.head(df.shape[0])

## start of program

In [3]:
base_dir   = '/Users/kd/Workspace/python/github/document-structure/data/'
input_dir  = os.path.join(base_dir, 'input')
output_dir = os.path.join(base_dir, 'output')
filename   = '1506.02640.pdf'


In [4]:
pdf_filepath   = os.path.join(input_dir, filename)
working_dir    = os.path.join(output_dir, os.path.splitext(filename)[0])

ret            = create_directory(working_dir)

pdf_image_dir  = extract_image_from_pdf(pdf_filepath, working_dir)
pdf_xml_dir    = extract_xml_from_digital_pdf(pdf_filepath, working_dir)

xml_files      = read_directory_files(pdf_xml_dir, pattern='*.xml')
# print(xml_files, "\n\n")
image_files    = read_directory_files(pdf_image_dir, pattern='*-*.jpg')
# print(image_files)
print("Extracted xml & images of file: %s" % (filename))

Extracted xml & images of file: 1506.02640.pdf


In [6]:
xml_dfs, page_width, page_height = get_xml_info(xml_files[0])
img_dfs, page_width, page_height = get_xml_image_info(xml_files[0])


Total number of pages (10) in file (1506.02640.xml)
Total number of pages (10) in file (1506.02640.xml)


In [7]:
document_configs = {
    'LANGUAGE_TYPE': 'eng',
    
    'HORI_BLOCK_WDTH_DIFF_PERC': 0.85,
    'SUPERSCRIPT_HEIGHT_DIFFERENCE': 7.0,
    'HORI_SPACE_TOO_CLOSE': 10.0,
    
    'VERTICAL_SPACE_TOO_CLOSE': 5.0,
    'AVERAGE_VERTICAL_SPACE': 12.0,
    'LEFT_OR_RIGHT_ALIGNMENT_MARGIN': 20.0
}

In [8]:
file_index     = 0
img_filepath   = image_files[file_index]
df             = xml_dfs[file_index]
img_df         = img_dfs[file_index]

In [9]:
show_df(img_df)

Unnamed: 0,index,text_top,text_left,text_width,text_height,base64,attrib
0,0,294,463,111,71,b'iVBORw0KGgoAAAANSUhEUgAAArgAAAHACAIAAADGO4xJ...,IMAGE
1,1,294,707,111,71,b'iVBORw0KGgoAAAANSUhEUgAAArgAAAHACAIAAADGO4xJ...,IMAGE


In [31]:
in_df   = df #.loc[10:30]

h_df    = merge_horizontal_blocks(in_df, document_configs, debug=False)

if document_configs['LANGUAGE_TYPE']  == 'hin':
    h_df  = extract_text_from_image(img_filepath, page_width, page_height, h_df, 
                                    lang=document_configs['LANGUAGE_TYPE'])

# v_df    = merge_vertical_blocks(h_df, document_configs, debug=False)

# draw_box(img_filepath, page_width, page_height, new_df, color='red')
# show_df(h_df)


Unnamed: 0,xml_index,text_top,text_left,text_width,text_height,text,font_size,font_family,font_color,attrib,children
0,0,55,116,699,19,भाग II खण् ड (ii) भारत का रािपर : असाधारण 3,14,ABCDEE+Arial Unicode MS,#000000,,"{""index"":{""0"":0,""1"":1,""2"":2,""3"":3,""4"":4,""5"":5}..."
1,6,93,108,707,21,"का.आ. 1728(अ), किनांक 24 तई, 2019 के का.आ. 180...",16,ABCDEE+Arial Unicode MS,#000000,,
2,7,122,108,711,21,"किनांक 15 िुलाई, 2019 के का.आ. 2525 (अ) ...",16,ABCDEE+Arial Unicode MS,#000000,,
3,8,151,108,707,21,"29 अगस्ट्त, 2019 के का.आ. 3141 (अ) और किनांक 0...",16,ABCDEE+Arial Unicode MS,#000000,,
4,9,180,108,711,21,"2019 के का.आ. 3845 (अ) और किनांक 08 नवम्बर, 20...",16,ABCDEE+Arial Unicode MS,#000000,,
5,10,209,108,547,21,"का.आ. 4615 (अ) तर्ा 24 िनवरी, 2020 का का.आ. 35...",16,ABCDEE+Arial Unicode MS,#000000,,
6,11,266,253,419,13,MINISTRY OF AGRICULTURE AND FARMERS WELFARE,14,"Times New Roman,Bold",#000000,,
7,12,292,253,416,13,"(Department of Agriculture, Cooperation and Fa...",14,"Times New Roman,Bold",#000000,,
8,13,318,402,117,13,NOTIFICATION,14,"Times New Roman,Bold",#000000,,
9,14,343,355,212,13,"New Delhi, the 31st January, 2020",14,Times New Roman,#000000,,


In [29]:
children_df = pd.read_json(h_df.iloc[10]['children'])
children_df


Unnamed: 0,attrib,font_color,font_family,font_size,index,text,text_height,text_left,text_top,text_width,xml_index
0,,#000000,"Times New Roman,Bold",14,0,S.O. 488(E).—,13,171,369,95,15
1,,#000000,Times New Roman,14,1,In exercise of the powers conferred by ...,13,266,369,547,16


In [30]:
draw_box(img_filepath, page_width, page_height, p_df, color='green')

NameError: name 'p_df' is not defined

In [None]:
def transform_dataframe(df):
    for index, row in df.iterrows():
        