In [1]:
import os
from utilities import (extract_image_from_pdf, extract_xml_from_digital_pdf,
                       create_directory, read_directory_files, get_subdirectories,
                       get_string_xmltree, get_xmltree, get_specific_tags, get_page_texts_ordered,
                       get_page_text_element_attrib, get_ngram
                      )
from xml_document_info import (get_xml_info)

from box_horizontal_operations import (merge_horizontal_blocks)
from box_vertical_operations import (merge_vertical_blocks)

import pandas as pd
from PIL import Image, ImageDraw, ImageFont
import copy


## Draw the boxes on document pages

In [2]:
def draw_box(filepath, desired_width, desired_height, df, color="green", save=False):
    """Draw one rectangle per row of ``df`` on a resized copy of an image.

    Args:
        filepath: Path of the image file to open.
        desired_width: Width the image is resized to before drawing.
        desired_height: Height the image is resized to before drawing.
        df: DataFrame providing ``text_left``, ``text_top``, ``text_width``
            and ``text_height`` columns; each row yields one rectangle.
        color: Outline colour passed to ``ImageDraw.rectangle``.
        save: When true, the annotated image is also written into the same
            directory as ``filepath`` under the name ``processed_<basename>``.

    Returns:
        The resized image with all rectangles drawn on it.
    """
    # resize() hands back a new image object, so the source file can be closed
    # as soon as the resized copy exists. The target size is coerced to int in
    # the same way the box coordinates are below.
    with Image.open(filepath) as source:
        image = source.resize((int(desired_width), int(desired_height)))
    draw = ImageDraw.Draw(image)

    for _, row in df.iterrows():
        left   = int(row['text_left'])
        right  = int(row['text_width'] + left)
        top    = int(row['text_top'])
        bottom = int(row["text_height"] + top)

        draw.rectangle(((left, top), (right, bottom)), outline=color)

    # The output path is only needed when the caller asked for a file.
    if save:
        save_filepath = os.path.join(os.path.dirname(filepath), 'processed_' + os.path.basename(filepath))
        image.save(save_filepath)

    return image

def show_df(df):
    """Return all rows of ``df`` by asking ``head`` for the full row count."""
    row_count = len(df.index)
    return df.head(row_count)

## Start of program

In [3]:
# Root of the data tree. Set DOCUMENT_STRUCTURE_DATA_DIR to point the notebook
# at another location; the original local path remains the default.
base_dir   = os.environ.get('DOCUMENT_STRUCTURE_DATA_DIR',
                            '/Users/kd/Workspace/python/github/document-structure/data')
input_dir  = os.path.join(base_dir, 'input')    # the PDF named below is read from here
output_dir = os.path.join(base_dir, 'output')   # per-document working directories go here
filename   = '12_2019_9_1503_21076_Judgement_02-Mar-2020.pdf'   # PDF processed by this run


In [7]:
# Resolve the input PDF and a per-document working directory under output_dir,
# named after the PDF with its extension dropped.
pdf_filepath   = os.path.join(input_dir, filename)
working_dir    = os.path.join(output_dir, os.path.splitext(filename)[0])

# The return value is kept in `ret` but not used again in this cell.
ret            = create_directory(working_dir)

# Each helper receives the PDF path and the working directory and returns a
# directory path that is searched below.
# NOTE(review): presumably these write page images / XML into working_dir —
# confirm against the utilities module.
pdf_image_dir  = extract_image_from_pdf(pdf_filepath, working_dir)
pdf_xml_dir    = extract_xml_from_digital_pdf(pdf_filepath, working_dir)

# Collect the extracted files by glob-style pattern.
xml_files      = read_directory_files(pdf_xml_dir, pattern='*.xml')
# print(xml_files, "\n\n")
image_files    = read_directory_files(pdf_image_dir, pattern='*-*.jpg')
# print(image_files)
print("Extracted xml & images of file: %s" % (filename))

Extracted xml & images of file: 12_2019_9_1503_21076_Judgement_02-Mar-2020.pdf


In [8]:
# Parse the first extracted XML file. `xml_dfs` is indexed by position further
# down to obtain one frame at a time; `page_width` / `page_height` are the
# dimensions handed to draw_box when the boxes are visualised.
# NOTE(review): presumably one DataFrame per page — confirm in xml_document_info.
xml_dfs, page_width, page_height = get_xml_info(xml_files[0])

Total number of pages (11) in file (12_2019_9_1503_21076_Judgement_02-Mar-2020.xml)


In [9]:
# Thresholds passed to both merge_horizontal_blocks and merge_vertical_blocks.
# NOTE(review): units and exact meaning of each value are defined by those
# helpers — confirm there before tuning.
document_configs = dict(
    # presumably used when merging boxes on the same line
    HORI_BLOCK_WDTH_DIFF_PERC=0.85,
    SUPERSCRIPT_HEIGHT_DIFFERENCE=7.0,
    HORI_SPACE_TOO_CLOSE=10.0,

    # presumably used when merging lines into blocks
    VERTICAL_SPACE_TOO_CLOSE=5.0,
    AVERAGE_VERTICAL_SPACE=12.0,
    LEFT_OR_RIGHT_ALIGNMENT_MARGIN=20.0,
)

In [24]:
# Position (0-based) of the page to inspect in the cells that follow.
file_index = 0

# Fetch the image path and the text-box frame stored at that position.
# NOTE(review): assumes image_files and xml_dfs share the same ordering — confirm.
img_filepath, df = image_files[file_index], xml_dfs[file_index]


In [34]:
# Hand the merge helpers an independent copy of the selected frame. `df.loc[:]`
# does not guarantee independence from `df`, so any in-place change made by a
# helper could leak back and make this cell give different results on re-run.
in_df   = df.copy()

# Merge boxes horizontally first, then merge that result vertically.
h_df    = merge_horizontal_blocks(in_df, document_configs, debug=False)
v_df    = merge_vertical_blocks(h_df, document_configs, debug=False)

# Optional visual check: draw the merged boxes on the page image.
# draw_box(img_filepath, page_width, page_height, v_df, color='red')
show_df(v_df)


Unnamed: 0,xml_index,text_top,text_left,text_width,text_height,text,font_size,font_family,font_color,children
0,0,54,454,10,19,1,16,BAAAAA+DejaVuSans,#000000,
1,1,109,668,142,23,REPORTABLE,21,CAAAAA+LiberationSerif,#000000,
2,2,157,274,370,72,IN THE SUPREME COURT OF INDIA CRIMINAL APPELLA...,21,DAAAAA+LiberationSerif,#000000,"{""xml_index"":{""2"":2,""3"":3},""text_top"":{""2"":157..."
3,4,254,281,356,23,CRIMINAL APPEAL NO. 353 OF 2020,21,CAAAAA+LiberationSerif,#000000,
4,5,350,108,663,120,Satishkumar Nyalchand Shah .. Appellant Versus...,21,DAAAAA+LiberationSerif,#000000,"{""xml_index"":{""5"":5,""6"":7,""7"":8},""text_top"":{""..."
5,10,544,108,429,119,"J U D G M E N T M.R. Shah, J.",21,CAAAAA+LiberationSerif,#000000,"{""xml_index"":{""8"":10,""9"":11},""text_top"":{""8"":5..."
6,12,713,108,702,264,Feeling aggrieved and dissatisfied with the im...,21,DAAAAA+LiberationSerif,#000000,"{""xml_index"":{""10"":12,""11"":13,""12"":14,""13"":15,..."
7,24,968,78,69,7,Signature Not Verified,7,Helvetica,#000000,
8,18,986,78,731,35,Digitally signed by MEENAKSHI KOHLI Date: 2020...,21,Helvetica,#000000,
