In [1]:
import os
from utilities import (extract_image_from_pdf, extract_xml_from_digital_pdf, 
                        create_directory, read_directory_files, get_subdirectories,
                        get_string_xmltree, get_xmltree, get_specific_tags, get_page_texts_ordered,
                       get_page_text_element_attrib, get_ngram
                       )
from xml_document_info import (get_xml_info)
from box_spacings_operation import (update_horizontal_spacings_v1, update_vertical_spacings_v1)
from box_conditions_evaluation import (are_vlines_too_close, are_vlines_close_enough,
                                       are_hlines_too_close, are_hlines_close_enough,
                                       are_hlines_aligned, are_lines_fonts_similar,
                                       arrange_grouped_line_indices, are_hlines_superscript, get_lines_upper_lower,
                                       are_hlines, are_vlines, are_vlines_zero_overlap, are_vlines_full_overlap)

from box_horizontal_operation import (merge_hori_boxes_close, merge_hori_boxes)

import pandas as pd
from PIL import Image, ImageDraw, ImageFont
import copy


## Draw the boxes on document pages

In [2]:
def draw_box(filepath, desired_width, desired_height, df, color="green", save=False):
    """Outline every row of ``df`` as a rectangle on a resized copy of an image.

    Parameters
    ----------
    filepath : path of the image to open.
    desired_width, desired_height : size the image is resized to before drawing.
    df : frame providing ``text_left``, ``text_top``, ``text_width`` and
        ``text_height`` for each rectangle.
    color : outline color handed to ``rectangle``.
    save : when true, the annotated image is written next to the input file
        under the name ``processed_<original file name>``.

    Returns
    -------
    The resized image with the rectangles drawn on it.
    """
    canvas = Image.open(filepath).resize((desired_width, desired_height))
    pen    = ImageDraw.Draw(canvas)

    for _, box in df.iterrows():
        x0 = int(box['text_left'])
        # Far edges are measured from the already-truncated near edges.
        x1 = int(box['text_width'] + x0)
        y0 = int(box['text_top'])
        y1 = int(box["text_height"] + y0)

        pen.rectangle(((x0, y0), (x1, y1)), outline=color)

    folder, image_name = os.path.split(filepath)
    target = os.path.join(folder, 'processed_' + image_name)
    if save:
        canvas.save(target)

    return canvas

def show_df(df):
    """Return all rows of ``df`` by calling ``head`` with the frame's own row count."""
    total_rows = len(df)
    return df.head(total_rows)

## Start of program

In [3]:
# Root of the data tree. Set the DOCUMENT_STRUCTURE_DATA_DIR environment variable
# to run the notebook on another machine; when it is unset the original
# developer path is used, so existing runs behave as before.
base_dir   = os.environ.get('DOCUMENT_STRUCTURE_DATA_DIR',
                            '/Users/kd/Workspace/python/github/document-structure/data')
# PDFs are read from <base_dir>/input; everything generated goes under <base_dir>/output.
input_dir  = os.path.join(base_dir, 'input')
output_dir = os.path.join(base_dir, 'output')
# The document processed by the cells below.
filename   = '20695_2010_8_1501_20635_Judgement_17-Feb-2020.pdf'


In [4]:
# Locate the input PDF and derive a per-document working directory named after
# the PDF with its extension stripped.
pdf_filepath   = os.path.join(input_dir, filename)
working_dir    = os.path.join(output_dir, os.path.splitext(filename)[0])

# `ret` is not read again in the cells visible here.
ret            = create_directory(working_dir)

# Both helpers take the PDF and the working directory; their return values are
# used below as the directories to search for the extracted files.
pdf_image_dir  = extract_image_from_pdf(pdf_filepath, working_dir)
pdf_xml_dir    = extract_xml_from_digital_pdf(pdf_filepath, working_dir)

# Collect the extracted XML file(s); later cells parse xml_files[0].
xml_files      = read_directory_files(pdf_xml_dir, pattern='*.xml')
print(xml_files, "\n\n")

# Collect the extracted images; later cells pick one by index.
# NOTE(review): presumably one image per page in page order - confirm against read_directory_files.
image_files    = read_directory_files(pdf_image_dir, pattern='*-*.jpg')
print(image_files)

Extracted pdf images [/Users/kd/Workspace/python/github/document-structure/data/output/20695_2010_8_1501_20635_Judgement_17-Feb-2020/images] 


Extracted xml of digital PDF [/Users/kd/Workspace/python/github/document-structure/data/output/20695_2010_8_1501_20635_Judgement_17-Feb-2020/pdttohtml] 


['/Users/kd/Workspace/python/github/document-structure/data/output/20695_2010_8_1501_20635_Judgement_17-Feb-2020/pdttohtml/20695_2010_8_1501_20635_Judgement_17-Feb-2020.xml'] 


['/Users/kd/Workspace/python/github/document-structure/data/output/20695_2010_8_1501_20635_Judgement_17-Feb-2020/images/20695_2010_8_1501_20635_Judgement_17-Feb-20200001-01.jpg', '/Users/kd/Workspace/python/github/document-structure/data/output/20695_2010_8_1501_20635_Judgement_17-Feb-2020/images/20695_2010_8_1501_20635_Judgement_17-Feb-20200001-02.jpg', '/Users/kd/Workspace/python/github/document-structure/data/output/20695_2010_8_1501_20635_Judgement_17-Feb-2020/images/20695_2010_8_1501_20635_Judgement_17-Feb-202000

In [5]:
# Parse the first extracted XML file. `xml_dfs` is indexed by position in a later
# cell and the page size is unpacked alongside it.
# NOTE(review): presumably one DataFrame per page - confirm against get_xml_info.
xml_dfs, page_width, page_height = get_xml_info(xml_files[0])

Total number of pages (54) in file (20695_2010_8_1501_20635_Judgement_17-Feb-2020.xml)


In [6]:
# Thresholds handed as `configs` to the box-merging helpers.
# NOTE(review): units are not visible in this notebook - presumably the same
# units as the text_* coordinates; confirm against box_conditions_evaluation.
document_configs = dict(
    # horizontal merging
    HORI_BLOCK_WDTH_DIFF_PERC=0.85,
    SUPERSCRIPT_HEIGHT_DIFFERENCE=7.0,
    HORI_SPACE_TOO_CLOSE=10.0,
    # vertical merging
    VERTICAL_SPACE_TOO_CLOSE=5.0,
    AVERAGE_VERTICAL_SPACE=12.0,
    LEFT_OR_RIGHT_ALIGNMENT_MARGIN=20.0,
)

## Vertical box merging operations

In [7]:
def merge_vert_boxes(df, boxes, debug=False):
    """Collapse each connected group of lines into a single bounding box.

    Parameters
    ----------
    df : frame of text lines, addressed by position (``iloc``). It must provide
        ``text_top``, ``text_left``, ``text_width``, ``text_height``, ``text``,
        ``font_size``, ``font_family`` and ``font_color``.
    boxes : iterable of ``(line_indices, connection_type)`` pairs. Groups tagged
        ``'NOT_CONNECTED'`` are copied through line by line; any other tag
        merges the whole group into one row. Empty groups are skipped.
    debug : when true, print ``boxes`` before merging.

    Returns
    -------
    A new frame with the eight columns listed above, after it has been passed
    through ``update_horizontal_spacings_v1`` and ``update_vertical_spacings_v1``.

    Notes
    -----
    A merged row is the bounding box of *every* line in its group: the smallest
    top/left and the largest bottom/right. Measuring only the first and last
    line would clip a wider line in the middle of the group. The merged text is
    each line's text followed by a single space, and the font attributes are
    taken from the first line of the group.
    """
    columns = ['text_top', 'text_left', 'text_width', 'text_height', 'text',
               'font_size', 'font_family', 'font_color']
    records = []

    if debug:
        print('merge_vert_boxes: %s \n---------\n' % (str(boxes)))

    for line_indices, connection_type in boxes:
        # A plain list keeps ``iloc`` from treating a tuple as a (row, column) indexer.
        positions = list(line_indices)
        if not positions:
            continue

        if connection_type == 'NOT_CONNECTED':
            for position in positions:
                line = df.iloc[position]
                records.append(tuple(line[column] for column in columns))
            continue

        group  = df.iloc[positions]
        first  = group.iloc[0]

        top    = group['text_top'].min()
        left   = group['text_left'].min()
        bottom = (group['text_top'] + group['text_height']).max()
        right  = (group['text_left'] + group['text_width']).max()

        # Every line, including the last, is followed by one space.
        connected_text = ''.join(text + ' ' for text in group['text'])

        records.append((top, left, right - left, bottom - top, connected_text,
                        first['font_size'], first['font_family'], first['font_color']))

    box_df = pd.DataFrame(records, columns=columns)

    box_df = update_horizontal_spacings_v1(box_df)
    box_df = update_vertical_spacings_v1(box_df)

    return box_df


def merge_vert_boxes_close(df, configs, debug=False):
    """Merge neighbouring lines that ``are_lines_fonts_similar`` accepts.

    The input is copied and re-indexed from zero. Each pair of consecutive
    positions is labelled ``'CONNECTED'`` or ``'NOT_CONNECTED'``, the labels are
    grouped by ``arrange_grouped_line_indices`` and the groups are merged by
    ``merge_vert_boxes``, whose result is returned.

    NOTE(review): only font similarity is tested here, no spacing condition,
    despite the function name. A second pass that additionally required
    ``are_vlines`` was commented out in the original and is still disabled.
    """
    lines = df.copy().reset_index(drop=True)

    def _label(upper, lower):
        # The verdict for a pair is whatever truthiness the font check reports.
        similar = are_lines_fonts_similar(lines, configs, upper, lower, debug=debug)
        return 'CONNECTED' if similar else 'NOT_CONNECTED'

    # Each entry is (lower line, upper line, label).
    connections = [
        (pair[1], pair[0], _label(pair[0], pair[1]))
        for pair in get_ngram(list(lines.index.values), window_size=2)
    ]

    if debug:
        print("line connections (merge_vert_boxes_close) : %s \n----\n" % (str(connections)))

    groups = arrange_grouped_line_indices(connections, debug=debug)
    return merge_vert_boxes(lines, groups, debug=debug)



In [8]:
def merge_vert_boxes_add_merge_data(df, boxes, debug=False):
    """Merge connected line groups into bounding boxes and keep their source rows.

    Parameters
    ----------
    df : frame of text lines, addressed by position (``iloc``). It must provide
        ``text_top``, ``text_left``, ``text_width``, ``text_height``, ``text``,
        ``font_size``, ``font_family`` and ``font_color``.
    boxes : iterable of ``(line_indices, connection_type)`` pairs. Groups tagged
        ``'NOT_CONNECTED'`` are copied through line by line; any other tag
        merges the whole group into one row. Empty groups are skipped.
    debug : when true, print ``boxes`` before merging.

    Returns
    -------
    A new frame with ``text_top``, ``text_left``, ``text_width``,
    ``text_height``, ``text``, ``merge_data``, ``font_size``, ``font_family``
    and ``font_color``, after it has been passed through
    ``update_horizontal_spacings_v1`` and ``update_vertical_spacings_v1``.
    ``merge_data`` holds a copy of the source rows of each output row: a one-row
    slice that keeps its index for an unconnected line, and the group's rows
    re-indexed from zero for a merged box.

    Notes
    -----
    A merged row is the bounding box of every line in its group. Both the top
    and the height are measured from the same extremes (smallest top, largest
    bottom), so the box stays correct when the first line is not the topmost
    one or the last line is not the lowest one.
    """
    columns = ['text_top', 'text_left', 'text_width', 'text_height', 'text', 'merge_data',
               'font_size', 'font_family', 'font_color']
    records = []

    if debug:
        print('merge_vert_boxes_add_merge_data: %s \n---------\n' % (str(boxes)))

    for line_indices, connection_type in boxes:
        # A plain list keeps ``iloc`` from treating a tuple as a (row, column) indexer.
        positions = list(line_indices)
        if not positions:
            continue

        if connection_type == 'NOT_CONNECTED':
            for position in positions:
                line = df.iloc[position]
                records.append((line['text_top'], line['text_left'], line['text_width'],
                                line['text_height'], line['text'],
                                df.iloc[position:position + 1, :].copy(),
                                line['font_size'], line['font_family'], line['font_color']))
            continue

        group  = df.iloc[positions, :]
        first  = group.iloc[0]

        top    = group['text_top'].min()
        left   = group['text_left'].min()
        bottom = (group['text_top'] + group['text_height']).max()
        right  = (group['text_left'] + group['text_width']).max()

        # Every line, including the last, is followed by one space.
        connected_text = ''.join(text + ' ' for text in group['text'])

        records.append((top, left, right - left, bottom - top, connected_text,
                        group.copy().reset_index(drop=True),    # index reset for merged groups
                        first['font_size'], first['font_family'], first['font_color']))

    box_df = pd.DataFrame(records, columns=columns)

    box_df = update_horizontal_spacings_v1(box_df)
    box_df = update_vertical_spacings_v1(box_df)

    return box_df

def merge_vert_boxes_same_font(df, configs, debug=False):
    """Merge consecutive lines accepted by ``are_lines_fonts_similar``, keeping source rows.

    Works on a zero-indexed copy of ``df``. Each pair of consecutive positions
    gets a ``'CONNECTED'`` or ``'NOT_CONNECTED'`` label, the labels are grouped
    by ``arrange_grouped_line_indices`` and the groups are merged by
    ``merge_vert_boxes_add_merge_data``, whose result is returned.
    """
    lines     = df.copy().reset_index(drop=True)
    label_for = {True: 'CONNECTED', False: 'NOT_CONNECTED'}

    connections = []
    for gram in get_ngram(list(lines.index.values), window_size=2):
        upper, lower = gram[0], gram[1]
        verdict = are_lines_fonts_similar(lines, configs, upper, lower, debug=debug)
        # Entries are stored as (lower line, upper line, label).
        connections.append((lower, upper, label_for[bool(verdict)]))

    if debug:
        # NOTE(review): the label below names merge_vert_boxes_close, not this function.
        print("line connections (merge_vert_boxes_close) : %s \n----\n" % (str(connections)))

    groups = arrange_grouped_line_indices(connections, debug=debug)
    return merge_vert_boxes_add_merge_data(lines, groups, debug=debug)

In [9]:
# Pick a single page to inspect. The same position is used for the image list
# and for the per-page frames.
# NOTE(review): assumes both lists are in the same page order - confirm.
file_index = 3

img        = image_files[file_index]
df         = xml_dfs[file_index]


In [11]:
# Horizontal merge of the selected page. debug=True makes the helper print its
# diagnostics, which is where the "superscript::" / "are_hlines::" lines in the
# cell output come from.
h_df = merge_hori_boxes_close(df, document_configs, debug=True)

# Disabled experiments: the vertical merges and the box overlay for this page.
#v_df = merge_vert_boxes_close(h_df, document_configs, debug=False)
#h_df
# f_df= merge_vert_boxes_same_font(h_df, document_configs, debug=False)

# draw_box(img, page_width, page_height, h_df, color='red')
# The last expression is the cell's displayed result: the merged frame.
show_df(h_df)

superscript:: superscript_index: 7, text: 6
By  a  notification 5 

superscript:: superscript_index: 11, text: 10
“(i)  Corps of Signals,  

superscript:: superscript_index: 23, text: 22
was extended by a notification 6 

superscript:: superscript_index: 26, text: 25
of  Defence 7 

superscript:: superscript_index: 31, text: 30
On 28 October 2005,  a notification 8 

superscript:: superscript_index: 41, text: 40
Scheme (Officers) 9 

are_hlines:: idx1: 0, idx2: 1, space: 1141
are_hlines:: idx1: 1, idx2: 2, space: 1085
are_hlines:: idx1: 2, idx2: 3, space: 43
are_hlines:: idx1: 3, idx2: 4, space: 43
are_hlines:: idx1: 4, idx2: 5, space: 65
are_hlines:: idx1: 5, idx2: 6, space: 5
are_hlines_too_close:: idx1: 5, idx2: 6, space: 33
are_hlines:: idx1: 6, idx2: 7, space: 0
are_hlines:: idx1: 7, idx2: 8, space: 5
are_hlines:: idx1: 8, idx2: 9, space: 43
are_hlines:: idx1: 9, idx2: 10, space: 39
are_hlines:: idx1: 10, idx2: 11, space: 0
are_hlines_too_close:: idx1: 10, idx2: 11, space: 30
are_

Unnamed: 0,text_top,text_left,text_width,text_height,text,font_size,font_family,font_color,horizontal_space,vertical_space
0,56,730,58,14,PART A,14,Arial,#000000,0,0
1,112,108,682,17,This notification was to remain in force for a...,18,Arial,#000000,0,42
2,155,108,682,17,which it was published in the official ...,18,Arial,#000000,0,26
3,198,108,260,17,Gazette on 15 February 1992.,18,Arial,#000000,0,26
4,258,162,628,22,By a notification 5 dated 31 December 1...,18,Arial,#000000,0,43
5,263,108,21,17,4.,18,Arial,#000000,0,-17
6,306,108,558,17,enrollment in the following corps/departments ...,18,Arial,#000000,0,26
7,345,216,24,18,“(i),15,Arial,#000000,0,22
8,345,270,124,18,"Corps of Signals,",15,Arial,#000000,30,22
9,369,216,22,15,(ii),15,Arial,#000000,0,6
