 # Layout_analysis_fn.py Control file

Calls the function get_lines_and_info(<filepath.pdf>,<(optional: Int for max no of pages>) from  Eyzo_layout_analysis_fn.py. 

Returns a dataframe containing all: 
```[pages numbers, box numbers, box positions, line numbers, previous line fontsize (non-spacelike), currentline fontsize (non-spacelike), text of line]```
contained in the TextBoxes of pdfminer.six layout tree, see: [pdfminer.six documentation](https://pdfminersix.readthedocs.io/en/latest/topic/converting_pdf_to_text.html)

In [33]:
"""
Function that takes a pdf file and runs it trough pdfminer.six to parse the
LTTextBox objects of the LAParams class returning a TODO: dataframe or dict?
containing the page numbers, box numbers, box position, line numbers, font
size of current and previous line in the text. See documentation at:
https://pdfminersix.readthedocs.io/en/latest/topic/converting_pdf_to_text.html
"""

from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTChar, LTTextBoxHorizontal
import pandas as pd

#set maxpage to number to only parse part of file for debugging
def get_lines_and_info(filename,maxpage=False):

    file = open(filename,'rb')
    columns = [
            'page_no',
            'box_no',
            'box_pos',
            'line_no',
            'previous_linefontsize',
            'curr_linefontsize',
            'text'
    ]

    data = {key : [] for key in columns}
    fontsize = None #Set fontsize to none for previous_linefontsize on 1st entry
    pagenumber = 0

    # parse over all textboxes and lines contained in them
    for page_layout in extract_pages(file):
        pagenumber += 1
        for element in page_layout:
            if isinstance(element, LTTextBoxHorizontal):
                lineno=0
                for text_line in element:
                    lineno += 1
                    #output line data
                    data['page_no'].append(pagenumber)
                    data['box_no'].append(element.index)
                    data['box_pos'].append(element.bbox)
                    data['line_no'].append(lineno)
                    data['previous_linefontsize'].append(fontsize)
                    data['text'].append(text_line.get_text())
                    
                    #Recording fontsize of first nonspace character in line
                    for character in text_line:
                        if isinstance(character,LTChar):
                            if character.get_text().isspace() :
                                fontsize = None
                                continue
                            else:
                                #print('fontsize {}'.format(character.size))
                                ##print('firstchar {}'.format(character))
                                fontsize = character.size
                                data['curr_linefontsize'].append(fontsize)
                                break
                        data['curr_linefontsize'].append(fontsize)
        if pagenumber>maxpage and maxpage != False:
            break

    #populate dataframe with Data
    df = pd.DataFrame(data)
    df.name = filename
    print(df.head())
    return df

In [34]:
#from Eyzo_layout_analysis_fn import get_lines_and_info

infile = 'Tier-2-5-sponsor-guidance_Jul-2020_v1.0.pdf'
df = get_lines_and_info(infile,12)

   page_no  box_no                                          box_pos  line_no  \
0        1       0  (70.944, 658.837, 513.5191199999999, 761.78896)        1   
1        1       0  (70.944, 658.837, 513.5191199999999, 761.78896)        2   
2        1       0  (70.944, 658.837, 513.5191199999999, 761.78896)        3   
3        1       0  (70.944, 658.837, 513.5191199999999, 761.78896)        4   
4        1       1     (70.944, 522.37, 525.5520000000005, 657.988)        1   

   previous_linefontsize  curr_linefontsize  \
0                    NaN                NaN   
1                    NaN                NaN   
2                    NaN              26.04   
3                  26.04              26.04   
4                  26.04                NaN   

                                       text  
0                                        \n  
1                                        \n  
2  Tier 2 and 5: Guidance for Sponsors - \n  
3                              Addendum  \n  
4     

In [35]:
df

Unnamed: 0,page_no,box_no,box_pos,line_no,previous_linefontsize,curr_linefontsize,text
0,1,0,"(70.944, 658.837, 513.5191199999999, 761.78896)",1,,,\n
1,1,0,"(70.944, 658.837, 513.5191199999999, 761.78896)",2,,,\n
2,1,0,"(70.944, 658.837, 513.5191199999999, 761.78896)",3,,26.04,Tier 2 and 5: Guidance for Sponsors - \n
3,1,0,"(70.944, 658.837, 513.5191199999999, 761.78896)",4,26.04,26.04,Addendum \n
4,1,1,"(70.944, 522.37, 525.5520000000005, 657.988)",1,26.04,,\n
...,...,...,...,...,...,...,...
562,13,18,"(70.944, 89.69999999999999, 519.976, 248.96)",10,15.96,12.00,Information on how to complete a certificate o...
563,13,18,"(70.944, 89.69999999999999, 519.976, 248.96)",11,12.00,12.00,complete an application for a restricted certi...
564,13,19,"(70.944, 34.02, 144.38888, 58.083)",1,12.00,9.96,Page 13 of 209 \n
565,13,19,"(70.944, 34.02, 144.38888, 58.083)",2,9.96,,\n


NoneType