 # Layout_analysis_fn.py Control file

Calls the function `get_lines_and_info(<filepath.pdf>, maxpage=<optional int: maximum number of pages>)` from `Eyzo_layout_analysis_fn.py`.

Returns a dict containing: 
```[page numbers, box numbers, box positions, line numbers, line positions, previous line fontsize (non-spacelike), current line fontsize (non-spacelike), text of line]```
contained in the TextBoxes of pdfminer.six layout tree, see: [pdfminer.six documentation](https://pdfminersix.readthedocs.io/en/latest/topic/converting_pdf_to_text.html)

In [1]:
# Standard library
import time

# Third-party
import pandas as pd

# Project-local layout-analysis helper
from Eyzo_layout_analysis_fn import get_lines_and_info

# PDF file that is analysed throughout this notebook.
infile = 'Tier-2-5-sponsor-guidance_Jul-2020_v1.0.pdf'

In [2]:
# Time the extraction with perf_counter(): a monotonic, high-resolution clock
# meant for measuring elapsed intervals (time.time() is wall-clock time and
# can jump when the system clock is adjusted).
# NOTE(review): the recorded output of this cell contains page_no 13 although
# maxpage=12 is requested -- confirm how get_lines_and_info interprets maxpage.
start = time.perf_counter()
out_dict = get_lines_and_info(infile, maxpage=12)
print(' --- {} --- '.format(time.perf_counter() - start))

# Turn the returned dict into a table; its keys become the columns.
df = pd.DataFrame(out_dict)
# Remember which file the frame came from. This is a plain attribute on this
# object, not a column of the data.
df.name = infile
df

 --- 9.906467199325562 --- 


Unnamed: 0,page_no,box_no,box_pos,line_no,line_pos,previous_linefontsize,curr_linefontsize,text
0,1,0,"(70.944, 658.837, 513.5191199999999, 761.78896)",1,"(70.944, 735.74896, 78.18312, 761.78896)",,,\n
1,1,0,"(70.944, 658.837, 513.5191199999999, 761.78896)",2,"(70.944, 719.01904, 75.38088, 734.9790399999999)",,,\n
2,1,0,"(70.944, 658.837, 513.5191199999999, 761.78896)",3,"(70.944, 688.837, 513.5191199999999, 714.877)",,26.04,Tier 2 and 5: Guidance for Sponsors - \n
3,1,0,"(70.944, 658.837, 513.5191199999999, 761.78896)",4,"(70.944, 658.837, 211.28912000000003, 684.877)",26.04,26.04,Addendum \n
4,1,1,"(70.944, 522.37, 525.5520000000005, 657.988)",1,"(70.944, 645.988, 74.28, 657.988)",26.04,,\n
...,...,...,...,...,...,...,...,...
562,13,18,"(70.944, 89.69999999999999, 519.976, 248.96)",10,"(70.944, 103.5, 480.3720000000004, 115.5)",15.96,12.00,Information on how to complete a certificate o...
563,13,18,"(70.944, 89.69999999999999, 519.976, 248.96)",11,"(70.944, 89.69999999999999, 484.40599999999995...",12.00,12.00,complete an application for a restricted certi...
564,13,19,"(70.944, 34.02, 144.38888, 58.083)",1,"(70.944, 47.61504, 144.38888, 58.083)",12.00,9.96,Page 13 of 209 \n
565,13,19,"(70.944, 34.02, 144.38888, 58.083)",2,"(70.944, 34.02, 74.28, 46.02)",9.96,,\n


## Getting fontsizes

First we round to the nearest whole number to get rid of any trailing decimals: the original data contains values like 11.99999999992, which should be fontsize 12. All decimal places are dropped.

In [81]:
# Count the lines per font size. Sizes are first rounded to the nearest whole
# number (e.g. 9.96 -> 10.0) so near-identical sizes share one bucket; rows
# whose font size is NaN are left out because groupby drops NaN keys by default.
out = (
    df['line_no']
    .groupby(df['curr_linefontsize'].round(decimals=0))
    .count()
)

In [82]:
# Display the number of lines per rounded font size.
out

curr_linefontsize
10.0     24
12.0    456
16.0      6
26.0      5
Name: line_no, dtype: int64

## Extracting the coordinates into separate columns for easy handling

In [44]:
# Split the 4-tuples in box_pos / line_pos into one column per coordinate
# (x0, y0, x1, y1). The helper frames are built on df.index: the column-wise
# assignment aligns rows by index label, so a helper frame with a fresh
# 0..n-1 index would produce misaligned or NaN values as soon as df no longer
# carries the default index (e.g. after filtering rows).
df[['x0_bx', 'y0_bx', 'x1_bx', 'y1_bx']] = pd.DataFrame(
    df['box_pos'].tolist(), index=df.index
)
df[['x0_ln', 'y0_ln', 'x1_ln', 'y1_ln']] = pd.DataFrame(
    df['line_pos'].tolist(), index=df.index
)

In [45]:
# Group the y1_ln values by page number. This only creates the groupby
# object; no aggregation is applied to it here.
# NOTE(review): the table recorded below this cell looks like the output of
# displaying `df`, not of this expression -- confirm what the cell should show.
df['y1_ln'].groupby(df['page_no'])

Unnamed: 0,page_no,box_no,box_pos,line_no,line_pos,previous_linefontsize,curr_linefontsize,text,x0_bx,y0_bx,x1_bx,y1_bx,x0_ln,y0_ln,x1_ln,y1_ln
0,1,0,"(70.944, 658.837, 513.5191199999999, 761.78896)",1,"(70.944, 735.74896, 78.18312, 761.78896)",,,\n,70.944,658.83700,513.51912,761.78896,70.944,735.74896,78.18312,761.78896
1,1,0,"(70.944, 658.837, 513.5191199999999, 761.78896)",2,"(70.944, 719.01904, 75.38088, 734.9790399999999)",,,\n,70.944,658.83700,513.51912,761.78896,70.944,719.01904,75.38088,734.97904
2,1,0,"(70.944, 658.837, 513.5191199999999, 761.78896)",3,"(70.944, 688.837, 513.5191199999999, 714.877)",,26.04,Tier 2 and 5: Guidance for Sponsors - \n,70.944,658.83700,513.51912,761.78896,70.944,688.83700,513.51912,714.87700
3,1,0,"(70.944, 658.837, 513.5191199999999, 761.78896)",4,"(70.944, 658.837, 211.28912000000003, 684.877)",26.04,26.04,Addendum \n,70.944,658.83700,513.51912,761.78896,70.944,658.83700,211.28912,684.87700
4,1,1,"(70.944, 522.37, 525.5520000000005, 657.988)",1,"(70.944, 645.988, 74.28, 657.988)",26.04,,\n,70.944,522.37000,525.55200,657.98800,70.944,645.98800,74.28000,657.98800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562,13,18,"(70.944, 89.69999999999999, 519.976, 248.96)",10,"(70.944, 103.5, 480.3720000000004, 115.5)",15.96,12.00,Information on how to complete a certificate o...,70.944,89.70000,519.97600,248.96000,70.944,103.50000,480.37200,115.50000
563,13,18,"(70.944, 89.69999999999999, 519.976, 248.96)",11,"(70.944, 89.69999999999999, 484.40599999999995...",12.00,12.00,complete an application for a restricted certi...,70.944,89.70000,519.97600,248.96000,70.944,89.70000,484.40600,101.70000
564,13,19,"(70.944, 34.02, 144.38888, 58.083)",1,"(70.944, 47.61504, 144.38888, 58.083)",12.00,9.96,Page 13 of 209 \n,70.944,34.02000,144.38888,58.08300,70.944,47.61504,144.38888,58.08300
565,13,19,"(70.944, 34.02, 144.38888, 58.083)",2,"(70.944, 34.02, 74.28, 46.02)",9.96,,\n,70.944,34.02000,144.38888,58.08300,70.944,34.02000,74.28000,46.02000


## Tryout Dataframe operations

In [27]:
# Small toy frame for trying out pandas operations: integers, tuples,
# booleans and a two-valued string column to group on.
frame = pd.DataFrame(
    {
        'a': [1, 2, 3, 4],
        'b': [(1, 2), (2, 2), (2, 1), (1, 1)],
        'c': [False, False, True, True],
        'd': [10, 10, 12, 12],
        'e': ['a', 'b', 'a', 'b'],
    }
)

In [28]:
# Display the toy frame.
frame

Unnamed: 0,a,b,c,d,e
0,1,"(1, 2)",False,10,a
1,2,"(2, 2)",False,10,b
2,3,"(2, 1)",True,12,a
3,4,"(1, 1)",True,12,b


In [29]:
# Sort the tuple column in descending order. Tuples compare element by
# element, so rows are ordered by the first item and then by the second.
frame['b'].sort_values(ascending = False)

1    (2, 2)
2    (2, 1)
0    (1, 2)
3    (1, 1)
Name: b, dtype: object

In [32]:
# Group the values of column 'a' by the labels in column 'e'. This only
# creates the SeriesGroupBy object; nothing is aggregated yet.
grp = frame['a'].groupby(frame['e'])

<pandas.core.groupby.generic.SeriesGroupBy object at 0x1105e9250>

In [40]:
# Split the 2-tuples in column 'b' into two separate columns. The helper frame
# is built on frame.index because the column-wise assignment aligns rows by
# index label; with a fresh 0..n-1 index the values would be misaligned or NaN
# whenever `frame` does not carry the default index.
frame[['x0_bl', 'y0_bl']] = pd.DataFrame(frame['b'].tolist(), index=frame.index)

In [53]:
# Number of non-NA values in column 'a'.
frame['a'].count()

4

In [54]:
# For each label in column 'e', count the non-NA values in every other column.
frame.groupby('e').count()

Unnamed: 0_level_0,a,b,c,d,x0_bl,y0_bl
e,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
a,2,2,2,2,2,2
b,2,2,2,2,2,2
