# Imports

In [119]:
import re
import os
import pandas as pd
from pprint import pprint
import time

import utility.utility as util
from utility.Extractor import PageNumberExtractor, PageTextExtractor
import utility.text_cleaning as tc
import utility.extractor_meta as em
from datetime import datetime

# Progress Bars:
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

# Below import and instructions simply for display
from IPython.display import display, HTML, clear_output
display(HTML("<style>.container { width:100% !important; }</style>"))

# resets import once changes have been applied
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Paths

In [2]:
path_cwd = os.getcwd()
path_data = os.path.join(path_cwd, 'data')
path_raw_pdf = os.path.join(path_data, 'raw_pdf_files')
path_raw_txt = os.path.join(path_data, 'raw_text_files')
path_output = os.path.join(path_cwd, 'output')

## File paths

In [3]:
paths_pdf_files = {file.split('.')[0]: os.path.join(path_raw_pdf, file) for file in os.listdir(path_raw_pdf) if file != '.gitkeep'}
ids = list(paths_pdf_files.keys())

# Prepare Sections Anchors

In [129]:
section_anchors = {'notes': em._notes_sections, 'auditor': em._auditor_sections}
processed_section_anchors = util.process_section_anchors(section_anchors)

# Create Extractors

In [130]:
page_num_extractors = {k: PageNumberExtractor(k, v, processed_section_anchors, 0.5, False, False, False) for k, v in tqdm(paths_pdf_files.items())}

100%|███████████████████████████████████████████████████████████| 183/183 [00:00<00:00, 91604.92it/s]


In [131]:
results = []
for id in tqdm(page_num_extractors):
    e = page_num_extractors[id]
    results.append(e.run())
    del e
result_df = pd.DataFrame(results)

100%|██████████████████████████████████████████████████████████████| 183/183 [05:48<00:00,  1.91s/it]


In [132]:
test_df = pd.read_excel(os.path.join(path_data, 'testing_meta', 'sample_8april.xlsx'))
test_df = test_df['doc_id audit_std_page notes_std_page done'.split()].copy()
test_df['audit_std_page'] = pd.to_numeric(test_df['audit_std_page'], errors='coerce').astype('Int64') - 1 
test_df['notes_std_page'] = pd.to_numeric(test_df['notes_std_page'], errors='coerce').astype('Int64') - 1
test_df['doc_id'] = test_df['doc_id'].astype(str)
test_df

Unnamed: 0,doc_id,audit_std_page,notes_std_page,done
0,23032471,55,60,Hala
1,23451950,9,18,Hala
2,23855392,19,22,Ulvi
3,60297290,32,38,Ulvi
4,60490595,43,52,Ulvi
...,...,...,...,...
178,62865456,47,46,Sara
179,62905629,57,66,Sara
180,62953988,196,,Sara
181,62972987,2,9,Sara


In [133]:
merged = pd.merge(result_df, test_df, on='doc_id', how='left')


In [134]:
merged['hit_notes'] = merged.apply(lambda x: (x.notes_std_page in x.notes) if pd.notna(x.notes_std_page) else False, axis=1)
merged['hit_audit'] = merged.apply(lambda x: (x.audit_std_page in x.auditor) if pd.notna(x.audit_std_page) else False, axis=1)

In [135]:
def calc_acc(df, section):
    mask_True = df[f'hit_{section}'] == True
    hits = df[mask_True].count()
    mask_valid = df[f'hit_{section}'].isin([True, False])
    valid =df[mask_valid].count()
    return hits/valid

print('notes: ',calc_acc(merged, 'notes'))
print('auditor: ',calc_acc(merged, 'audit'))

notes:  doc_id            0.825137
doc_num_pages     0.825137
notes             0.825137
auditor           0.825137
audit_std_page    0.823204
notes_std_page    0.883041
done              0.825137
hit_notes         0.825137
hit_audit         0.825137
dtype: float64
auditor:  doc_id            0.803279
doc_num_pages     0.803279
notes             0.803279
auditor           0.803279
audit_std_page    0.812155
notes_std_page    0.812865
done              0.803279
hit_notes         0.803279
hit_audit         0.803279
dtype: float64


# Save Found Page Numbers

In [53]:
curr_time = datetime.now().strftime("%Y_%m_%d_%H_%M")
result_df.to_parquet(os.path.join(path_output, f"page_nums_{curr_time}.parquet"), index=False)

# Extract Pages and prep text

## Load Meta Page File

In [54]:
file = "page_nums_2024_04_08_15_10.parquet"
sections = ['auditor', 'notes']
meta_df = pd.read_parquet(os.path.join(path_output, file))

In [55]:
meta_df['path_doc'] = meta_df.doc_id.apply(lambda x: os.path.join(path_raw_pdf, f"{x}.pdf"))

In [56]:
meta_ids = meta_df['doc_id'].values

In [59]:
results = {}
for index, row in tqdm(meta_df.iterrows()):
    path = row.path_doc
    id = row.doc_id
    for section in sections:
        sections_pages = row[section]
        key = tuple([id, section])
        results[key] = PageTextExtractor(id, path, section, sections_pages, True, processed_section_anchors[section], 20, 400).run()
        meta_df.loc[meta_df.doc_id == id,f"{section}_num_tokens"] = results[key][1]

183it [01:02,  2.95it/s]


In [60]:
cols_num_tokens = [col for col in meta_df.columns if "num_tokens" in col]
if 'total_num_tokens' in meta_df.columns:
    meta_df['total_num_tokens'] = 0
meta_df['total_num_tokens'] = meta_df[cols_num_tokens].sum(axis=1)

In [61]:
meta_df

Unnamed: 0,doc_id,doc_num_pages,notes,auditor,path_doc,auditor_num_tokens,notes_num_tokens,total_num_tokens
0,21175204,15,"[6, 12]",[],C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,0.0,162.0,162.0
1,22399065,50,"[20, 21, 29, 32]",[25],C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,235.0,151.0,386.0
2,22415158,48,"[8, 31]","[24, 25]",C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,433.0,216.0,649.0
3,22430067,51,"[7, 21]",[15],C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,173.0,280.0,453.0
4,22430199,116,"[22, 50, 54, 62, 63, 64]","[55, 56]",C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,496.0,728.0,1224.0
...,...,...,...,...,...,...,...,...
178,64398711,57,"[41, 53]","[28, 29, 43, 44]",C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,1257.0,244.0,1501.0
179,64481834,215,"[66, 67, 68, 70, 71, 84, 87, 89, 153, 157, 158]",[155],C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,200.0,1529.0,1729.0
180,64494221,174,"[96, 98, 111, 120]",[],C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,0.0,647.0,647.0
181,64494258,169,"[69, 70, 71, 72, 79, 80, 81, 83, 84, 85, 86, 8...","[39, 68]",C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,481.0,3498.0,3979.0


In [62]:
meta_df.total_num_tokens.mean()

754.103825136612

In [72]:
merged

Unnamed: 0,doc_id,doc_num_pages,notes,auditor,audit_std_page,notes_std_page,hit_notes,hit_audit
0,21175204,15,"[6, 12]",[],2,6,True,False
1,22399065,50,"[20, 21, 29, 32]",[25],25,29,True,True
2,22415158,48,"[8, 31]","[24, 25]",24,31,True,True
3,22430067,51,"[7, 21]",[15],16,21,True,False
4,22430199,116,"[22, 50, 54, 62, 63, 64]","[55, 56]",55,62,True,True
...,...,...,...,...,...,...,...,...
178,64398711,57,"[41, 53]","[28, 29, 43, 44]",28,41,True,True
179,64481834,215,"[66, 67, 68, 70, 71, 84, 87, 89, 153, 157, 158]",[155],155,66,True,True
180,64494221,174,"[96, 98, 111, 120]",[],96,111,True,False
181,64494258,169,"[69, 70, 71, 72, 79, 80, 81, 83, 84, 85, 86, 8...","[39, 68]",68,79,True,True


In [136]:
detm = merged[~merged.hit_audit | ~merged.hit_notes].copy()
mask = ~((detm.audit_std_page.isna() & detm.hit_notes) | (detm.notes_std_page.isna() & detm.hit_audit))

In [137]:
detm[mask].reset_index(drop=True).to_csv(os.path.

Unnamed: 0,doc_id,doc_num_pages,notes,auditor,audit_std_page,notes_std_page,done,hit_notes,hit_audit
0,22430067,51,"[7, 21]",[15],16,21.0,Hala,True,False
1,23046354,108,"[30, 49, 50, 56, 64, 65, 67]","[57, 58]",55,64.0,Hala,True,False
2,23451950,63,"[3, 9, 38, 57]",[4],9,18.0,Hala,False,False
3,23625089,134,"[6, 57, 58, 65, 66, 77]",[67],67,74.0,Hala,False,True
4,60324060,77,"[27, 47]",[21],21,70.0,Hala,False,True
5,60358974,61,[33],[24],25,33.0,Hala,True,False
6,60359565,69,[36],[26],27,36.0,Hala,True,False
7,60359578,72,[38],"[29, 30]",27,38.0,Hala,True,False
8,60359654,69,[29],[21],23,29.0,Hala,True,False
9,60661954,66,"[9, 36]",[30],31,36.0,Ulvi,True,False


In [110]:
id = '22430067'
path = paths_pdf_files[id]
PageNumberExtractor(id, path, processed_section_anchors, 0.5, False, False, False).run()

notes
0 set()
re.compile('financi statement .*? prepar .*? with')
1 {2}
2 set()
3 set()
4 set()
5 set()
6 set()
re.compile('financi report standard')
re.compile('financi statement .*? prepar .*? with')
7 {1, 2}
8 set()
9 set()
10 set()
11 set()
12 set()
13 set()
14 set()
re.compile('signific account polici')
re.compile('financi statement .*? prepar .*? with')
15 {1, 2}
16 set()
['']
17 set()
18 set()
19 set()
20 set()
re.compile('basi of present')
re.compile('financi report standard')
re.compile('financi statement .*? prepar .*? with')
21 {0, 1, 2}
re.compile('signific account polici')
22 {1}
23 set()
24 set()
25 set()
26 set()
27 set()
28 set()
29 set()
30 set()
re.compile('financi .*? in accord with')
31 {2}
32 set()
33 set()
34 set()
35 set()
36 set()
37 set()
38 set()
39 set()
40 set()
41 set()
42 set()
43 set()
44 set()
45 set()
46 set()
47 set()
48 set()
49 set()
50 set()
auditor
0 set()
1 set()
2 set()
3 set()
4 set()
5 set()
6 set()
7 set()
8 set()
9 set()
10 set()
11 set()
12 

{'doc_id': '22430067', 'doc_num_pages': 51, 'notes': [7, 21], 'auditor': [15]}