# Imports

In [1]:
import re
import os
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint
import time

import utility.utility as util
from utility.Extractor import PageNumberExtractor, PageTextExtractor, TermExtractor
import utility.text_cleaning as tc
import utility.extractor_meta as em
from datetime import datetime

# Progress Bars:
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

# The import and the CSS override below only widen the notebook display
from IPython.display import display, HTML, clear_output
display(HTML("<style>.container { width:100% !important; }</style>"))

# Automatically reload imported modules whenever their source files change
%load_ext autoreload
%autoreload 2

# Paths

In [131]:
# Directory layout, resolved relative to the current working directory.
path_cwd = os.getcwd()
path_data = os.path.join(path_cwd, 'data')
# All three data sub-directories live directly below `path_data`.
path_raw_pdf, path_input_meta, path_testing_meta = (
    os.path.join(path_data, subdir)
    for subdir in ('raw_pdf_files', 'input_meta', 'testing_meta')
)

# Load Meta

In [146]:
# Input metadata: the CSV lists each document's `doc_id` together with its `doc_path`.
input_meta_file = os.path.join(path_input_meta, 'full_test.csv')
meta_input = pd.read_csv(input_meta_file)
meta_input

Unnamed: 0,doc_id,doc_path
0,21175204,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...
1,22399065,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...
2,22415158,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...
3,22430067,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...
4,22430199,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...
...,...,...
546,63471748,L:/pi_documents/documents_raw/pdf_navigator/63...
547,63688600,L:/pi_documents/documents_raw/pdf_navigator/63...
548,65265662,L:/pi_documents/documents_raw/acrobat_pdf_docu...
549,64651583,L:/pi_documents/documents_raw/pdf_navigator/64...


In [147]:
# Reference metadata used for evaluation: `audit_std_page` / `notes_std_page` per `doc_id`.
testing_meta_file = os.path.join(path_testing_meta, 'full_test_meta.csv')
testing_meta = pd.read_csv(testing_meta_file)
testing_meta

Unnamed: 0,doc_id,audit_std_page,notes_std_page,done
0,23032471,55,60,Hala
1,23451950,4,9,Hala
2,23855392,19,22,Ulvi
3,60297290,32,38,Ulvi
4,60490595,43,52,Ulvi
...,...,...,...,...
546,63471748,145,160,Ulvi
547,63688600,93,104,Ulvi
548,65265662,125,,Ulvi
549,64651583,144,152,Ulvi


# Test Page Number Extractor

In [158]:
# Map each section name to its anchor definitions from `utility.extractor_meta`,
# then pre-process them with `util.process_section_anchors` for the page-number extractor.
section_anchors = dict(notes=em._notes_sections, auditor=em._auditor_sections)
processed_section_anchors = util.process_section_anchors(section_anchors)

### Single Test

In [159]:
# Run the page-number extractor on one hand-picked document.
doc_id = 22399065
doc_path = meta_input.loc[meta_input["doc_id"] == doc_id, "doc_path"].iloc[0]

# Extractor configuration, collected in one place before the call.
single_run_settings = dict(
    section_anchors=processed_section_anchors,
    min_anchor_hit_ratio=0.5,
    flag_only_max_hits=False,
    flag_allow_overlapping_sections=False,
    flag_adjust_real_page_num=False,
    flag_do_ocr=False,
    flag_allow_duplicate_hits_in_groups=True,
    sections_with_page_skip_groups=['auditor'],
    thresh_ocr=100,
)
res = PageNumberExtractor(doc_id=doc_id, path=doc_path, **single_run_settings).run()

res

{'doc_id': 22399065,
 'doc_path': 'C:\\Users\\ilias\\Desktop\\UniMaResearch2023\\ExtractSectionsFinstmts\\data\\raw_pdf_files\\22399065.pdf',
 'doc_num_pages': 50,
 'notes': [18, 19, 21, 29, 30, 31, 32],
 'auditor': [25]}

### Loop Test

In [164]:
# The extractor settings are the same for every document, so build them once
# instead of repeating them inside the loop.
page_extractor_settings = dict(
    section_anchors=processed_section_anchors,
    min_anchor_hit_ratio=0.5,
    flag_only_max_hits=False,
    flag_allow_overlapping_sections=False,
    flag_adjust_real_page_num=False,
    flag_do_ocr=False,
    flag_allow_duplicate_hits_in_groups=True,
    sections_with_page_skip_groups=['auditor'],
    thresh_ocr=100,
)

results = []
# iterrows() is a generator without a length; passing `total` lets tqdm show
# percentage and ETA instead of a bare iteration counter.
for index, row in tqdm(meta_input.iterrows(), total=len(meta_input)):
    doc_id = row.doc_id
    doc_path = row.doc_path

    res = PageNumberExtractor(doc_id=doc_id,
                              path=doc_path,
                              **page_extractor_settings).run()

    results.append(res)

# One row per document, built in a single step from the collected result dicts.
result_df = pd.DataFrame(results)

10it [00:24,  2.48s/it]


In [166]:
# Show the per-document results collected by the loop test above.
result_df

Unnamed: 0,doc_id,doc_path,doc_num_pages,notes,auditor
0,21175204,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,15,"[6, 9, 12]",[2]
1,22399065,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,50,"[18, 19, 21, 29, 30, 31, 32]",[25]
2,22415158,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,48,"[8, 11, 12, 14, 31, 33, 23]","[24, 25]"
3,22430067,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,51,"[1, 6, 7, 21, 22, 31, 16]",[15]
4,22430199,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,116,"[22, 47, 49, 50, 62, 63, 64, 65, 66, 67, 68, 6...","[52, 53, 54, 55, 56]"
5,22434819,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,159,"[1, 7, 25, 26, 34, 37, 83, 84, 85, 86, 87, 88,...","[75, 76, 77]"
6,22439502,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,80,"[9, 14, 46, 51, 57, 76]","[38, 39, 40]"
7,22454940,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,36,"[22, 23, 24, 11, 12, 13]","[14, 15, 16, 17]"
8,22519409,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,89,"[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 2...",[4]
9,22630769,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,80,"[6, 33, 35, 36, 39, 50, 53, 54]","[42, 43, 44]"


## Accuracy of the Page Number Extractor

In [176]:
# Attach the reference pages from `testing_meta` to every extracted document.
merged = result_df.merge(testing_meta, how='left', on='doc_id')

# Reference pages become nullable integers; unparsable values turn into <NA>.
for page_col in ('audit_std_page', 'notes_std_page'):
    merged[page_col] = pd.to_numeric(merged[page_col], errors='coerce').astype('Int64')

# A hit means the reference page is among the extracted pages of that section;
# rows without a reference page are flagged False.
for hit_col, page_col, pages_col in (('hit_notes', 'notes_std_page', 'notes'),
                                     ('hit_audit', 'audit_std_page', 'auditor')):
    merged[hit_col] = merged.apply(
        lambda row: (row[page_col] in row[pages_col]) if pd.notna(row[page_col]) else False,
        axis=1)

def calc_acc(df, section):
    """Return the share of rows flagged as hits for ``section``.

    Reads the column ``hit_<section>`` of ``df``.

    Returns a one-element Series indexed by that column name, holding
    (number of rows equal to True) / (number of rows equal to True or False).
    """
    hit_col = f'hit_{section}'
    flags = df[[hit_col]].copy()
    is_hit = flags[hit_col].eq(True)
    is_valid = flags[hit_col].isin([True, False])
    return flags.loc[is_hit].count() / flags.loc[is_valid].count()

# Report the hit rate per section (calc_acc returns a one-element Series).
print('notes: ',calc_acc(merged, 'notes'))
print('auditor: ',calc_acc(merged, 'audit'))

# Disabled: persist the page-number results with a timestamped file name.
# Kept as real comments instead of a triple-quoted string, because a bare string
# literal is an expression statement and becomes the value of the cell.
# TODO: `path_output` is not defined in the visible cells; define it before enabling this.
# curr_time = datetime.now().strftime("%Y_%m_%d_%H_%M")
# result_df.to_parquet(os.path.join(path_output, f"page_nums_{curr_time}.parquet"), index=False)

notes:  hit_notes    1.0
dtype: float64
auditor:  hit_audit    0.909091
dtype: float64


### Missing Page Numbers

In [204]:
# Rows where at least one of the two reference page numbers is missing.
reference_page_cols = ['audit_std_page', 'notes_std_page']
merged[merged[reference_page_cols].isna().any(axis=1)]

Unnamed: 0,doc_id,doc_path,doc_num_pages,notes,auditor,audit_std_page,notes_std_page,done,hit_notes,hit_audit


### Wrong Page

In [203]:
# Rows where not both sections were hit, i.e. at least one of the two flags is False.
merged.loc[~(merged['hit_audit'] & merged['hit_notes'])]

Unnamed: 0,doc_id,doc_path,doc_num_pages,notes,auditor,audit_std_page,notes_std_page,done,hit_notes,hit_audit
3,22430067,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,51,"[1, 6, 7, 21, 22, 31, 16]",[15],16,21,Hala,True,False


# Test Text Extractor

In [180]:
# Map each section name to its extraction anchors from `utility.extractor_meta`,
# then pre-process them with `util.process_section_anchors`.
extraction_anchors = dict(notes=em._notes_standards, auditor=em._auditor_standards)
processed_extraction_anchors = util.process_section_anchors(extraction_anchors)

In [182]:
# Work on a copy of result_df: the text-extraction loop test below adds its
# output columns to this frame.
meta_text_extraction = result_df.copy()

### Single Test

In [200]:
# Run the text extractor on one hand-picked document, once per section.
doc_id = 21175204
# Fetch the document's row once; path and page lists are read from it below.
doc_row = meta_text_extraction.loc[meta_text_extraction.doc_id == doc_id].iloc[0]
doc_path = doc_row["doc_path"]

res = {}

for section, anchor in processed_extraction_anchors.items():
    # Pages previously found for this section by the page-number extractor.
    section_pages = doc_row[section]
    text_extractor = PageTextExtractor(doc_id=doc_id,
                                       path=doc_path,
                                       section=section,
                                       page_nums=section_pages,
                                       flag_reduce=False,
                                       anchors=anchor,
                                       anchor_add_word_window=20,
                                       allowance_wildcards_reg_matches=400,
                                       flag_do_ocr=False)
    res[section] = text_extractor.run()

res

{'notes': ["CONIAGAS RESOURCES LIMITED NOTES TO CONSOLIDATED INTERIM FINANCIAL STATEMENTS December 31 2005 1. Significant accounting policies These consolidated financial statements have been prepared in accordance with accounting principles generally accepted in Canada. a Principles of consolidation These consolidated financial statements include the accounts of the Company and its subsidiary St. Barbera Resources Limited. b Mining properties and deferred exploration expenditures The Company records its mining properties at cost less amounts written down and capitalizes exploration expenditures until such time as they are depleted against production from the property to which they apply. If exploration is deemed unsuccessful if the mineral properties are abandoned or if other developments which negate development occur the applicable costs are written off. c Translation of foreign currency Monetary assets and liabilities are translated into Canadian dollars at the rate of exchange pre

### Loop Test

In [201]:
# Create the output columns once, before iterating (only if they do not exist yet),
# instead of re-checking for every row. `*_terms` and `*_text` are object columns so a
# single cell can hold an arbitrary Python object; `*_num_tokens` starts as float NaN.
for section in processed_extraction_anchors:
    for column, dtype, fill_value in ((f"{section}_terms", 'object', None),
                                      (f"{section}_num_tokens", 'float64', float('nan')),
                                      (f"{section}_text", 'object', None)):
        if column not in meta_text_extraction.columns:
            meta_text_extraction[column] = pd.Series([fill_value] * len(meta_text_extraction),
                                                     index=meta_text_extraction.index,
                                                     dtype=dtype)

# iterrows() has no length; `total` lets tqdm show percentage and ETA.
for index, row in tqdm(meta_text_extraction.iterrows(), total=len(meta_text_extraction)):
    doc_id = row.doc_id
    doc_path = row.doc_path
    for section, anchors in processed_extraction_anchors.items():
        section_pages = row[section]

        res = PageTextExtractor(doc_id = doc_id,
                                path = doc_path,
                                section = section,
                                page_nums = section_pages,
                                flag_reduce = False,
                                anchors = anchors,
                                anchor_add_word_window = 20,
                                allowance_wildcards_reg_matches = 400,
                                flag_do_ocr = False).run()

        # Write all three results to the current row by index. Selecting by a doc_id mask
        # would overwrite every row sharing that doc_id and disagree with the per-row
        # write of the terms.
        # res is indexed as [0] -> text, [1] -> token count, [2] -> terms.
        meta_text_extraction.at[index, f"{section}_num_tokens"] = res[1]
        meta_text_extraction.at[index, f"{section}_text"] = res[0]
        meta_text_extraction.at[index, f"{section}_terms"] = res[2]

# aggregate token estimate
# Exclude an already existing total column so that re-running the cell does not add the
# previous total into the new one.
cols_num_tokens = [col for col in meta_text_extraction.columns
                   if "num_tokens" in col and col != 'total_num_tokens']
meta_text_extraction['total_num_tokens'] = meta_text_extraction[cols_num_tokens].sum(axis=1)

11it [00:02,  4.73it/s]


### Estimate Number of Tokens

In [202]:
# Summary statistics of the token counts per section and in total.
token_count_cols = ['auditor_num_tokens', 'notes_num_tokens', 'total_num_tokens']
meta_text_extraction[token_count_cols].describe()

Unnamed: 0,auditor_num_tokens,notes_num_tokens,total_num_tokens
count,11.0,11.0,11.0
mean,1378.363636,6685.727273,8064.090909
std,903.136565,3150.242533,3634.535334
min,260.0,1673.0,1933.0
25%,569.0,4468.5,5890.5
50%,1507.0,6322.0,7835.0
75%,1859.0,8907.0,9960.0
max,2924.0,11382.0,14306.0


# Test Term Extractor

In [198]:
# Map each section name to its extraction anchors from `utility.extractor_meta`,
# then pre-process them with `util.process_section_anchors` for the term extractor.
extraction_anchors = dict(notes=em._notes_standards, auditor=em._auditor_standards)
processed_extraction_anchors = util.process_section_anchors(extraction_anchors)

In [217]:
# Work on a copy of result_df: the term-extraction loop test below adds its
# output columns to this frame.
meta_term_extraction = result_df.copy()

## Single Test

In [207]:
# Run the term extractor on one hand-picked document, once per section.
doc_id = 21175204
# Fetch the document's row once; path and page lists are read from it below.
doc_row = meta_term_extraction.loc[meta_term_extraction.doc_id == doc_id].iloc[0]
doc_path = doc_row["doc_path"]

res = {}

for section, anchors in processed_extraction_anchors.items():
    # Pages previously found for this section by the page-number extractor.
    section_pages = doc_row[section]

    term_extractor = TermExtractor(doc_id=doc_id,
                                   path=doc_path,
                                   section=section,
                                   page_nums=section_pages,
                                   anchors=anchors,
                                   anchor_add_word_window=20,
                                   allowance_wildcards_reg_matches=400,
                                   flag_capture_surrounding_sentences=True,
                                   surrounding_sentences_margin=2,
                                   flag_do_ocr=False)
    res[section] = term_extractor.run()

# Print every section's raw result: name, blank line, result, separator.
for section, extracted in res.items():
    print(section, '', extracted, '---', sep='\n')

notes

({(0, 3): ['CONIAGAS RESOURCES LIMITED NOTES TO CONSOLIDATED INTERIM FINANCIAL STATEMENTS December 31 2005 1.', 'Significant accounting policies These consolidated financial statements have been prepared in accordance with accounting principles generally accepted in Canada.', 'a Principles of consolidation These consolidated financial statements include the accounts of the Company and its subsidiary St. Barbera Resources Limited.', 'b Mining properties and deferred exploration expenditures The Company records its mining properties at cost less amounts written down and capitalizes exploration expenditures until such time as they are depleted against production from the property to which they apply.'], (14, 18): ['Included in other investments are 500000 common shares of CEDOR COQ formerly Esplau Inc. MSE which are held in trust by Fiducie Desjardins Inc. as part of a compensating reserve against a possible future claim by third parties against CEDOR.', 'That company through its s

In [212]:
# Inspect the type of the first element of the 'auditor' entry in `res`
# (filled with TermExtractor(...).run() results by the single test above).
type(res['auditor'][0])

dict

## Loop Test

In [218]:
# Create the output columns once, before iterating (only if they do not exist yet),
# instead of repeating three copy-pasted checks for every row. The object columns can
# hold an arbitrary Python object per cell; `*_num_tokens` starts as float NaN.
for section in processed_extraction_anchors:
    for column, dtype, fill_value in ((f"{section}_terms", 'object', None),
                                      (f"{section}_sentences", 'object', None),
                                      (f"{section}_paragraphs", 'object', None),
                                      (f"{section}_num_tokens", 'float64', float('nan'))):
        if column not in meta_term_extraction.columns:
            meta_term_extraction[column] = pd.Series([fill_value] * len(meta_term_extraction),
                                                     index=meta_term_extraction.index,
                                                     dtype=dtype)

# iterrows() has no length; `total` lets tqdm show percentage and ETA.
for index, row in tqdm(meta_term_extraction.iterrows(), total=len(meta_term_extraction)):
    doc_id = row.doc_id
    doc_path = row.doc_path
    for section, anchors in processed_extraction_anchors.items():
        section_pages = row[section]

        res = TermExtractor(doc_id = doc_id,
                            path = doc_path,
                            section = section,
                            page_nums = section_pages,
                            anchors = anchors,
                            anchor_add_word_window = 20,
                            allowance_wildcards_reg_matches = 400,
                            flag_capture_surrounding_sentences = True,
                            surrounding_sentences_margin = 2,
                            flag_do_ocr = False).run()

        # res[0] maps an interval key to a list of sentences: join each list with spaces
        # and terminate every paragraph with a blank line.
        paragraphs = ''.join(' '.join(para) + '\n\n' for para in res[0].values())

        num_tokens = util.count_tokens(paragraphs)

        # Write every result to the current row by index. Selecting by a doc_id mask
        # would overwrite every row sharing that doc_id and disagree with the per-row
        # writes of sentences and terms.
        meta_term_extraction.at[index, f"{section}_num_tokens"] = num_tokens
        meta_term_extraction.at[index, f"{section}_paragraphs"] = paragraphs
        meta_term_extraction.at[index, f"{section}_sentences"] = res[1]
        meta_term_extraction.at[index, f"{section}_terms"] = res[2]

# aggregate token estimate
# Exclude an already existing total column so that re-running the cell does not add the
# previous total into the new one.
cols_num_tokens = [col for col in meta_term_extraction.columns
                   if "num_tokens" in col and col != 'total_num_tokens']
meta_term_extraction['total_num_tokens'] = meta_term_extraction[cols_num_tokens].sum(axis=1)

11it [00:03,  3.53it/s]


In [219]:
# Show the frame including the term-extraction columns added by the loop test above.
meta_term_extraction

Unnamed: 0,doc_id,doc_path,doc_num_pages,notes,auditor,notes_terms,notes_sentences,notes_paragraphs,notes_num_tokens,auditor_terms,auditor_sentences,auditor_paragraphs,auditor_num_tokens,total_num_tokens
0,21175204,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,15,"[6, 9, 12]",[2],{1: [' accounting principles generally accepte...,{1: 'Significant accounting policies These con...,CONIAGAS RESOURCES LIMITED NOTES TO CONSOLIDAT...,431.0,{7: [' generally accepted accounting principle...,{7: 'In my opinion these consolidated financia...,An audit includes examining on a test basis ev...,96.0,527.0
1,22399065,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,50,"[18, 19, 21, 29, 30, 31, 32]",[25],"{25: [' Canadian GAAP.'], 51: [' generally acc...",{25: 'Furthermore internal controls have been ...,The Company received conditional acceptance fo...,292.0,{11: [' Canadian generally accepted accounting...,{11: 'Opinion In our opinion the consolidated ...,An audit also includes evaluating the appropri...,181.0,473.0
2,22415158,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,48,"[8, 11, 12, 14, 31, 33, 23]","[24, 25]","{116: [' U.S. GAAP.'], 119: [' generally accep...",{116: 'We have also designed internal controls...,As of the end of the period covered by this re...,933.0,{10: [' generally accepted accounting principl...,{10: 'Independent Auditors Report of Registere...,An audit also includes evaluating the appropri...,243.0,1176.0
3,22430067,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,51,"[1, 6, 7, 21, 22, 31, 16]",[15],{3: [' Canadian generally accepted accounting ...,{3: 'The financial information in this MDA is ...,Formerly ATACAMA MINERALS CORP. MANAGEMENTS DI...,710.0,{},{},,0.0,710.0
4,22430199,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,116,"[22, 47, 49, 50, 62, 63, 64, 65, 66, 67, 68, 6...","[52, 53, 54, 55, 56]",{6: [' IFRS and IFRS 1 FirstTime Adoption of I...,{6: 'The consolidated financial statements for...,You should also read our audited consolidated ...,628.0,{58: [' International Financial Reporting Stan...,{58: 'Opinion In our opinion the consolidated ...,An audit also includes evaluating the appropri...,321.0,949.0
5,22434819,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,159,"[1, 7, 25, 26, 34, 37, 83, 84, 85, 86, 87, 88,...","[75, 76, 77]",{8: [' Canadian Generally Accepted Accounting ...,{8: 'Reconciliation The Consolidated Financial...,YES X NO Indicate by check mark whether the Re...,1209.0,{39: [' Canadian generally accepted accounting...,{39: 'Opinion In our opinion the consolidated ...,An audit also includes evaluating the appropri...,168.0,1377.0
6,22439502,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,80,"[9, 14, 46, 51, 57, 76]","[38, 39, 40]",{41: [' IFRS and International Financial Repor...,{41: 'The consolidated financial statements of...,BASIS OF PRESENTATION AND ADOPTION OF INTERNAT...,246.0,{},{},,0.0,246.0
7,22454940,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,36,"[22, 23, 24, 11, 12, 13]","[14, 15, 16, 17]",{78: [' Canadian generally accepted accounting...,{78: 'Significant accounting policies The cons...,21 INTERNATIONAL ROAD DYNAMICS INC. Notes to C...,165.0,{73: [' Canadian generally accepted accounting...,{73: 'Opinion In our opinion the consolidated ...,An audit also includes evaluating the appropri...,126.0,291.0
8,22519409,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,89,"[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 2...",[4],{15: [' IFRS as issued by the International Ac...,{15: 'Significant Accounting Policies Statemen...,The subsidiary is a specialty compound fertili...,256.0,{10: [' International Financial Reporting Stan...,{10: 'Opinion In our opinion these consolidate...,An audit also includes evaluating the appropri...,145.0,401.0
9,22630769,C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,80,"[6, 33, 35, 36, 39, 50, 53, 54]","[42, 43, 44]",{104: [' International Financial Reporting Sta...,{104: 'Significant accounting policies a State...,The Company is the worlds largest supplier of ...,171.0,{21: [' International Financial Reporting Stan...,{21: 'In our opinion the consolidated financia...,An audit also includes assessing the accountin...,277.0,448.0


In [221]:
# Summary statistics of the token counts per section and in total.
token_count_cols = ['auditor_num_tokens', 'notes_num_tokens', 'total_num_tokens']
meta_term_extraction[token_count_cols].describe()

Unnamed: 0,auditor_num_tokens,notes_num_tokens,total_num_tokens
count,11.0,11.0,11.0
mean,180.272727,504.0,684.272727
std,130.351901,338.087267,374.76475
min,0.0,165.0,246.0
25%,111.0,251.0,424.5
50%,168.0,431.0,527.0
75%,260.0,669.0,939.0
max,426.0,1209.0,1377.0
