# Imports

In [326]:
import re
import os
import pandas as pd
from pprint import pprint
import time

import utility.utility as util
from utility.Extractor import PageNumberExtractor, PageTextExtractor
import utility.text_cleaning as tc
import utility.extractor_meta as em
from datetime import datetime

# Progress bars: `tqdm` wraps the iterables of the loops in this notebook.
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
# Hooks tqdm into pandas (tqdm's documented `progress_apply` integration).
# NOTE(review): no `progress_apply` call is visible in the cells shown here - confirm it is still needed.
tqdm_notebook.pandas()

# Display helpers only: the injected CSS sets the notebook container to the full browser width.
from IPython.display import display, HTML, clear_output
display(HTML("<style>.container { width:100% !important; }</style>"))

# resets import once changes have been applied
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Paths

In [327]:
# Directory layout, rooted at the notebook's working directory:
#   <cwd>/data/raw_pdf_files, <cwd>/data/raw_text_files and <cwd>/output
path_cwd = os.getcwd()
path_data, path_output = (os.path.join(path_cwd, name) for name in ('data', 'output'))
path_raw_pdf, path_raw_txt = (
    os.path.join(path_data, name) for name in ('raw_pdf_files', 'raw_text_files')
)

## File paths

In [328]:
# Map document id (file name without its extension) -> full path of the PDF.
# - `os.path.splitext` strips only the final extension, so a name such as "a.b.pdf"
#   keeps the id "a.b" instead of being truncated at the first dot.
# - Only *.pdf entries are kept, so placeholders (.gitkeep) and other stray files
#   are never handed to the extractors.
# - The listing is sorted because `os.listdir` returns entries in arbitrary order.
paths_pdf_files = {
    os.path.splitext(file_name)[0]: os.path.join(path_raw_pdf, file_name)
    for file_name in sorted(os.listdir(path_raw_pdf))
    if file_name.lower().endswith('.pdf')
}
ids = list(paths_pdf_files.keys())

# Prepare Section Anchors

In [329]:
# Raw anchor definitions per report section, taken from the extractor metadata module,
# then passed through the project helper whose result is handed to the extractors.
section_anchors = dict(notes=em._notes_sections, auditor=em._auditor_sections)
processed_section_anchors = util.process_section_anchors(section_anchors)

# Create Extractors

In [330]:
# One PageNumberExtractor per document, keyed by document id.
# NOTE(review): the trailing positional arguments (0.5, False, False, False) are passed
# through unchanged; their meaning is defined in utility.Extractor - consider keyword arguments.
page_num_extractors = {
    doc_id: PageNumberExtractor(doc_id, pdf_path, processed_section_anchors, 0.5, False, False, False)
    for doc_id, pdf_path in tqdm(paths_pdf_files.items())
}

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [00:00<?, ?it/s]


In [331]:
# Run every page-number extractor and collect what each `run()` returns, in dict order.
# Iterating over the extractors directly avoids binding the name `id`, which would
# shadow the built-in `id()` for the rest of the kernel session. The former `del e`
# is dropped: it only unbound the loop variable, the dict still holds each extractor.
results = [extractor.run() for extractor in tqdm(page_num_extractors.values())]
result_df = pd.DataFrame(results)

100%|███████████████████████████████████████████████████████████████████████████████████████| 17/17 [00:47<00:00,  2.79s/it]


In [332]:
# Inspect the page numbers found per section, as collected from the extractors.
result_df

Unnamed: 0,doc_id,doc_num_pages,notes,auditor
0,200018069,29,[],[10]
1,22032523,132,"[18, 50, 74, 81, 127]","[75, 76]"
2,60665866,221,"[156, 200]",[]
3,60784900,92,"[25, 26, 43, 44, 45, 46, 51, 52, 81]",[85]
4,60816306,176,"[33, 94, 95, 96, 97, 150, 151, 165]","[161, 162, 170]"
5,61029873,212,"[1, 3, 111, 190, 191]",[201]
6,61178281,130,"[23, 40, 50, 51]","[42, 43]"
7,61466756,80,[24],[19]
8,61635511,489,"[34, 252, 297, 305, 315, 425]",[307]
9,61681491,256,"[3, 98, 156, 166, 168]",[159]


# Save Found Page Numbers

In [492]:
# Persist the page numbers; the minute-resolution timestamp in the file name keeps
# runs saved in different minutes in separate files.
curr_time = datetime.now().strftime("%Y_%m_%d_%H_%M")
result_df.to_parquet(
    os.path.join(path_output, f"page_nums_{curr_time}.parquet"),
    index=False,
)

# Extract Pages and Prepare Text

## Load Meta Page File

In [533]:
# Sections whose pages get extracted, and the page-number file to read them from.
# NOTE(review): the file name appears to be pinned to one specific earlier save rather
# than the most recent one - update it when the page numbers are regenerated.
sections = ['auditor', 'notes']
file = "page_nums_2024_04_07_03_31.parquet"
meta_df = pd.read_parquet(os.path.join(path_output, file))

In [534]:
# Rebuild each document's PDF location from its id: <path_raw_pdf>/<doc_id>.pdf
meta_df['path_doc'] = meta_df['doc_id'].map(lambda doc_id: os.path.join(path_raw_pdf, f"{doc_id}.pdf"))

In [535]:
# Document ids of the loaded meta frame, as returned by the column's `.values`.
meta_ids = meta_df.doc_id.values

In [540]:
# Extract the text of every section of every document and record its token count.
# `results` maps (doc_id, section) -> whatever PageTextExtractor.run() returns;
# element [1] of that return value is stored in the "<section>_num_tokens" column.
# NOTE(review): the positional arguments True, 20 and 400 are passed through as-is;
# their meaning is defined in utility.Extractor - confirm there.
results = {}
# iterrows() is a generator without a length, so the total is passed explicitly to get
# a real progress bar instead of a bare iteration counter.
for index, row in tqdm(meta_df.iterrows(), total=len(meta_df)):
    # Named `doc_id` rather than `id` so the built-in id() is not shadowed in the kernel.
    doc_id = row.doc_id
    path = row.path_doc
    for section in sections:
        print(section)
        sections_pages = row[section]
        key = (doc_id, section)
        results[key] = PageTextExtractor(doc_id, path, section, sections_pages, True, processed_section_anchors[section], 20, 400).run()
        # Address the current row by its index label instead of rebuilding a boolean
        # mask over the whole frame for every section.
        # Assumes unique index labels (the default index of a freshly read frame).
        meta_df.loc[index, f"{section}_num_tokens"] = results[key][1]
    print('-----')

1it [00:00,  8.59it/s]

auditor
INDEPENDENT AUDITORS REPORT 9 To the shareholder of BHJ AS Opinion We have audited the financial statements of BHJ AS for the financial year January – which comprise income statement balance sheet statement of changes
In our opinion the financial statements give a true and fair view of the financial position of the Company at and of the results of the Company's operations for the financial year January – in accordance with the Danish Financial Statements Act. Basis for opinion We conducted our audit in accordance with International Standards on Auditing ISAs and additional requirements applicable in Denmark. Our responsibilities under those standards and requirements are further described in the Auditor's responsibilities for the audit of the financial statements section of our report. We believe that the audit evidence we have obtained is sufficient and appropriate to provide a basis for our opinion. Independence We are independent of the Company in accordance with the Interna

3it [00:00,  3.62it/s]

prepared our consolidated financial statements in accordance with Canadian GAAP and provided reconciliations to United States US GAAP. In the Canadian Accounting Standards Board AcSB published a new
Financial Reporting Standards IFRS over an expected fiveyear transitional period. In February the AcSB confirmed that IFRS would be mandatory in Canada for
financial statements. Our audited consolidated financial statements for the year ended March including related notes and this MDA have therefore been prepared based on US GAAP. All comparative figures contained in these documents have been restated to reflect our results as if they had been historically reported in accordance with US GAAP as our reporting standard. All financial statements and MDAs previously filed were prepared under Canadian GAAP as our reporting standard. As required by National Instrument for the fiscal year of adoption of US GAAP and one subsequent fiscal year we will provide a Canadian Supplement to our MDA that re

4it [00:01,  2.69it/s]

Significant accounting policies Accounting estimates and judgements Segment reporting Other income Significant expenses Unusual items Auditors remuneration Net financing costs Income tax expense
SIGNIFICANT ACCOUNTING POLICIES Campbell Brothers Limited the Company is a company domiciled in Australia. The consolidated financial report of the Company for the year ended comprises the Company and its subsidiaries together referred to as the consolidated entity and the consolidated entitys interest in associates and jointly controlled entities. The financial report was authorised for issue by the directors on a Statement of compliance The financial report is a general purpose financial report which has been prepared in accordance with Australian Accounting Standards AASBs adopted by the Australian Accounting Standards Board AASB and the Corporations Act International Financial Reporting Standards IFRSs form the basis of Australian Accounting Standards AASBs adopted by the AASB and for the p

5it [00:02,  1.60it/s]

statement of compliance with the relevant areas of the Code of Conduct every six months. People Brambles employment policies commit the organisation to
Basis of preparation These financial statements present the consolidated results of Brambles Industries Limited ABN BIL and Brambles Industries plc registered number BIP.
Significant accounting policies BASIS OF ACCOUNTING These financial statements are a general purpose financial report. The financial statements have been prepared in accordance with International Financial Reporting Standards as adopted for use in the European Union IFRS and Australian Equivalents to International Financial Reporting Standards AIFRS and in accordance with the requirements of the Corporations Act and with those parts of the Companies Act applicable to companies reporting under IFRS.
financial statements reference is made to IFRS which should be read to include AIFRS. Accounting policies have been selected to ensure concurrent compliance with both IFRS a

6it [00:02,  1.78it/s]

Financial liabilities in accordance with consolidated balance sheet Net debt a c Additions to intangible assets including goodwill and property plant and equipment Cash flows
Financial Reporting Standards IFRS. They should not be viewed in isolation as an alternative to profit or loss from operations net profit or loss net cash from operating activities the net debt reported in the consolidated balance sheet or other Deutsche Telekom key performance indicators presented in accordance with IFRS. For detailed information and calculations please refer to the section on Development of business in the Group of the
financial year Deutsche Telekom has defined free cash flow as cash generated from operations less interest paid and net cash outflows for investments in intangible assets excluding goodwill and property plant and equipment. Prioryear figures have been adjusted accordingly. h Calculation of basic and diluted earnings per share in accordance with IFRS as specified in IAS Earnings pe

8it [00:03,  2.60it/s]

BASIS OF PREPARATION The consolidated financial statements of the Group have been prepared on a historical cost basis except for derivative financial instruments which are measured at fair value. The consolidated financial statements are presented in Philippine peso which is the Parent Companys functional currency under Philippine Financial Reporting Standards PFRS. All values are rounded to the nearest except when otherwise indicated. Statement of Compliance The consolidated financial statements have been prepared in compliance with PFRS. PFRS includes statements named PFRS and Philippine Accounting Standards PAS including Philippine interpretations from International Financial Reporting Interpretation Committee
Financial Reporting Standards Council FRSC. GINEBRA SANMIGUEL SINCE STATEM ENT OF MANAGEMENT'S RESPONSIBILITY FOR THE CONSO LIDATED FI NANCIAL STATEM EN TS The
financial statements for the years ended December and The conso lidated financia l statements have been prepared in c

9it [00:04,  2.10it/s]

Financial Reporting Standards As required by the Companies Act and Article of the European Union IAS Regulation the consolidated financial statements of the Group are prepared in accordance with International Financial Reporting Standards issued by the International Accounting Standards Board IASB and interpretations issued by the IFRS Interpretations Committee of the IASB as
statement of compliance with the UK Corporate Governance Code issued by the Financial Reporting Council in May the Code is set out on
Statement of compliance The company is committed to high standards of corporate governance business integrity and professionalism in all its activities. Throughout the
preparation of the Annual Report and Accounts. The directors are required by Article of the IAS Regulation European Commission Regulation No to prepare Group accounts and as permitted by the Companies Act have elected to prepare company accounts for each financial year in accordance with International Financial Report

10it [00:04,  2.30it/s]

Financial Reporting Standards IFRS the Carbon Disclosure Project CDP the international chemical industrys Responsible Care™ programme as well as AECIs own internal reporting
Financial Statements of the Company and its subsidiaries and for reporting their opinion on these Statements to shareholders. Furthermore the external auditors must determine whether the Annual Financial Statements are in accordance with the Act IFRS and the JSE Listings Requirements. Any necessary restatements of ﬁnancial information are identiﬁed and explained in the
Financial Statements. This Report for the ﬁnancial year ended is AECIs ﬁrst Integrated Report to stakeholders in accordance with the principles contained in the King Report on Governance for South Africa King III. The Company notes that its adoption
Financial Statements of the Company and its subsidiaries and for reporting their opinion on these Statements to shareholders. Furthermore the external auditors must determine whether the Annual Financial 

11it [00:05,  1.89it/s]

REPORT OF INDEPENDENT AUDITORS The Board of Directors and Stockholders Acesite Phils. Hotel Corporation and Subsidiary Room Manila Pavilion Hotel United Nations Avenue Ermita
We have audited the accompanying consolidated financial statements of Acesite Phi Is. Hotel Corporation and Subsidiary which comprise the consolidated statements of financial position
Management's Responsibility for the Consolidated Financial Statements Management is responsible for the preparation and fair presentation of these consolidated financial statements in accordance with Philippine Financial Reporting Standards and for such internal control as management determines
Our responsibility is to express an opinion on these consolidated financial statements based on our audits. We conducted our audits in accordance with Philippine Standards on Auditing. Those standards require that we comply with ethical requirements and plan and perform the audit to obtain reasonable assurance about whether the consolidated fi

12it [00:05,  2.01it/s]

financial statements. There have been no significant changes in the nature of these activities during the financial year. The consolidated financial statements of the Group and statement of financial position and statement of changes in equity of the Company for the financial year ended were authorised for issue by Board of Directors on 2. Summary of significant accounting policies Basis of preparation The financial statements have been drawn up in accordance with the provisions of the Singapore Companies Act Cap. and the Singapore Financial Reporting Standards FRS including related Interpretations of FRS INT FRS and are prepared on historical cost basis except as disclosed in the
financial statements of each Group entity are measured and presented in the currency of the primary economic environment in which the entity operates its functional currency. The consolidated financial statements of the Group and the statement of financial position of the Company are presented in Singapore do

14it [00:06,  2.26it/s]

STATEMENT OF COMPLIANCE WITH CORPORATE GOVERNANCE PRINCIPLES Turcas Petrol the Company is very keen on applying Corporate Governance Principles set by the Capital
2BASIS OF PRESENTATION OF CONSOLIDATED FINANCIAL STATEMENTS Basis of presentation Financial reporting standards The accompanying consolidated financial statements are prepared in accordance with Communiqué Serial II No14 Principles of Financial Reporting in Capital Markets the Communiqué published in the Official Gazette numbered on According to Article of the Communiqué consolidated financial statements are prepared in accordance with the Turkish Accounting Standards issued by Public Oversight Accounting and Auditing Standards Authority POAASA. TAS contains Turkish Accounting Standards Turkish
Financial Reporting Standards TFRS and its addendum and interpretations IFRIC. The financial statements of the consolidated financial statements of the Group are prepared as per the CMB announcement of relating to financial statements 

15it [00:07,  1.76it/s]

Financial Statements The independently audited comparative consolidated financial statements in comparison with previous periods financials prepared for the accounting period of in compliance with the Turkish Accounting Standards and the Turkish Financial Reporting Standards TAS and TFRS published by the Public Oversight Authority and the Turkish Accounting and Auditing Standards Board within the scope
financial statements. Limited within the scope of the information we have and we have been given our opinion relating to this financial statements were presented to the executives who have responsibility in the preparation of the financial statements. Within the framework of this opinion we have reached a conclusion that this financial statements truly reflects the facts regarding the
financial statements is as follows PK. Aliağa İzmir Basis of presentation of consolidated financial statements Basis of presentation The consolidated financial statements and disclosures have been prepared 

16it [00:07,  2.03it/s]

basis of accounting standard and guidance note issued by Institute of Chartered Accountant of India for this sector. iv. Risks and Concerns The
Significant Accounting Policies I Accounting Convention The financial statements are prepared under the historical cost convention in accordance with generally accepted accounting principles in India the Accounting Standards issued by The Institute of Chartered Accountants of India and the
Significant Accounting Policies I Accounting Convention The financial statements are prepared under the historical cost convention in accordance with generally accepted accounting principles in India the Accounting Standards issued by The Institute of Chartered Accountants of India and the
-----
auditor
INDEPENDENT AUDITORS REPORT To the Members of SUMMIT SECURITIES LIMITED Report on the Standalone Financial Statements Opinion We have audited the accompanying standalone financial statements of Summit Securities Limited the Company which comprise the Balance S

17it [00:08,  1.96it/s]

financial statements. As part of an audit in accordance with SAs we exercise professional judgment and maintain professional skepticism throughout the audit. We also • Identify and assess the risks
basis of accounting and based on the audit evidence obtained whether a material uncertainty exists related to events or conditions that may cast
Significant Accounting Policies Corporate Information Summit Securities Limited the Company is registered as NonBanking Financial Company us 45IA of Reserve Bank of India
Basis of Preparation These financial statements have been prepared to comply with Generally Accepted Accounting Principles in India Indian GAAP the Accounting Standards notified under the relevant provisions of the Companies Act
financial statements are prepared on accrual basis under the historical cost convention. The financial statements are presented in Indian rupees rounded off to the nearest rupees in Lakhs. Use of Estimates The preparation of financial statements in conformi




In [537]:
# Sum the per-section token counts into `total_num_tokens`.
# The total's own name also contains "num_tokens", so it is excluded explicitly; this
# keeps the cell idempotent on re-runs without first having to zero the previous total.
cols_num_tokens = [
    col for col in meta_df.columns
    if "num_tokens" in col and col != 'total_num_tokens'
]
meta_df['total_num_tokens'] = meta_df[cols_num_tokens].sum(axis=1)

In [538]:
# Show the meta frame with the token-count columns added.
# NOTE(review): the rendered table includes `path_doc`, i.e. local file-system paths -
# consider clearing this output before sharing the notebook.
meta_df

Unnamed: 0,doc_id,doc_num_pages,notes,auditor,path_doc,auditor_num_tokens,notes_num_tokens,total_num_tokens
0,200018069,29,[],[10],C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,271.0,0.0,271.0
1,22032523,132,"[18, 50, 74, 81, 127]","[75, 76]",C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,320.0,653.0,973.0
2,60665866,221,"[156, 200]",[],C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,0.0,155.0,155.0
3,60784900,92,"[25, 26, 43, 44, 45, 46, 51, 52, 81]",[85],C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,232.0,1083.0,1315.0
4,60816306,176,"[33, 94, 95, 96, 97, 150, 151, 165]","[161, 162, 170]",C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,430.0,838.0,1268.0
5,61029873,212,"[1, 3, 111, 190, 191]",[201],C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,281.0,443.0,724.0
6,61178281,130,"[23, 40, 50, 51]","[42, 43]",C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,287.0,581.0,868.0
7,61466756,80,[24],[19],C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,328.0,61.0,389.0
8,61635511,489,"[34, 252, 297, 305, 315, 425]",[307],C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,339.0,646.0,985.0
9,61681491,256,"[3, 98, 156, 166, 168]",[159],C:\Users\ilias\Desktop\UniMaResearch2023\Extra...,248.0,559.0,807.0


In [539]:
# Mean of the summed token counts over all rows of the meta frame.
meta_df['total_num_tokens'].mean()

977.4705882352941