# Imports

In [146]:
import re
import os
import pandas as pd
from pprint import pprint
import time

import utility.utility as util
from utility.Extractor import PageNumberExtractor, PageTextExtractor
import utility.text_cleaning as tc
import utility.extractor_meta as em
from datetime import datetime

# Progress Bars:
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

# The import and CSS below only widen the notebook display area
from IPython.display import display, HTML, clear_output
display(HTML("<style>.container { width:100% !important; }</style>"))

# Automatically reload imported modules whenever their source files change
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Paths

In [174]:
# Project directories, all resolved relative to the current working directory.
path_cwd = os.getcwd()
path_data, path_output = (os.path.join(path_cwd, name) for name in ('data', 'output'))
path_raw_pdf, path_raw_txt = (
    os.path.join(path_data, name) for name in ('raw_pdf_files', 'raw_text_files')
)

## File paths

In [175]:
# Map each document id (the file name without its extension) to the full path of the file.
# os.path.splitext removes only the final extension, so a name such as "123.v2.pdf" keeps
# the id "123.v2"; the notebook later rebuilds the path as "<doc_id>.pdf", which only
# round-trips when the id is the complete stem. The listing is sorted because os.listdir
# returns entries in arbitrary order, which would make the processing order vary per run.
paths_pdf_files = {
    os.path.splitext(file_name)[0]: os.path.join(path_raw_pdf, file_name)
    for file_name in sorted(os.listdir(path_raw_pdf))
    if file_name != '.gitkeep'  # placeholder that keeps the empty folder under version control
}
ids = list(paths_pdf_files)

# Prepare Section Anchors

In [305]:
# Raw anchor definitions per section, then the processed form handed to the extractors.
section_anchors = dict(notes=em._notes_sections, auditor=em._auditor_sections)
processed_section_anchors = util.process_section_anchors(section_anchors)

# Create Extractors

In [308]:
def _make_page_num_extractor(doc_id, pdf_path):
    """Build the PageNumberExtractor for one document with the notebook's fixed settings."""
    return PageNumberExtractor(
        doc_id=doc_id,
        path=pdf_path,
        section_anchors=processed_section_anchors,
        min_anchor_hit_ratio=0.5,
        flag_only_max_hits=False,
        flag_allow_overlapping_sections=False,
        flag_adjust_real_page_num=False,
        flag_do_ocr=False,
        flag_allow_duplicate_hits_in_groups=True,
        sections_with_page_skip_groups=['auditor'],
        thresh_ocr=100,
    )


# One extractor per document, keyed by document id.
page_num_extractors = {
    doc_id: _make_page_num_extractor(doc_id, pdf_path)
    for doc_id, pdf_path in tqdm(paths_pdf_files.items())
}

100%|█████████████████████████████████████████████████████████████████████████████| 183/183 [00:00<00:00, 71814.90it/s]


In [309]:
# Run every page-number extractor and collect one result record per document.
# Iterating over the dict values keeps the built-in id() usable (the loop variable used to
# be called `id`) and drops the `del`, which released nothing because the dict still
# references each extractor.
results = [extractor.run() for extractor in tqdm(page_num_extractors.values())]
result_df = pd.DataFrame(results)

100%|████████████████████████████████████████████████████████████████████████████████| 183/183 [15:30<00:00,  5.08s/it]


In [342]:
def _load_labelled_pages(filename, columns):
    """Read one manually labelled sheet from ``data/testing_meta`` and normalise it.

    Parameters
    ----------
    filename : str
        Name of the Excel file inside ``<path_data>/testing_meta``.
    columns : list of str
        Columns to keep; must include ``doc_id``, ``audit_std_page`` and ``notes_std_page``.

    Returns
    -------
    pandas.DataFrame
        Copy restricted to ``columns`` with both page columns as nullable ``Int64``
        (unparseable entries become ``<NA>``) shifted down by one, and ``doc_id`` as ``str``.
    """
    labels = pd.read_excel(os.path.join(path_data, 'testing_meta', filename))
    labels = labels[columns].copy()
    for page_col in ('audit_std_page', 'notes_std_page'):
        # NOTE(review): the -1 presumably converts 1-based sheet pages to the 0-based
        # indices reported by the extractor - confirm.
        labels[page_col] = pd.to_numeric(labels[page_col], errors='coerce').astype('Int64') - 1
    labels['doc_id'] = labels['doc_id'].astype(str)
    return labels


test_df = _load_labelled_pages('sample_8april3.xlsx',
                               ['doc_id', 'audit_std_page', 'notes_std_page', 'done'])
test_df2 = _load_labelled_pages('fix_Sara_9_43.xlsx',
                                ['doc_id', 'audit_std_page', 'notes_std_page'])

# Values from the second sheet overwrite the first sheet's labels for matching documents.
for index, row in test_df2.iterrows():
    same_doc = test_df.doc_id == row.doc_id
    test_df.loc[same_doc, 'audit_std_page'] = row.audit_std_page
    test_df.loc[same_doc, 'notes_std_page'] = row.notes_std_page

In [343]:
# Attach the manual labels to the extraction results; documents without labels are kept.
merged = result_df.merge(test_df, how='left', on='doc_id')


In [344]:
def _page_hit(expected_page, found_pages):
    """Return True when the labelled page is among the found pages; False if there is no label."""
    if pd.notna(expected_page):
        return expected_page in found_pages
    return False


# (flag column, labelled-page column, found-pages column) for each evaluated section
for _hit_col, _label_col, _found_col in (('hit_notes', 'notes_std_page', 'notes'),
                                         ('hit_audit', 'audit_std_page', 'auditor')):
    merged[_hit_col] = merged.apply(
        lambda doc: _page_hit(doc[_label_col], doc[_found_col]), axis=1)

In [345]:
def calc_acc(df, section):
    """Share of rows whose ``hit_<section>`` flag is True among rows flagged True or False.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``hit_<section>`` column.
    section : str
        Suffix of the flag column, e.g. ``'notes'`` for ``hit_notes``.

    Returns
    -------
    pandas.Series
        One-element Series indexed by the flag column name, holding hits / valid rows.
    """
    flag_col = f'hit_{section}'
    flags = df[[flag_col]].copy()
    n_hits = flags.loc[flags[flag_col] == True].count()
    n_valid = flags.loc[flags[flag_col].isin([True, False])].count()
    return n_hits / n_valid

# Report the hit rate of each section (label text paired with the hit_<suffix> column suffix).
for _label, _hit_suffix in (('notes: ', 'notes'), ('auditor: ', 'audit')):
    print(_label, calc_acc(merged, _hit_suffix))

notes:  hit_notes    1.0
dtype: float64
auditor:  hit_audit    0.994536
dtype: float64


# Save Found Page Numbers

In [349]:
# Persist the found page numbers under a minute-resolution timestamped file name.
curr_time = datetime.now().strftime("%Y_%m_%d_%H_%M")
# Create the output folder on demand so the save also works on a fresh checkout.
os.makedirs(path_output, exist_ok=True)
result_df.to_parquet(os.path.join(path_output, f"page_nums_{curr_time}.parquet"), index=False)

# Extract Pages and prep text

## Load Meta Page File

In [351]:
# Sections whose pages are extracted below.
sections = ['auditor', 'notes']
# Previously saved page-number file to continue from.
file = "page_nums_2024_04_11_00_41.parquet"
path_meta_file = os.path.join(path_output, file)
meta_df = pd.read_parquet(path_meta_file)

In [352]:
def _pdf_path_for(doc_id):
    """Return the location of ``<doc_id>.pdf`` inside the raw PDF folder."""
    return os.path.join(path_raw_pdf, f"{doc_id}.pdf")


meta_df['path_doc'] = meta_df['doc_id'].map(_pdf_path_for)

In [353]:
# Document ids present in the loaded meta file.
meta_ids = meta_df.doc_id.values

In [363]:
# Raw accounting-standard anchors per section, then the processed form used for text extraction.
extraction_anchors = dict(notes=em._notes_standards, auditor=em._auditor_standards)
processed_extraction_anchors = util.process_section_anchors(extraction_anchors)

In [427]:
# Run the text extraction for every section of every document and store the outcome on
# meta_df. `results` maps (doc_id, section) to whatever PageTextExtractor.run() returns;
# the code below reads element 0 as the text, 1 as the token count and 2 as the terms.
results = {}
# iterrows() is a generator without a length, so the total is passed explicitly to let
# tqdm show a percentage and an ETA instead of a bare iteration counter.
for index, row in tqdm(meta_df.iterrows(), total=len(meta_df)):
    doc_id = row.doc_id  # named doc_id so the built-in id() is not shadowed
    same_doc = meta_df.doc_id == doc_id
    for section in sections:
        terms_col = f"{section}_terms"
        if terms_col not in meta_df.columns:
            # object dtype so that a single cell can hold a whole list of terms
            meta_df[terms_col] = None
            meta_df[terms_col] = meta_df[terms_col].astype('object')
        key = (doc_id, section)
        # NOTE(review): the positional arguments True, 5 and 400 are opaque from here;
        # consider passing them by keyword once the PageTextExtractor signature is confirmed.
        results[key] = PageTextExtractor(doc_id, row.path_doc, section, row[section], True,
                                         processed_extraction_anchors[section], 5, 400).run()
        extracted = results[key]
        meta_df.loc[same_doc, f"{section}_num_tokens"] = extracted[1]
        meta_df.loc[same_doc, f"{section}_text"] = extracted[0]
        # .at writes the list into one cell; .loc would try to broadcast its elements
        meta_df.at[index, terms_col] = extracted[2]

183it [00:50,  3.63it/s]


In [425]:
# Sum the per-section token counts into one total per document.
# 'total_num_tokens' itself contains "num_tokens", so it is excluded from the inputs
# explicitly; re-running the cell then gives the same totals without first having to
# reset the column to zero.
cols_num_tokens = [col for col in meta_df.columns
                   if "num_tokens" in col and col != 'total_num_tokens']
meta_df['total_num_tokens'] = meta_df[cols_num_tokens].sum(axis=1)

In [367]:
meta_df.total_num_tokens.describe()

count     183.000000
mean      636.360656
std       356.437672
min       143.000000
25%       359.000000
50%       556.000000
75%       816.500000
max      2150.000000
Name: total_num_tokens, dtype: float64

In [428]:
meta_df.columns

Index(['doc_id', 'doc_num_pages', 'notes', 'auditor', 'path_doc',
       'auditor_terms', 'auditor_num_tokens', 'auditor_text', 'notes_terms',
       'notes_num_tokens', 'notes_text'],
      dtype='object')

In [437]:
# Show the extracted text and matched terms of one sample document (row label 5).
sample_fields = meta_df.loc[5, ['auditor_text', 'auditor_terms', 'notes_text', 'notes_terms']]
for field_name, field_value in sample_fields.items():
    print(field_name, field_value, sep='\n', end='\n\n')

auditor_text
in the circumstances but not for the purpose of expressing an opinion on the effectiveness of the entitys internal control. An audit also includes evaluating the appropriateness of accounting policies used and the reasonableness of accounting estimates made by management as well as evaluating the overall presentation of the consolidated financial statements. We believe that the audit evidence we have obtained in our audits is sufficient and appropriate to provide a basis for our audit opinion. Opinion In our opinion the consolidated financial statements present fairly in all material respects the financial position of Envoy Capital Group Inc. as at September and and the results of its operations and its cash flows for the three years ended September in accordance with Canadian generally accepted accounting principles.

auditor_terms
['Canadian generally accepted accounting principles.']

notes_text
financial statements included in this filing U.S. GAAP International Financ

In [433]:
type(meta_df.loc[5, ['auditor_text', 'auditor_terms', 'notes_text', 'notes_terms']])

pandas.core.series.Series

# Testing Area

In [346]:
# Documents where at least one of the two sections was missed.
missed_any = ~merged.hit_audit | ~merged.hit_notes
detm = merged[missed_any].copy()
# Drop rows where the missed section simply has no label while the other section was hit.
audit_unlabelled_notes_hit = detm.audit_std_page.isna() & detm.hit_notes
notes_unlabelled_audit_hit = detm.notes_std_page.isna() & detm.hit_audit
mask = ~(audit_unlabelled_notes_hit | notes_unlabelled_audit_hit)

In [347]:
detm[mask].reset_index(drop=True)

Unnamed: 0,doc_id,doc_num_pages,notes,auditor,audit_std_page,notes_std_page,done,hit_notes,hit_audit
0,22430067,51,"[1, 6, 7, 21, 22, 31, 16]",[15],16,21,Hala,True,False


In [348]:
# Re-run the page-number extraction for a single document to inspect its anchor hits.
id = '22430067'
path = paths_pdf_files[id]
single_doc_settings = {
    'doc_id': id,
    'path': path,
    'section_anchors': processed_section_anchors,
    'min_anchor_hit_ratio': 0.5,
    'flag_only_max_hits': False,
    'flag_allow_overlapping_sections': False,
    'flag_adjust_real_page_num': False,
    'flag_do_ocr': False,
    'flag_allow_duplicate_hits_in_groups': True,
    'sections_with_page_skip_groups': ['auditor'],
    'thresh_ocr': 100,
}
ex = PageNumberExtractor(**single_doc_settings)
result = ex.run()

notes
0 set()
re.compile('account principl')
re.compile('financi statement .*? prepar .*? with')
1 {0, 2}
2 set()
3 set()
re.compile('account principl')
4 {0}
5 set()
re.compile('account polici')
6 {0}
re.compile('account polici')
re.compile('financi report standard')
re.compile('financi statement .*? prepar .*? with')
7 {0, 1, 2}
8 set()
9 set()
10 set()
11 set()
12 set()
13 set()
14 set()
re.compile('account polici')
re.compile('signific account polici')
re.compile('financi statement .*? prepar .*? with')
15 {0, 1, 2}
['pwc independ auditor report to the sharehold of sirocco mine inc. we have audit the accompani consolid financi statement of sirocco mine inc. formerli known a atacama miner corp. and it subsidiari which compris the consolid balanc sheet a at decemb 31 2011 decemb 3 1 2010 and januari 1 2010 and the consolid statement of loss and comprehens loss cash flow and chang in equiti for the year end decemb 31 2011 and decemb 31 2010 and the relat note which compris a summari o

In [299]:
result

{'doc_id': '63606537',
 'doc_num_pages': 332,
 'notes': [51,
  54,
  65,
  128,
  137,
  148,
  149,
  151,
  152,
  154,
  173,
  181,
  182,
  185,
  187,
  196,
  197,
  205,
  206,
  207,
  210,
  213,
  214,
  215,
  217,
  218,
  219,
  220,
  221,
  273,
  274,
  275,
  276,
  277,
  278,
  279,
  191],
 'auditor': [192]}

In [341]:
merged[merged.audit_std_page.isna() | merged.notes_std_page.isna()]

Unnamed: 0,doc_id,doc_num_pages,notes,auditor,audit_std_page,notes_std_page,done,hit_notes,hit_audit


In [286]:
processed_section_anchors

{'notes': [('basi of prepar',
   'basi of present',
   'basi of account',
   'statement of complianc',
   'account convent',
   'present of account',
   'base of prepar',
   'account polici',
   'account principl'),
  ('summari of account polici',
   'statement of complianc',
   'signific account polici',
   'financi report standard',
   'basi of report',
   'present of ... financi statement',
   'appli rule'),
  ('financi statement ... prepar ... with',
   'financi ... in accord with',
   'financi ... prepar ... accord to',
   'prepar ... in accord with')],
 'auditor': [('have audit the', 'ha audit the'),
  ('manag ... is respons for the prepar and fair present',
   'the prepar of the ... are the respons of',
   'respons for prepar the ... in accord with',
   'these consolid financi statement are the respons of',
   'is respons for',
   'director ... are respons for the prepar',
   'are the respons of'),
  ('respons is to express an opinion',
   'respons is to audit the',
   'our resp