# Imports

In [1]:
import re
import os
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint
import time

import utility.utility as util
from utility.Extractor import PageNumberExtractor, PageTextExtractor, TermExtractor
import utility.text_cleaning as tc
import utility.extractor_meta as em
from datetime import datetime

# Progress Bars:
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

# The import and call below only widen the notebook display
from IPython.display import display, HTML, clear_output
display(HTML("<style>.container { width:100% !important; }</style>"))

# Automatically reload imported modules whenever their source files change
%load_ext autoreload
%autoreload 2

# Paths

In [2]:
# All locations are derived from the directory the notebook is started in.
path_cwd = os.getcwd()
# Top-level folders directly below the working directory.
path_data, path_output = (os.path.join(path_cwd, folder) for folder in ('data', 'output'))
# Raw input folders inside the data folder.
path_raw_pdf, path_raw_txt = (
    os.path.join(path_data, folder) for folder in ('raw_pdf_files', 'raw_text_files')
)

## File paths

In [3]:
# Map each document id (the file name without its extension) to the full path
# of the file inside path_raw_pdf; the '.gitkeep' placeholder is skipped.
# os.path.splitext strips only the final extension, so a name that contains
# additional dots keeps its full stem and still matches "<doc_id>.pdf".
paths_pdf_files = {
    os.path.splitext(file_name)[0]: os.path.join(path_raw_pdf, file_name)
    for file_name in os.listdir(path_raw_pdf)
    if file_name != '.gitkeep'
}
# Dict keys are already unique; sorting gives a reproducible order.
ids = sorted(paths_pdf_files)
len(ids)

183

# Prepare Section Anchors

In [4]:
# Raw anchor definitions per section, taken from utility.extractor_meta.
section_anchors = {
    'notes': em._notes_sections,
    'auditor': em._auditor_sections,
}
# Pre-process the anchors once; the page-number extractors below consume the result.
processed_section_anchors = util.process_section_anchors(section_anchors)

# Create Extractors (TODO: instantiate each extractor only when it is run, to save memory)

In [5]:
# Build one PageNumberExtractor per PDF, keyed by document id.
# Every extractor receives the same anchors and the same flag settings.
page_num_extractors = {
    doc_id: PageNumberExtractor(
        doc_id=doc_id,
        path=pdf_path,
        section_anchors=processed_section_anchors,
        min_anchor_hit_ratio=0.5,
        flag_only_max_hits=False,
        flag_allow_overlapping_sections=False,
        flag_adjust_real_page_num=False,
        flag_do_ocr=False,
        flag_allow_duplicate_hits_in_groups=True,
        sections_with_page_skip_groups=['auditor'],
        thresh_ocr=100,
    )
    for doc_id, pdf_path in tqdm(paths_pdf_files.items())
}

100%|███████████████████████████████████████████████████████████| 183/183 [00:00<00:00, 94573.39it/s]


In [None]:
# Run every page-number extractor and collect one result record per document.
# The extractors stay referenced by page_num_extractors, so unbinding a loop
# variable would not release any memory; the loop is therefore a plain
# comprehension, which also avoids shadowing the builtin `id`.
results = [extractor.run() for extractor in tqdm(page_num_extractors.values())]
# One row per document; later cells read the doc_id, notes and auditor columns.
result_df = pd.DataFrame(results)

In [None]:
def _load_std_page_labels(file_name, columns):
    """Read a label workbook from data/testing_meta and normalise its columns.

    Keeps only the whitespace-separated column names in `columns`, converts
    both page columns to nullable integers (non-numeric entries become <NA>)
    and subtracts 1 from them, and casts doc_id to str.
    NOTE(review): the -1 presumably maps 1-based page labels to 0-based page
    indices; confirm against the extractor output.
    """
    labels = pd.read_excel(os.path.join(path_data, 'testing_meta', file_name))
    labels = labels[columns.split()].copy()
    for page_col in ('audit_std_page', 'notes_std_page'):
        labels[page_col] = pd.to_numeric(labels[page_col], errors='coerce').astype('Int64') - 1
    labels['doc_id'] = labels['doc_id'].astype(str)
    return labels


# Base labels and a second workbook whose rows override the base labels.
test_df = _load_std_page_labels('sample_8april3.xlsx', 'doc_id audit_std_page notes_std_page done')
test_df2 = _load_std_page_labels('fix_Sara_9_43.xlsx', 'doc_id audit_std_page notes_std_page')

# Overwrite both page labels of every document that appears in the override workbook.
for index, row in test_df2.iterrows():
    same_doc = test_df.doc_id == row.doc_id
    test_df.loc[same_doc, 'audit_std_page'] = row.audit_std_page
    test_df.loc[same_doc, 'notes_std_page'] = row.notes_std_page

In [None]:
# Attach the manual labels to every extraction result; documents without a
# label keep their row and get missing label values (left join on doc_id).
merged = result_df.merge(test_df, how='left', on='doc_id')


In [None]:
def _is_hit(row, label_col, found_col):
    """Return True when the labelled page is among the pages found for this row.

    Rows whose label is missing count as False.
    """
    if pd.isna(row[label_col]):
        return False
    return row[label_col] in row[found_col]


merged['hit_notes'] = merged.apply(lambda row: _is_hit(row, 'notes_std_page', 'notes'), axis=1)
merged['hit_audit'] = merged.apply(lambda row: _is_hit(row, 'audit_std_page', 'auditor'), axis=1)

In [None]:
def calc_acc(df, section):
    """Share of True values in the `hit_<section>` column of `df`.

    Only entries equal to True or False count towards the denominator.
    Returns a one-element Series indexed by the column name.
    """
    col = f'hit_{section}'
    flags = df[[col]]
    n_hits = flags.loc[flags[col] == True].count()
    n_valid = flags.loc[flags[col].isin([True, False])].count()
    return n_hits / n_valid

# Report the hit rate per section. The auditor flags live in the column
# 'hit_audit', hence the 'audit' argument next to the 'auditor' label.
print('notes: ',calc_acc(merged, 'notes'))
print('auditor: ',calc_acc(merged, 'audit'))

# Save Found Page Numbers

In [None]:
# Stamp the output file with the current minute, so runs that start in
# different minutes get different file names.
curr_time = datetime.now().strftime("%Y_%m_%d_%H_%M")
page_nums_file = os.path.join(path_output, f"page_nums_{curr_time}.parquet")
result_df.to_parquet(page_nums_file, index=False)

# Extract Pages and prep text

## Load Meta Page File

In [7]:
# Sections whose pages are extracted below.
sections = ['auditor', 'notes']
# Name of one saved page-number file in the output folder; it follows the
# page_nums_<timestamp>.parquet naming used when the page numbers are saved.
file = "page_nums_2024_04_18_03_31.parquet"
meta_df = pd.read_parquet(os.path.join(path_output, file))

In [8]:
# Derive each document's PDF location from its id: <path_raw_pdf>/<doc_id>.pdf
meta_df['path_doc'] = meta_df['doc_id'].map(
    lambda doc_id: os.path.join(path_raw_pdf, f"{doc_id}.pdf")
)

In [9]:
# Underlying array of all document ids present in the page-number metadata.
meta_ids = meta_df['doc_id'].values

In [None]:
# Raw standards anchors per section, taken from utility.extractor_meta.
extraction_anchors = {
    'notes': em._notes_standards,
    'auditor': em._auditor_standards,
}
# Pre-process the anchors once; the text extraction below looks them up per section.
processed_extraction_anchors = util.process_section_anchors(extraction_anchors)

In [None]:
# Extract text, token count and matched terms for every section of every
# document and store them in meta_df as <section>_text, <section>_num_tokens
# and <section>_terms.

# Create the result columns once, up front, instead of re-checking them inside
# the innermost loop. Text and terms are object columns so a cell can hold any
# Python value; the token count starts as NaN.
for section in sections:
    for suffix in ('text', 'terms'):
        if f"{section}_{suffix}" not in meta_df.columns:
            meta_df[f"{section}_{suffix}"] = pd.Series([None] * len(meta_df), index=meta_df.index, dtype='object')
    if f"{section}_num_tokens" not in meta_df.columns:
        meta_df[f"{section}_num_tokens"] = float('nan')

# Raw extractor output, keyed by (doc_id, section).
results = {}
# iterrows() has no length, so tqdm needs the row count to show real progress.
for index, row in tqdm(meta_df.iterrows(), total=len(meta_df)):
    doc_id = row.doc_id  # not named `id`, which would shadow the builtin
    for section in sections:
        key = (doc_id, section)
        results[key] = PageTextExtractor(doc_id=doc_id,
                                         path=row.path_doc,
                                         section=section,
                                         page_nums=row[section],
                                         flag_reduce=False,
                                         anchors=processed_extraction_anchors[section],
                                         anchor_add_word_window=20,
                                         allowance_wildcards_reg_matches=400,
                                         flag_do_ocr=False).run()
        # run() output is indexed positionally: [0] text, [1] token count, [2] terms.
        # All three values are written to the current row through .at, which
        # sets exactly one cell and never broadcasts list-like values.
        meta_df.at[index, f"{section}_text"] = results[key][0]
        meta_df.at[index, f"{section}_num_tokens"] = results[key][1]
        meta_df.at[index, f"{section}_terms"] = results[key][2]

In [None]:
# Sum the per-section token counts into one total per document.
# The derived column itself also contains "num_tokens" in its name, so it is
# excluded explicitly; re-running the cell then cannot count the old total.
cols_num_tokens = [
    col for col in meta_df.columns
    if "num_tokens" in col and col != 'total_num_tokens'
]
meta_df['total_num_tokens'] = meta_df[cols_num_tokens].sum(axis=1)

In [None]:
# Summary statistics of the auditor-section token counts.
meta_df.auditor_num_tokens.describe()

In [None]:
# Inspect all columns of the row with index label 2.
meta_df.loc[2]

In [None]:
# Print the extracted text and terms of both sections for the row with index
# label 1: column name, value, then an empty line.
preview_cols = ['auditor_text', 'auditor_terms', 'notes_text', 'notes_terms']
for index, value in meta_df.loc[1, preview_cols].items():
    print(index, value, '', sep='\n')

In [16]:
# Show the first rows of the page-number metadata.
meta_df.head()

Unnamed: 0,doc_id,doc_num_pages,notes,auditor,path_doc
0,21175204,15,"[6, 9, 12]",[2],C:\Users\ilias\Desktop\UniMaResearch2023\Extra...
1,22399065,50,"[18, 19, 21, 29, 30, 31, 32]",[25],C:\Users\ilias\Desktop\UniMaResearch2023\Extra...
2,22415158,48,"[8, 11, 12, 14, 31, 33, 23]","[24, 25]",C:\Users\ilias\Desktop\UniMaResearch2023\Extra...
3,22430067,51,"[1, 6, 7, 21, 22, 31, 16]",[15],C:\Users\ilias\Desktop\UniMaResearch2023\Extra...
4,22430199,116,"[22, 47, 49, 50, 62, 63, 64, 65, 66, 67, 68, 6...","[52, 53, 54, 55, 56]",C:\Users\ilias\Desktop\UniMaResearch2023\Extra...


# Testing Area

In [32]:
# Rebuild the standards anchors so the testing cells below can run on their own.
extraction_anchors = {
    'notes': em._notes_standards,
    'auditor': em._auditor_standards,
}
processed_extraction_anchors = util.process_section_anchors(extraction_anchors)

In [44]:
# Run the text and term extractors for one document/section pair.
id = '22430199'
section = 'auditor'
path = paths_pdf_files[id]
# Pages found for this section of the document in the page-number metadata.
sections_pages = meta_df.loc[meta_df.doc_id == id, section].values[0]
print(sections_pages)
key = (id, section)

# Arguments that both extractors receive unchanged.
common_kwargs = dict(doc_id=id,
                     path=path,
                     section=section,
                     page_nums=sections_pages,
                     anchors=processed_extraction_anchors[section],
                     anchor_add_word_window=20,
                     allowance_wildcards_reg_matches=400,
                     flag_do_ocr=False)

text = PageTextExtractor(flag_reduce=True, **common_kwargs).run()

# NOTE(review): the saved output of this cell ends with
# "AttributeError: 'TermExtractor' object has no attribute 'prepare_results'",
# so this step failed when the notebook was last run.
term = TermExtractor(flag_capture_surrounding_sentences=True,
                     surrounding_sentences_margin=2,
                     **common_kwargs).run()



[52 53 54 55 56]
{58: [(8, 319)]}


AttributeError: 'TermExtractor' object has no attribute 'prepare_results'

In [None]:
import re

# Stand-alone check of one anchor pattern against one stemmed sentence.
# Each lazy wildcard `.{,300}?` is flanked by a literal space on both sides, so
# two anchor words separated by a single space (as in "opinion the") cannot
# match; for the sample below the search therefore finds nothing.
pattern = 'in our opinion .{,300}? the .{,300}? consolid financi statement .{,300}? in accord with'
text = "in our opinion the consolid financi statement present fairli in all materi respect the consolid financi posit of westport innov inc. a at decemb 31 2011 and march 31 2011 and it consolid result of oper and it consolid cash flow for the ninemonth period end decemb 31 2011 and the year end march 31 2011 and march 31 2010 in accord with gener accept account principl in the unit state"

# Search the whole text for the first occurrence of the pattern.
match = re.search(pattern, text)

if match is None:
    print("No match found.")
else:
    print("Match found:", match.group())

In [None]:
# Keep every document where at least one of the two sections was missed.
both_hit = merged.hit_audit & merged.hit_notes
detm = merged[~both_hit].copy()
# A row is not a real miss when one label is absent and the other section was hit.
audit_unlabelled_notes_hit = detm.audit_std_page.isna() & detm.hit_notes
notes_unlabelled_audit_hit = detm.notes_std_page.isna() & detm.hit_audit
mask = ~(audit_unlabelled_notes_hit | notes_unlabelled_audit_hit)

In [None]:
# Show the remaining misses with a fresh 0..n-1 index.
detm[mask].reset_index(drop=True)

In [None]:
# Re-run the page-number extraction for one document, with the same anchors
# and flag settings as the bulk run.
id = '22430067'
path = paths_pdf_files[id]
extractor_settings = dict(section_anchors=processed_section_anchors,
                          min_anchor_hit_ratio=0.5,
                          flag_only_max_hits=False,
                          flag_allow_overlapping_sections=False,
                          flag_adjust_real_page_num=False,
                          flag_do_ocr=False,
                          flag_allow_duplicate_hits_in_groups=True,
                          sections_with_page_skip_groups=['auditor'],
                          thresh_ocr=100)
ex = PageNumberExtractor(doc_id=id, path=path, **extractor_settings)
result = ex.run()

In [None]:
# Display the result of the single-document run.
result

In [None]:
# Rows where at least one of the two page labels is missing.
merged[merged.audit_std_page.isna() | merged.notes_std_page.isna()]

In [None]:
# Display the processed section anchors.
processed_section_anchors

In [None]:
# Scratch check of list truthiness: an empty list is falsy, so nothing is printed.
ls = list()
if len(ls) > 0:
    print(123)