# Imports

In [358]:
import re
import os
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint
import time

import utility.utility as util
from utility.Extractor import PageNumberExtractor, PageTextExtractor, TermExtractor
import utility.text_cleaning as tc
import utility.extractor_meta as em
from datetime import datetime

# Progress Bars:
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

# Below import and instructions simply for display
from IPython.display import display, HTML, clear_output
display(HTML("<style>.container { width:100% !important; }</style>"))

# resets import once changes have been applied
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [359]:
import nltk
nltk.data.path[:] = ['Q:\\Nasri\\utilities\\nltk_data']

# Paths

In [360]:
path_cwd = os.getcwd()
path_data = os.path.join(path_cwd, 'data')
path_raw_pdf = os.path.join(path_data, 'raw_pdf_files')
path_input_meta = os.path.join(path_data, 'input_meta')
path_testing_meta = os.path.join(path_data, 'testing_meta')

# Load Meta

In [361]:
meta_input = pd.read_csv(os.path.join(path_input_meta, 'full_test.csv'))
meta_input

Unnamed: 0,doc_id,doc_path
0,21175204,L:/pi_documents/documents_raw/acrobat_pdf_docu...
1,22399065,L:/pi_documents/documents_raw/acrobat_pdf_docu...
2,22415158,L:/pi_documents/documents_raw/pdf_navigator/22...
3,22430067,L:/pi_documents/documents_raw/pdf_navigator/22...
4,22430199,L:/pi_documents/documents_raw/pdf_navigator/22...
...,...,...
546,63471748,L:/pi_documents/documents_raw/pdf_navigator/63...
547,63688600,L:/pi_documents/documents_raw/pdf_navigator/63...
548,65265662,L:/pi_documents/documents_raw/acrobat_pdf_docu...
549,64651583,L:/pi_documents/documents_raw/pdf_navigator/64...


# Load Evaluation Data

In [372]:
testing_meta = pd.read_csv(os.path.join(path_testing_meta, 'ult_test.csv'))

# Test Page Number Extractor

### Single Test

In [983]:
section_anchors = {'notes': em._notes_sections, 'auditor': em._auditor_sections}
processed_section_anchors = util.process_section_anchors(section_anchors)
processed_section_anchors = {key: [util.prep_extract_anchors(a, '400') for v in value for a in v] for key, value in processed_section_anchors.items()}

In [1012]:
doc_id = 22032523


#mmtest.iloc[17:,:]['doc_id'].values[0]
section_anchors = {'notes': em._notes_sections, 'auditor': em._auditor_sections}
processed_section_anchors = util.process_section_anchors(section_anchors)

keep = 61145036, 60359654, 64036845
doc_id = doc_id
doc_path = 'L:/pi_documents/documents_raw/pdf_navigator/61635511.pdf'
doc_id = doc_path.split('/')[-1]
#doc_path = meta_input.loc[meta_input.doc_id == doc_id, "doc_path"].values[0]

res = PageNumberExtractor(doc_id=doc_id,
                          path=doc_path,
                          section_anchors=processed_section_anchors,
                          min_anchor_hit_ratio=.4,
                          flag_only_max_hits=False,
                          flag_allow_overlapping_sections=False,
                          flag_adjust_real_page_num=False,
                          flag_do_ocr=True,
                          thresh_ocr=100,
                          flag_allow_duplicate_hits_in_groups=True,
                          sections_do_grouping=['auditor'],
                          sections_with_page_skip_groups=None,).run()

res

notes
34 {1, 2, 3}
142 {2, 3}
252 {0, 1}
261 {2, 3}
264 {2, 3}
267 {2, 3}
297 {0, 1, 2, 3}
298 {2, 3}
301 {2, 3}
305 {0, 2, 3}
307 {0, 1, 2, 3}
315 {0, 2, 3}
324 {0, 1}
425 {0, 1, 2, 3}
434 {2, 3}
485 {0, 2, 3}
auditor
267 {6, 7}
307 {0, 1, 2, 4, 6, 7}


{'doc_id': '61635511.pdf',
 'doc_path': 'L:/pi_documents/documents_raw/pdf_navigator/61635511.pdf',
 'doc_num_pages': 489,
 'notes': [34,
  142,
  252,
  261,
  264,
  267,
  297,
  298,
  301,
  305,
  315,
  324,
  425,
  434,
  485],
 'auditor': [306, 307]}

### Loop Test

In [None]:
section_anchors = {'notes': em._notes_sections, 'auditor': em._auditor_sections}
processed_section_anchors = util.process_section_anchors(section_anchors)

results = []
for index, row in tqdm(meta_input.iterrows()):
    doc_id = row.doc_id
    doc_path = row.doc_path

    res = PageNumberExtractor(doc_id=doc_id,
                          path=doc_path,
                          section_anchors=processed_section_anchors,
                          min_anchor_hit_ratio=0.4,
                          flag_only_max_hits=False,
                          flag_allow_overlapping_sections=False,
                          flag_adjust_real_page_num=False,
                          flag_do_ocr=True,
                          flag_allow_duplicate_hits_in_groups=True,
                          sections_with_page_skip_groups=['auditor'],
                          thresh_ocr=100).run()

    results.append(res)
result_df = pd.DataFrame(results)

In [636]:
result_df

Unnamed: 0,doc_id,doc_path,doc_num_pages,notes,auditor
0,61777009,L:/pi_documents/documents_raw/pdf_navigator/61...,80,"[2, 32, 33, 35, 36, 37, 38, 39, 21]","[22, 23]"
1,63688600,L:/pi_documents/documents_raw/pdf_navigator/63...,216,"[3, 85, 87, 88, 89, 103, 104, 105, 106, 107, 1...","[92, 93, 94, 95]"
2,65265662,L:/pi_documents/documents_raw/acrobat_pdf_docu...,182,"[67, 115, 127]","[124, 125, 126]"
3,62963970,L:/pi_documents/documents_raw/pdf_navigator/62...,223,"[55, 56, 75, 135, 136, 145, 146, 147, 211, 216...",[118]
4,61874456,L:/pi_documents/documents_raw/pdf_navigator/61...,148,"[51, 52, 58, 62, 74, 75, 76, 77, 78, 79, 81, 8...",[68]
...,...,...,...,...,...
546,22415158,L:/pi_documents/documents_raw/pdf_navigator/22...,48,"[8, 11, 12, 14, 31, 33]","[23, 24, 25]"
547,22399065,L:/pi_documents/documents_raw/acrobat_pdf_docu...,50,"[18, 19, 21, 29, 30, 31, 32]",[25]
548,60316808,L:/pi_documents/documents_raw/acrobat_pdf_docu...,72,"[34, 35, 36, 37, 38, 39, 40]",[33]
549,64041464,L:/pi_documents/documents_raw/acrobat_pdf_docu...,254,"[29, 30, 31, 35, 74, 105, 106, 107, 112, 128, ...","[93, 94, 95, 96, 97, 199, 200]"


## Accuracy Page Num Extractor

In [637]:
merged = pd.merge(result_df, testing_meta, on='doc_id', how='left')
merged['audit_std_page'] = pd.to_numeric(merged['audit_std_page'], errors='coerce').astype('Int64')
merged['notes_std_page'] = pd.to_numeric(merged['notes_std_page'], errors='coerce').astype('Int64')
merged['hit_notes'] = merged.apply(lambda x: (x.notes_std_page in x.notes) if pd.notna(x.notes_std_page) else False, axis=1)
merged['hit_audit'] = merged.apply(lambda x: (x.audit_std_page in x.auditor) if pd.notna(x.audit_std_page) else False, axis=1)

def calc_acc(df, section):
    df = df[[f'hit_{section}']].copy()
    mask_True = df[f'hit_{section}'] == True
    hits = df[mask_True].count()
    mask_valid = df[f'hit_{section}'].isin([True, False])
    valid =df[mask_valid].count()
    return hits/valid

print('notes: ',calc_acc(merged, 'notes'))
print('auditor: ',calc_acc(merged, 'audit'))

notes:  hit_notes    0.647913
dtype: float64
auditor:  hit_audit    0.658802
dtype: float64


### Missing Page Nums

In [None]:
merged[merged.audit_std_page.isna() | merged.notes_std_page.isna()]

### Wrong Page

In [None]:
merged[~merged.hit_audit | ~merged.hit_notes]

# Test Text Extractor

In [182]:
meta_text_extraction = result_df.copy()

### Single Test

In [1007]:
extraction_anchors = {'notes': em._notes_standards, 'auditor': em._auditor_standards}
processed_extraction_anchors = util.process_section_anchors(extraction_anchors)

doc_id = 22032523
doc_path = meta_text_extraction.loc[meta_text_extraction.doc_id == doc_id, "doc_path"].values[0]

res = {}

for section, anchor in processed_extraction_anchors.items():
    section_pages = meta_text_extraction.loc[meta_text_extraction.doc_id == doc_id, section].values[0]
    res[section] = PageTextExtractor(doc_id = doc_id,
                                     path = doc_path,
                                     section = section,
                                     page_nums = section_pages,
                                     flag_reduce = False,
                                     anchors = anchor,
                                     anchor_add_word_window = 20,
                                     allowance_wildcards_reg_matches = 400,
                                     flag_do_ocr = False).run()

res

[autoreload of utility.Extractor failed: Traceback (most recent call last):
  File "Q:\Nasri\python_venvs\venv_ExtractSectionsPDFs\Lib\site-packages\IPython\extensions\autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "Q:\Nasri\python_venvs\venv_ExtractSectionsPDFs\Lib\site-packages\IPython\extensions\autoreload.py", line 500, in superreload
    update_generic(old_obj, new_obj)
  File "Q:\Nasri\python_venvs\venv_ExtractSectionsPDFs\Lib\site-packages\IPython\extensions\autoreload.py", line 397, in update_generic
    update(a, b)
  File "Q:\Nasri\python_venvs\venv_ExtractSectionsPDFs\Lib\site-packages\IPython\extensions\autoreload.py", line 365, in update_class
    update_instances(old, new)
  File "Q:\Nasri\python_venvs\venv_ExtractSectionsPDFs\Lib\site-packages\IPython\extensions\autoreload.py", line 322, in update_instances
    if type(ref) is old:
       ^^^^^^^^^^^^^^^^
KeyboardInterrupt
]


NameError: name 'meta_text_extraction' is not defined

### Loop Test

In [201]:
for index, row in tqdm(meta_text_extraction.iterrows()):
    doc_id = row.doc_id
    doc_path = row.doc_path
    for section, anchors in processed_extraction_anchors.items():
        section_pages = row[section]
        if f"{section}_terms" not in meta_text_extraction.columns:
            meta_text_extraction[f"{section}_terms"] = None
            meta_text_extraction[f"{section}_terms"] = meta_text_extraction[f"{section}_terms"].astype('object')

        res = PageTextExtractor(doc_id = doc_id,
                                path = doc_path,
                                section = section,
                                page_nums = section_pages,
                                flag_reduce = False,
                                anchors = anchors,
                                anchor_add_word_window = 20,
                                allowance_wildcards_reg_matches = 400,
                                flag_do_ocr = False).run()
        
        meta_text_extraction.loc[meta_text_extraction.doc_id == doc_id,f"{section}_num_tokens"] = res[1]
        meta_text_extraction.loc[meta_text_extraction.doc_id == doc_id,f"{section}_text"] = res[0]
        meta_text_extraction.at[index,f"{section}_terms"] = res[2]

# aggregate token estimate
cols_num_tokens = [col for col in meta_text_extraction.columns if "num_tokens" in col]
if 'total_num_tokens' in meta_text_extraction.columns:
    meta_text_extraction['total_num_tokens'] = 0
meta_text_extraction['total_num_tokens'] = meta_text_extraction[cols_num_tokens].sum(axis=1)

11it [00:02,  4.73it/s]


### Estimate Number of Tokens

In [202]:
meta_text_extraction[['auditor_num_tokens', 'notes_num_tokens', 'total_num_tokens']].describe()

Unnamed: 0,auditor_num_tokens,notes_num_tokens,total_num_tokens
count,11.0,11.0,11.0
mean,1378.363636,6685.727273,8064.090909
std,903.136565,3150.242533,3634.535334
min,260.0,1673.0,1933.0
25%,569.0,4468.5,5890.5
50%,1507.0,6322.0,7835.0
75%,1859.0,8907.0,9960.0
max,2924.0,11382.0,14306.0


# Test Term Extractor

In [932]:
meta_term_extraction = pd.read_parquet(os.path.join(path_data, 'extracted_page_nums', 'page_nums_complete_24_05_15_19_48.parquet')) #result_df.copy()
meta_term_extraction

Unnamed: 0,doc_id,doc_path,doc_num_pages,notes,auditor
0,61112220,L:/pi_documents/documents_raw/pdf_navigator/61...,45,"[17, 24, 36]","[42, 43, 44]"
1,61857037,L:/pi_documents/documents_raw/pdf_navigator/61...,37,"[6, 7, 19, 22, 23]","[10, 11]"
2,61139040,L:/pi_documents/documents_raw/pdf_navigator/61...,34,"[16, 25]","[28, 29]"
3,60492793,L:/pi_documents/documents_raw/acrobat_pdf_docu...,52,"[22, 26, 36]",[32]
4,60910983,L:/pi_documents/documents_raw/pdf_navigator/60...,28,"[8, 14]",[9]
...,...,...,...,...,...
166,173072492,L:/pi_documents/documents_raw/sec_full_submiss...,422,"[75, 100, 110, 214, 227, 235, 236, 250, 297, 2...","[91, 92, 149, 150, 151, 152, 153, 154, 155, 15..."
167,61796709,L:/pi_documents/documents_raw/pdf_navigator/61...,58,"[2, 17, 22, 30, 52]","[36, 37]"
168,64396210,L:/pi_documents/documents_raw/pdf_navigator/64...,89,"[16, 45, 46, 57, 65, 66, 77]","[40, 41, 42, 44, 61, 62, 63]"
169,64926379,L:/pi_documents/documents_raw/acrobat_pdf_docu...,258,"[83, 145, 159, 163, 178, 179, 180]","[160, 161, 163, 164]"


## Single Test

In [1013]:
extraction_anchors = {'notes': em._notes_standards, 'auditor': em._auditor_standards}
processed_extraction_anchors = util.process_section_anchors(extraction_anchors)

mv_anchors = {'notes': em._notes_sections, 'auditor': em._auditor_sections}
processed_mv_anchors = util.process_section_anchors(mv_anchors)
flag_mv = {'auditor': False, 'notes': False}

doc_id = 60506045

doc_path = 'L:/pi_documents/documents_raw/pdf_navigator/61635511.pdf'
doc_id = doc_path.split('/')[-1]
#doc_path = meta_term_extraction.loc[meta_term_extraction.doc_id == doc_id, "doc_path"].values[0]

print(doc_path)
meta_term_extraction.loc[meta_term_extraction.doc_id == doc_id, 'notes auditor'.split()]  = [[[34,
  142,
  252,
  261,
  264,
  267,
  297,
  298,
  301,
  305,
  315,
  324,
  425,
  434,
  485]], ]

res = {}

for section, anchors in processed_extraction_anchors.items():
    section_pages = meta_term_extraction.loc[meta_term_extraction.doc_id == doc_id][section].values[0]
    
    res[section] = TermExtractor(doc_id = doc_id,
                                 path = doc_path,
                                 section = section,
                                 page_nums = section_pages,
                                 anchors = anchors,
                                 anchor_add_word_window = 20,
                                 allowance_wildcards_reg_matches = 600,
                                 flag_capture_surrounding_sentences = True,
                                 surrounding_sentences_margin = 2,
                                 flag_do_ocr = True,
                                 thresh_ocr= 100,
                                 flag_para_majority_voting=flag_mv[section],
                                 anchors_mv = processed_mv_anchors[section]).run()


for section in res:
    print(section)
    print()
    for ir in res[section]:
        print(ir)
        print()
    print('---')

L:/pi_documents/documents_raw/pdf_navigator/61635511.pdf
Empty DataFrame
Columns: [notes, auditor]
Index: []


IndexError: index 0 is out of bounds for axis 0 with size 0

## Loop Test

In [None]:
for index, row in tqdm(meta_term_extraction.iterrows()):
    doc_id = row.doc_id
    doc_path = row.doc_path
    for section, anchors in processed_extraction_anchors.items():
        section_pages = row[section]
        if f"{section}_terms" not in meta_term_extraction.columns:
            meta_term_extraction[f"{section}_terms"] = None
            meta_term_extraction[f"{section}_terms"] = meta_term_extraction[f"{section}_terms"].astype('object')
        if f"{section}_sentences" not in meta_term_extraction.columns:
            meta_term_extraction[f"{section}_sentences"] = None
            meta_term_extraction[f"{section}_sentences"] = meta_term_extraction[f"{section}_sentences"].astype('object')
        if f"{section}_paragraphs" not in meta_term_extraction.columns:
            meta_term_extraction[f"{section}_paragraphs"] = None
            meta_term_extraction[f"{section}_paragraphs"] = meta_term_extraction[f"{section}_paragraphs"].astype('object')

        res = TermExtractor(doc_id = doc_id,
                            path = doc_path,
                            section = section,
                            page_nums = section_pages,
                            anchors = anchors,
                            anchor_add_word_window = 20,
                            allowance_wildcards_reg_matches = 400,
                            flag_capture_surrounding_sentences = True,
                            surrounding_sentences_margin = 2,
                            flag_do_ocr = False).run()

        paragraphs = ""
        for inter, para in res[0].items():
            paragraphs += ' '.join(para) + '\n\n'

        num_tokens = util.count_tokens(paragraphs)            
        
        meta_term_extraction.loc[meta_term_extraction.doc_id == doc_id,f"{section}_num_tokens"] = num_tokens
        meta_term_extraction.loc[meta_term_extraction.doc_id == doc_id,f"{section}_paragraphs"] = paragraphs
        meta_term_extraction.at[index,f"{section}_sentences"] = res[1]
        meta_term_extraction.at[index,f"{section}_terms"] = res[2]

# aggregate token estimate
cols_num_tokens = [col for col in meta_term_extraction.columns if "num_tokens" in col]
if 'total_num_tokens' in meta_term_extraction.columns:
    meta_term_extraction['total_num_tokens'] = 0
meta_term_extraction['total_num_tokens'] = meta_term_extraction[cols_num_tokens].sum(axis=1)

In [None]:
meta_term_extraction

In [None]:
meta_term_extraction[['auditor_num_tokens', 'notes_num_tokens', 'total_num_tokens']].describe()