## Log Investigation --> Training Data Creation

In [None]:
import os
import shutil

import numpy as np
import pandas as pd
import PyPDF2 as p
from pdf2image import convert_from_path
from PIL import Image

import philaudit as pa

In [None]:
# Load the page-range log; later cells read its `file` and `part3_range` columns.
df = pd.read_csv("../run_logs/part3_pgs.csv")

In [None]:
df

___

### Find error files and save them out

In [None]:
# Rows whose part3_range text contains "Error".
errors = df[df["part3_range"].str.contains("Error")]
print(errors.shape)
errors

In [None]:
# Persist the error rows, then keep only the non-error rows in df.
errors.to_csv("../run_logs/part3_errors.csv")
df = df[~df["part3_range"].str.contains("Error")]

___

### Find files with fractured ranges

These arise when blank pages and pages without tables are removed from the detected range, splitting it into several sub-ranges.

In [None]:
# Files whose stored range list holds more than one range object.
# NOTE(review): eval() executes whatever text is stored in the CSV column;
# only run this on logs you produced yourself.
broken_range_files = (
    df.loc[df["part3_range"].apply(lambda text: len(eval(text)) > 1), "file"]
    .tolist()
)

In [None]:
len(broken_range_files)

___

### Expand the range objects and concatenate them to a single list

In [None]:
def unpack_ranges(lst):
    """Flatten an iterable of iterables (e.g. ``range`` objects) into one list.

    Elements keep their order: everything from the first entry, then
    everything from the second, and so on.
    """
    return [item for chunk in lst for item in chunk]

In [None]:
# Parse each stored "[range(a, b), ...]" string and flatten it into a single
# list of page indices. unpack_ranges handles the one-range case as well, so
# no separate branch is needed.
# NOTE(review): eval() executes the CSV text as Python; only use trusted logs.
df["expanded"] = df["part3_range"].apply(
    lambda text: unpack_ranges(eval(text))
)

In [None]:
df.expanded

___

#### `Expanded` investigation

Finding files that need to be tested individually...

In [None]:
# Rows whose expanded page list has more than 30 entries.
long_ranges = df[df["expanded"].map(len) > 30]
print(long_ranges.shape)
long_ranges.head()

In [None]:
# Long ranges that are also fractured (file appears in broken_range_files).
long_broken_ranges = long_ranges[long_ranges["file"].isin(broken_range_files)]

In [None]:
print(long_broken_ranges.shape)
long_broken_ranges.head()

In [None]:
def move_these_to_new_test_folder(long_broken_ranges, dest="./philaudit/test/new_testers/"):
    """Move every file listed in ``long_broken_ranges.file`` into ``dest``.

    Args:
        long_broken_ranges: DataFrame with a ``file`` column of file paths.
        dest: Destination directory. It is created when it does not exist.
    """
    # The files are moved *into* this directory, so it has to exist first.
    os.makedirs(dest, exist_ok=True)
    for path in long_broken_ranges.file.to_list():
        shutil.move(path, dest)
# move_these_to_new_test_folder(long_broken_ranges)

___

In [None]:
# "predictions" df for all files
df.head()

In [None]:
# Names of the PDFs that already live in the test folder.
test_files = [
    name
    for name in os.listdir("./philaudit/test/pdf/")
    if name.endswith(".pdf")
]

In [None]:
# Rows of df whose file name (last '/'-separated component) is a test PDF.
sliver = (
    df[df["file"].apply(lambda path: path.split('/')[-1]).isin(test_files)]
    .reset_index(drop=True)
)
sliver

In [None]:
sliver

In [None]:
df.loc[df.file.str.contains("Pidigan2011")]


---

## Creating Training Dataset!

In [None]:
# Plan for building the labels:
# 1. Remove the long_broken_ranges rows from df, leaving only files for which
#    the current algorithm works (check out philaudit/test for more details).
# 2. For each of the good documents, take the difference between the indices
#    of reader.pages[] and the expanded range.
# 3. The leftover indices of reader.pages[] are "exclude"; the expanded range
#    itself is "include" --> easy labeling!
include, exclude = [], []

In [None]:
# Drop the rows that are both long and fractured.
to_remove = long_broken_ranges.index.tolist()
df = df.drop(index=to_remove)
df.shape

In [None]:
# No visible cell creates a 'broken' column, so a plain drop raises KeyError
# on a top-to-bottom run; errors='ignore' makes the drop a no-op when absent.
df.drop('broken', axis=1, inplace=True, errors='ignore')

In [None]:
df.head()

In [None]:
# The expanded page list doubles as the "include" label.
df["include"] = df["expanded"]

In [None]:
def generate_exclusion_pages_data_for_file(file, include):
    """Return the page indices of ``file`` that are not in ``include``.

    Args:
        file: Path of the PDF, passed to ``PyPDF2.PdfReader``.
        include: Iterable of zero-based page indices to leave out of the result.

    Returns:
        Ascending list of the indices in ``range(page_count)`` that do not
        appear in ``include``.
    """
    page_count = len(p.PdfReader(file).pages)
    include = set(include)
    # Walking range() in order keeps the result sorted; converting a set
    # difference to a list gives no ordering guarantee.
    return [page for page in range(page_count) if page not in include]

In [None]:
# One exclusion list per row, in df's row order. A comprehension avoids
# rebinding the notebook-level `include` / `exclude` names inside a loop.
ex_data = [
    generate_exclusion_pages_data_for_file(pdf_path, kept_pages)
    for pdf_path, kept_pages in zip(df["file"], df["include"])
]

In [None]:
# Attach the per-file exclusion lists as a new column.
df["exclude"] = ex_data

In [50]:
df.head()

Unnamed: 0,file,part3_range,expanded,include,exclude
0,./pdf/09-Bongabong2013_Part3-Status_of_PY's_Re...,"[range(0, 24)]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[]
1,./pdf/01-Umingan2013_Audit_Report.pdf,"[range(76, 79)]","[76, 77, 78]","[76, 77, 78]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,./pdf/05-Binalbagan2013_Part3-Status_of_Implem...,"[range(0, 8)]","[0, 1, 2, 3, 4, 5, 6, 7]","[0, 1, 2, 3, 4, 5, 6, 7]",[8]
3,./pdf/01-Sallapadan2012_Audit_Report.pdf,"[range(60, 71)]","[60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70]","[60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
4,./pdf/01-Sudipen2013_Audit_Report.pdf,"[range(41, 48)]","[41, 42, 43, 44, 45, 46, 47]","[41, 42, 43, 44, 45, 46, 47]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."


In [51]:
# Folder holding the PDFs and folder receiving the rendered page images.
# NOTE(review): pdf_folder is not referenced in the visible cells — confirm it
# is still needed.
pdf_folder = './pdf'
output_folder = './target_page_detection/training_data'

In [70]:
def convert_pdf_to_images(pdf_path, output_folder):
    """Render every page of a PDF to a PNG file in ``output_folder``.

    Files are named ``<pdf stem>_page_<n>.png`` where ``n`` starts at 1.

    Args:
        pdf_path: Path of the PDF to render.
        output_folder: Directory receiving the PNG files; created if missing.
    """
    # Saving into a directory that does not exist fails, so create it up front.
    os.makedirs(output_folder, exist_ok=True)
    stem = os.path.splitext(os.path.basename(pdf_path))[0]

    pages = convert_from_path(pdf_path, fmt='png')
    for page_number, img in enumerate(pages, start=1):
        img.save(os.path.join(output_folder, f"{stem}_page_{page_number}.png"), 'png')

def move_images_to_categories(df, output_folder):
    """Sort page images in ``output_folder`` into ``include``/``exclude`` subfolders.

    For each row, the image ``<pdf stem>_page_<idx + 1>.png`` is moved to
    ``<output_folder>/include`` for every ``idx`` in the row's ``include``
    list, and to ``<output_folder>/exclude`` for every ``idx`` in ``exclude``.

    Args:
        df: DataFrame with a ``file`` column (PDF path) and ``include`` /
            ``exclude`` columns holding iterables of zero-based page indices.
        output_folder: Directory that contains the page images.
    """
    categories = ('include', 'exclude')
    # The moves target these subfolders, so make sure they exist.
    for category in categories:
        os.makedirs(os.path.join(output_folder, category), exist_ok=True)

    for _, row in df.iterrows():
        # Derive the stem the same way convert_pdf_to_images does, rather than
        # assuming a four-character extension.
        stem = os.path.splitext(os.path.basename(row.file))[0]
        for category in categories:
            for idx in row[category]:
                image_name = f"{stem}_page_{idx + 1}.png"
                shutil.move(
                    os.path.join(output_folder, image_name),
                    os.path.join(output_folder, category, image_name),
                )


In [71]:
# Step 1 (currently commented out): render every PDF listed in df to page
# images in output_folder.
# for path in df['file']:
#     convert_pdf_to_images(path, output_folder)

# Step 2: move the page images into the include/exclude subfolders.
move_images_to_categories(df, output_folder)
