## This notebook is designed to:
- Process multiple PDF files in a batch
- Extract all tables from each PDF
- Save each extracted table as a separate CSV file
- Maintain the original file naming convention while adding table numbers to the output files

In [1]:
import os
import sys
import pandas as pd

# Add the parent directory (the one that contains the `src` package) to the
# import path so that `src.table_utils` can be imported below.
# '..' is resolved relative to the kernel's current working directory.
# Guarded so that re-running this cell does not pile up duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')

# Import our custom modules
from src.table_utils import extract_tables_from_pdf, process_pdf_directory, save_pdfs_without_tables

In [2]:
## Specify input and output directories
# All paths are built from work_dir, which is relative to the kernel's
# current working directory. os.path.join is used instead of manual string
# concatenation so the separator is chosen by the platform; the results are
# still plain str values, so downstream cells can use them unchanged.
work_dir = ".."

# Directory that holds the input PDFs -- change this to your PDF directory
pdf_dir = os.path.join(work_dir, "data", "CoPath_All_pdfs", "OncoKids_split")

# Directory handed to the extraction helpers as the output location --
# change this to your desired output folder
output_folder = os.path.join(work_dir, "table", "extracted_tables", "CoPath_OncoKids_All")

In [3]:
# Run the batch step: hand the input directory and the output folder to
# process_pdf_directory. Its return value is kept as `no_tables_pdfs`
# (presumably the collection of PDFs in which no table was found -- confirm
# against src.table_utils).
no_tables_pdfs = process_pdf_directory(pdf_dir, output_folder)

# Persist that collection via save_pdfs_without_tables; the value it returns
# is reported below as the location the list was saved to.
no_tables_csv_path = save_pdfs_without_tables(no_tables_pdfs, output_folder)

# Summary: where the list was written and how many PDFs it contains.
summary_lines = (
    f"\nPDFs with no tables saved to: {no_tables_csv_path}",
    f"Number of PDFs with no tables: {len(no_tables_pdfs)}",
)
print(*summary_lines, sep="\n")

In [4]:
# Optional: re-run extraction for a single PDF using the 'stream' flavor instead of 'lattice'.
# The commented-out example below shows how to call extract_tables_from_pdf with different parameters.

# Example for a specific PDF with the stream flavor:
# pdf_to_reprocess = "OncoKids_2022_S22-4649.pdf"  # Replace with an actual PDF name
# if pdf_to_reprocess in no_tables_pdfs and os.path.exists(os.path.join(pdf_dir, pdf_to_reprocess)):
#     print(f"Reprocessing {pdf_to_reprocess} with stream flavor...")
#     has_tables = extract_tables_from_pdf(
#         pdf_dir, 
#         pdf_to_reprocess, 
#         output_folder,
#         flavor='stream',  # Using stream instead of lattice
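#         line_scale=40     # Different line scale. NOTE(review): in Camelot, line_scale is a lattice-only option -- confirm extract_tables_from_pdf accepts it together with flavor='stream'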
#     )
#     print(f"Tables found: {has_tables}")