## The notebook is designed to:
### Process multiple PDF files in a batch
### Extract all tables from each PDF
### Save each extracted table as a separate CSV file
### Maintain the original file naming convention while adding table numbers to the output files

In [50]:
import camelot
import os
import pandas as pd

In [51]:
def extract_tables_from_pdf(pdf_dir, pdf_file, output_folder, pages="all"):
    # Using lattice mode as an example, which works well for PDFs with clear table borders
    tables = camelot.read_pdf(os.path.join(pdf_dir, pdf_file), pages=pages, flavor='lattice', line_scale=60)
    report_name = os.path.splitext(pdf_file)[0]

    # Check if any tables are found
    if tables.n == 0:
        print(f"No table from '{pdf_file}' saved")
        return False  # Return False if no tables found
    
    # Iterate over found tables and save them as CSV
    for i, table in enumerate(tables, start=1):
        # Save the table as a CSV
        csv_file=f"{report_name}_table_camelot_{i}.csv"
        csv_path = os.path.join(output_folder, csv_file)
        table.to_csv(csv_path)
        print(f"Table {i} from '{pdf_file}' saved as '{csv_file}'")
    
    return True  # Return True if tables were found

In [52]:
## specify input  and output
work_dir=".."
pdf_dir = work_dir+"/data/CoPath_All_pdfs/OncoKids_split"  # Change this to your PDF file path
output_folder = work_dir+"/table/extracted_tables/CoPath_OncoKids_All"  # Change this to your desired output folder path


In [53]:
# Create a list to store PDFs with no tables
no_tables_pdfs = []

# Process all PDFs
for pdf_file in os.listdir(pdf_dir):
    if pdf_file.endswith(".pdf"):
        has_tables = extract_tables_from_pdf(pdf_dir, pdf_file, output_folder)
        if not has_tables:
            no_tables_pdfs.append(pdf_file)

# Create DataFrame of PDFs with no tables
no_tables_df = pd.DataFrame(no_tables_pdfs, columns=['PDF_Filename'])

# Save the DataFrame to CSV
no_tables_csv_path = os.path.join(output_folder, 'pdfs_with_no_tables.csv')
no_tables_df.to_csv(no_tables_csv_path, index=False)
print(f"\nPDFs with no tables saved to: {no_tables_csv_path}")
print(f"Number of PDFs with no tables: {len(no_tables_pdfs)}")

No table from 'OncoKids_2022_S22-4649.pdf' saved
Table 1 from 'OncoKids_2022_S22-5345.pdf' saved as 'OncoKids_2022_S22-5345_table_camelot_1.csv'
Table 2 from 'OncoKids_2022_S22-5345.pdf' saved as 'OncoKids_2022_S22-5345_table_camelot_2.csv'
Table 1 from 'OncoKids_2022_C22-285.pdf' saved as 'OncoKids_2022_C22-285_table_camelot_1.csv'
Table 1 from 'OncoKids_2022_H22-4504.pdf' saved as 'OncoKids_2022_H22-4504_table_camelot_1.csv'
Table 1 from 'OncoKids_2022_S22-4298.pdf' saved as 'OncoKids_2022_S22-4298_table_camelot_1.csv'
Table 2 from 'OncoKids_2022_S22-4298.pdf' saved as 'OncoKids_2022_S22-4298_table_camelot_2.csv'
Table 3 from 'OncoKids_2022_S22-4298.pdf' saved as 'OncoKids_2022_S22-4298_table_camelot_3.csv'
Table 1 from 'OncoKids_2022_S22-4850.pdf' saved as 'OncoKids_2022_S22-4850_table_camelot_1.csv'
Table 1 from 'OncoKids_2022_S22-3875.pdf' saved as 'OncoKids_2022_S22-3875_table_camelot_1.csv'
Table 2 from 'OncoKids_2022_S22-3875.pdf' saved as 'OncoKids_2022_S22-3875_table_camelot_