## Data Extraction Notebook Using Camelot

This notebook extracts tables from a PDF file with Camelot and saves each extracted table to a CSV file.

In [None]:
# Check whether the kernel is running inside a virtual environment
import sys

# Interpreter backing this kernel; packages installed below should land here.
print(sys.executable)

# Inside a venv/virtualenv, sys.prefix points at the environment while
# sys.base_prefix still points at the base installation. Conda environments
# are not detected by this comparison.
in_venv = sys.prefix != sys.base_prefix
print(f"Running inside a virtual environment: {in_venv}")

#### Install Packages

In [None]:
!pip install camelot-py[cv]
!pip install tabulate

In [12]:
import logging
# from pathlib import Path
from pathlib import Path

import camelot
import pandas as pd
from IPython.display import display
from pypdf import PdfReader

In [9]:
# Set up logging
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger. Without it basicConfig() silently does nothing when this cell is
# re-run or when logging was configured earlier in the session.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
    force=True,
)
# Make sure camelot's own logger emits its debug output as well.
logging.getLogger("camelot").setLevel(logging.DEBUG)


In [24]:
from pathlib import Path

def process_pdf(pdf_file, output_dir, pages="3-4"):
    """Extract tables from a PDF with camelot and save each one as a CSV file.

    The PDF is first opened with ``PdfReader`` as a sanity check, then
    ``camelot.read_pdf`` is run on the requested pages. Every detected table is
    displayed, written to ``<output_dir>/<pdf stem>/<pdf stem>_table_<n>.csv``
    and its parsing report is printed and logged.

    Problems opening or reading the PDF, and failures on individual tables,
    are reported via ``print`` and ``logging`` instead of being raised; in the
    first two cases the function returns early without writing anything.

    Args:
        pdf_file: Path to the PDF (``str`` or ``Path``).
        output_dir: Directory that receives the per-PDF sub-directory
            (``str`` or ``Path``). Created, including parents, if missing.
        pages: Page selection string forwarded to ``camelot.read_pdf``.
            Defaults to ``"3-4"``.

    Returns:
        None.

    Raises:
        OSError: If the output sub-directory cannot be created.
    """
    pdf_file = Path(pdf_file)      # accept str as well as Path
    output_dir = Path(output_dir)  # same for the output location

    start_msg = f"Processing {pdf_file.name}"
    print(start_msg)
    logging.info(start_msg)

    # Verify PDF can be opened with PdfReader before processing
    try:
        reader = PdfReader(str(pdf_file))
        if len(reader.pages) == 0:
            raise ValueError(f"No pages found in PDF {pdf_file.name}")
    except Exception as e:
        open_err = f"Failed to open PDF {pdf_file.name} with PdfReader: {e}"
        print(open_err)
        logging.error(open_err)
        return

    # Read tables from the PDF using camelot
    try:
        tables = camelot.read_pdf(str(pdf_file), pages=pages)
    except Exception as e:
        read_err = f"Failed to read PDF {pdf_file.name}: {e}"
        print(read_err)
        logging.error(read_err)
        return

    if len(tables) == 0:
        empty_msg = f"No tables detected in {pdf_file.name}"
        print(empty_msg)
        logging.warning(empty_msg)
        return

    # Create a subdirectory for this PDF's output. parents=True so the call
    # also works when output_dir itself has not been created yet.
    pdf_output_dir = output_dir / pdf_file.stem
    pdf_output_dir.mkdir(parents=True, exist_ok=True)

    # Process individual tables; a failure on one table does not stop the rest.
    for i, table in enumerate(tables):
        try:
            df = table.df

            print(f"\nTable {i+1} from {pdf_file.name}:")
            display(df)

            csv_path = pdf_output_dir / f"{pdf_file.stem}_table_{i+1}.csv"
            df.to_csv(csv_path, index=False)
            print(f"Saved to {csv_path}")

            print(f"\nTable {i+1} Parsing Report:")
            logging.info(f"Table {i+1} Parsing Report:")
            print(table.parsing_report)
            logging.info(table.parsing_report)

        except Exception as e:
            table_err = (
                f"Failed to process or save table {i+1} from {pdf_file.name}: {e}"
            )
            print(table_err)
            logging.error(table_err)


In [25]:
# output_dir: directory that receives the extracted CSV files.
# It is a relative path, so it resolves against the current working directory.
output_dir = Path("output")
print(f"Output directory: {output_dir}")

Output directory: output


In [26]:
# Ensure output directory exists (parents=True so a nested output_dir also works)
output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Input PDF handed to process_pdf() in the next cell.
pdf_file = 'file_name.pdf'  # Replace with your PDF file path

In [None]:
# process_pdf reports failures through print/logging and returns normally, so
# the messages below appear whether or not any table was actually saved.
process_pdf(pdf_file, output_dir)

# Name the configured directory instead of a hard-coded 'output' so the hint
# stays correct if output_dir is changed.
completion_msg = f"Processing complete. Check the '{output_dir}' folder for results."
print(completion_msg)
logging.info(completion_msg)

print("Script execution finished.")