In [7]:
import camelot  # Library to extract tables from PDFs using rule-based parsers (lattice / stream heuristics)
import pdfplumber  # Library to extract text and tables from PDFs
import pandas as pd  # Library for data manipulation and analysis

def extract_scope_tables(pdf_path, keywords=("Scope 1", "Scope 2")):
    """
    Extract every table that mentions one of ``keywords`` from a PDF.

    The PDF is scanned twice: once with Camelot (all pages, ``flavor="stream"``)
    and once with pdfplumber (page by page). Each table is rendered to text and
    searched with a case-sensitive substring match; matching tables are
    previewed on stdout and collected.

    Args:
        pdf_path (str): Path to the PDF file.
        keywords (str | Iterable[str]): Substrings that mark a table as
            relevant. Defaults to ("Scope 1", "Scope 2"). A single string is
            treated as one keyword.

    Returns:
        pd.DataFrame: One row per matching table with the columns "Source"
        (extractor name plus table position) and "Table Data" (the table
        rendered with ``DataFrame.to_string()``). Both columns are present
        even when nothing matched.

    Raises:
        Any exception raised while opening or reading the PDF with pdfplumber
        propagates to the caller. Camelot failures are caught and printed.
    """
    # Normalise keywords so a bare string is not iterated character by character.
    if isinstance(keywords, str):
        keywords = (keywords,)
    keywords = tuple(keywords)
    keyword_label = " or ".join(keywords)  # "Scope 1 or Scope 2" for the default

    result_columns = ["Source", "Table Data"]
    extracted_tables = []  # One {"Source", "Table Data"} record per matching table

    def process_table(df, source):
        """Record ``df`` under ``source`` if its text contains any keyword."""
        # Search the cell text only (no index / header labels).
        table_text = df.to_string(index=False, header=False)
        if any(keyword in table_text for keyword in keywords):
            print(f"Table with {keyword_label} found in {source}")
            print("\nExtracted Table Preview:")
            print(df.head())
            extracted_tables.append({
                "Source": source,
                "Table Data": df.to_string(),
            })

    # 1. Camelot pass: best effort, a failure here must not block pdfplumber.
    print("Using Camelot...")
    try:
        camelot_tables = camelot.read_pdf(pdf_path, pages="all", flavor="stream")
        for table_index, table in enumerate(camelot_tables, start=1):
            process_table(table.df, f"Camelot Table {table_index}")
    except Exception as e:
        print(f"Error using Camelot: {e}")

    # 2. pdfplumber pass: errors propagate (e.g. a missing file stays loud).
    print("\nUsing pdfplumber...")
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            for table_index, table in enumerate(page.extract_tables(), start=1):
                process_table(
                    pd.DataFrame(table),
                    f"pdfplumber (Page {page_num}, Table {table_index})",
                )

    # Explicit columns keep the schema stable when no table matched;
    # pd.DataFrame([]) alone would have no columns at all.
    scope_df = pd.DataFrame(extracted_tables, columns=result_columns)
    return scope_df

# Input PDF to scan; point this at your own report.
pdf_path = "RW1lmju.pdf"
scope_tables_df = extract_scope_tables(pdf_path)

# Show the collected tables as plain text, header line first.
print("\nExtracted Tables:", scope_tables_df, sep="\n")

# Persist the result without the DataFrame index. The path is relative,
# so the file lands in the current working directory.
scope_tables_df.to_csv("Scope_Tables.csv", index=False)
print("Tables containing Scope 1 or Scope 2 saved to Scope_Tables.csv")

Using Camelot...


  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)


Table with Scope 1 or Scope 2 found in Camelot Table 3

Extracted Table Preview:
                                            0        1        2        3  \
0         1.1  Greenhouse gas (GHG) emissions                              
1  Table 1A – GHG emissions by scope (mtCO2e)                              
2                                                 FY20     FY21     FY22   
3                                     Scope 1  118,100  123,704  139,413   
4                                     Scope 2                              

         4  
0           
1           
2     FY23  
3  144,960  
4           
Table with Scope 1 or Scope 2 found in Camelot Table 4

Extracted Table Preview:
              0                                                  1        2  \
0                Table 1B – GHG emissions by scope (mtCO2e) wit...            
1                                                             FY20     FY21   
2  Scope 1 + 21                                            574,219