In [2]:
import os
import pdfplumber
import pandas as pd

In [None]:
def extract_tables_with_structure(pdf_path):
    """
    Collect every table found in a PDF as an unmodified DataFrame.

    Each page of the document is scanned with ``page.extract_tables()`` and
    every non-empty result is wrapped in a ``pd.DataFrame`` as-is: no header
    row is promoted and no cells are cleaned, so the frame mirrors the row /
    column layout reported by the extractor.

    Two entries are recorded in ``df.attrs`` for each frame:
    ``'page_number'`` (1-based page index) and ``'table_number'`` (1-based
    position of the table among all tables reported for that page).

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        A list of DataFrames in page order. If an exception occurs while
        opening or reading the PDF, it is printed and whatever was collected
        before the failure (possibly nothing) is returned.
    """
    collected = []

    try:
        with pdfplumber.open(pdf_path) as document:
            for page_index, pdf_page in enumerate(document.pages, start=1):
                page_tables = pdf_page.extract_tables()
                if not page_tables:
                    continue

                for table_index, rows in enumerate(page_tables, start=1):
                    # Empty extractions are skipped but still consume an
                    # index, so numbering matches the extractor's ordering.
                    if not rows:
                        continue

                    frame = pd.DataFrame(rows)
                    frame.attrs.update(
                        page_number=page_index,
                        table_number=table_index,
                    )
                    collected.append(frame)
    except Exception as e:
        # Best-effort extraction: report the problem and keep partial results.
        print(f"Error extracting tables: {e}")

    return collected

In [5]:
import logging
# Raise the threshold of the "pdfminer" logger to ERROR so that messages of
# lower severity from that logger are not emitted.
# NOTE(review): presumably pdfminer is the backend used by pdfplumber and this
# is meant to silence its warnings during table extraction — confirm.
logging.getLogger("pdfminer").setLevel(logging.ERROR)


In [None]:
def display_tables(tables):
    """
    Print each table to stdout, preceded by a header with its metadata.

    Args:
        tables: Sequence of DataFrames. The header reads the
            ``'page_number'`` and ``'table_number'`` entries of ``df.attrs``;
            a missing entry is shown as ``Unknown``.

    Returns:
        None. When ``tables`` is empty a single "no tables" message is
        printed instead.
    """
    if not tables:
        print("No tables found in the PDF.")
        return

    print(f"\nFound {len(tables)} tables in total:")

    separator = "=" * 50
    for position, frame in enumerate(tables, start=1):
        metadata = frame.attrs
        page_label = metadata.get('page_number', 'Unknown')
        table_label = metadata.get('table_number', 'Unknown')

        print(f"\n TABLE {position} (Page {page_label}, Table #{table_label})")
        # index=False keeps the DataFrame's row labels out of the printout.
        print(frame.to_string(index=False))
        print(separator)

In [None]:
def save_tables_to_excel(tables, pdf_filename):
    """
    Write all tables into one Excel workbook, one sheet per table.

    The workbook is named ``<pdf base name without extension>_tables.xlsx``
    and is created relative to the current working directory, because only
    the base name of ``pdf_filename`` is used.

    Args:
        tables: Sequence of DataFrames. Sheet names are built from the
            ``'page_number'`` and ``'table_number'`` entries of ``df.attrs``
            (``Unknown`` when missing).
        pdf_filename: Path or name of the source PDF, used only to derive
            the workbook name.

    Returns:
        None. Nothing is written when ``tables`` is empty.
    """
    if not tables:
        return

    stem, _extension = os.path.splitext(os.path.basename(pdf_filename))
    excel_filename = f"{stem}_tables.xlsx"

    with pd.ExcelWriter(excel_filename, engine='openpyxl') as writer:
        for position, frame in enumerate(tables, start=1):
            page_label = frame.attrs.get('page_number', 'Unknown')
            table_label = frame.attrs.get('table_number', 'Unknown')

            # Excel limits sheet names to 31 characters; fall back to a
            # short positional name when the descriptive one is too long.
            descriptive = f"Page{page_label}_Table{table_label}"
            sheet_name = descriptive if len(descriptive) <= 31 else f"Table{position}"

            frame.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"Saved all tables to {excel_filename}")


In [None]:
def process_pdf_tables(pdf_path):
    """
    Extract the tables of one PDF file and print them.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        The list of DataFrames produced by ``extract_tables_with_structure``,
        or an empty list (after printing a message) when ``pdf_path`` does
        not exist.
    """
    if os.path.exists(pdf_path):
        extracted = extract_tables_with_structure(pdf_path)
        display_tables(extracted)

        # Excel export is currently switched off; to enable it, call
        # save_tables_to_excel(extracted, pdf_path) here.
        return extracted

    print(f"File not found: {pdf_path}")
    return []

# Entry point: runs only when this code is executed as the main module,
# not when it is imported.
if __name__ == "__main__":
    # Sample document to process; the path is relative to the current
    # working directory.
    pdf_file = "FACETS OF SENTIMENT ANALYSIS.pdf" 
    # The returned list of tables is not used here; the call is made for
    # its printed output.
    process_pdf_tables(pdf_file)


Processing PDF: FACETS OF SENTIMENT ANALYSIS.pdf (2 pages)
Page 1: Found 1 table(s)

Found 1 tables in total:

=== TABLE 1 (Page 1, Table #1) ===
                                                          0                                                                                                   1                                                                      2                                                       3                                                                             4                                                                    5                                                                           6                                                                                                               7
                                                     Task 1                                                                                              Task 2                                                                 Task 3  