In [None]:

import os
import sys
import numpy as np
import cv2
import pandas as pd
import pytesseract
import camelot
from pdf2image import convert_from_path
import layoutparser as lp
import PyPDF2


from doctr.models

SyntaxError: invalid syntax (4209221127.py, line 13)

In [None]:
#!/usr/bin/env python3
"""
Advanced PDF Table Extraction Pipeline with Advanced Table Structure Parsing

This script provides a complete pipeline to extract tables from PDF documents.
It first detects if the PDF is digital (embedded text) or scanned (image–based) and
then applies one of two extraction strategies:

For Digital PDFs:
    - Uses Camelot (with the “stream” flavor) to extract tables.

For Scanned PDFs:
    - Converts PDF pages to images using pdf2image.
    - Uses LayoutParser (with a PubLayNet pre-trained model) to detect table regions.
    - For each table region, the image is preprocessed and then an advanced table–structure
      parser is applied. This parser uses Tesseract’s TSV output to reconstruct the table
      layout, grouping recognized words into rows and columns based on their bounding boxes.

Requirements:
    - python3
    - camelot-py[cv]
    - pdf2image
    - layoutparser
    - pytesseract
    - opencv-python
    - PyPDF2
    - pandas
    - numpy
    - Tesseract OCR installed and available in your PATH

Install the Python packages with:
    pip install camelot-py[cv] pdf2image layoutparser pytesseract opencv-python PyPDF2 pandas numpy

Make sure to install Tesseract OCR (see https://github.com/tesseract-ocr/tesseract) and configure LayoutParser’s Detectron2 environment.

Usage:
    The input PDF path and the output directory are set inside main(); adjust them there, then run:
    python pdf_table_extractor.py
"""


def is_digital_pdf(pdf_path, threshold=100):
    """
    Decide whether a PDF carries an embedded (extractable) text layer.

    Text is pulled with PyPDF2 from at most the first three pages, and the
    whitespace-stripped character count is compared against `threshold`.

    Returns True when more than `threshold` characters were extracted (digital
    PDF). Returns False otherwise -- including when the file cannot be opened
    or parsed, in which case the error is printed and the PDF is treated as
    scanned.
    """
    last_sampled_index = 2  # pages 0, 1 and 2 are sampled
    chunks = []
    try:
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            for page_index, pdf_page in enumerate(pdf_reader.pages):
                extracted = pdf_page.extract_text()
                if extracted:
                    chunks.append(extracted)
                if page_index >= last_sampled_index:
                    break
    except Exception as err:
        print(f"Error reading PDF for digital detection: {err}")
        return False

    sampled_text = "".join(chunks)
    return len(sampled_text.strip()) > threshold

def extract_tables_digital(pdf_path, pages="1-end"):
    """
    Run Camelot's "stream" parser over a digital PDF.

    `pages` is forwarded to Camelot unchanged (the default covers the whole
    document). On success the Camelot result (a collection of Table objects)
    is returned; if anything goes wrong the error is printed and an empty
    list is returned instead.
    """
    print("Extracting tables using Camelot...")
    try:
        found = camelot.read_pdf(pdf_path, flavor="stream", pages=pages)
        table_count = len(found)
        print(f"Found {table_count} table(s).")
    except Exception as err:
        print(f"Error extracting tables with Camelot: {err}")
        return []
    return found

def advanced_table_structure_parser(image, tesseract_config=r'--oem 3 --psm 6', row_threshold=10, col_threshold=10):
    """
    Rebuild a table grid from Tesseract's word-level TSV output.

    Recognized words are clustered into rows by their 'top' coordinate and
    into columns by their 'left' coordinate; every word is then written into
    the cell of its row and its nearest column anchor.

    Parameters:
        image              : Preprocessed image (as a NumPy array) of the table region.
        tesseract_config   : Configuration string for Tesseract OCR.
        row_threshold      : A word joins the current row while its 'top' is within
                             this many pixels of the row's first word.
        col_threshold      : Consecutive sorted 'left' values closer than this many
                             pixels are merged into one column anchor.

    Returns:
        A Pandas DataFrame with one row per detected text line and one column per
        column anchor (empty cells are ''), or None if no text is found.
    """
    # Force the 'text' column to be read as plain strings. With pandas' default
    # inference an all-numeric table yields a float column (breaking the .str
    # accessor and the string concatenation below), and genuine cell values
    # such as "NA", "N/A" or "null" are parsed as missing and silently dropped.
    ocr_data = pytesseract.image_to_data(
        image,
        config=tesseract_config,
        output_type=pytesseract.Output.DATAFRAME,
        pandas_config={'dtype': {'text': str}, 'keep_default_na': False},
    )

    # Drop rows with missing or empty text (structural, non-word TSV rows).
    ocr_data = ocr_data.dropna(subset=['text'])
    ocr_data = ocr_data[ocr_data['text'].str.strip() != '']
    if ocr_data.empty:
        return None

    # Sort by the 'top' coordinate to group words vertically. A stable sort keeps
    # Tesseract's reading order for words sharing the same 'top'.
    ocr_data = ocr_data.sort_values(by='top', kind='stable').reset_index(drop=True)

    # Cluster words into rows: a new row starts when a word lies more than
    # row_threshold pixels below the first word of the current row.
    rows = []
    current_row = []
    current_top = None
    for _, word in ocr_data.iterrows():
        if current_top is not None and abs(word['top'] - current_top) > row_threshold:
            rows.append(pd.DataFrame(current_row))
            current_row = []
            current_top = None
        if current_top is None:
            current_top = word['top']
        current_row.append(word)
    if current_row:
        rows.append(pd.DataFrame(current_row))

    # Derive column anchors from all 'left' coordinates: runs of sorted values
    # whose neighbours differ by at most col_threshold collapse to their mean.
    all_lefts = sorted(ocr_data['left'].tolist())
    col_boundaries = []
    current_group = [all_lefts[0]]
    for x in all_lefts[1:]:
        if abs(x - current_group[-1]) <= col_threshold:
            current_group.append(x)
        else:
            col_boundaries.append(np.mean(current_group))
            current_group = [x]
    col_boundaries.append(np.mean(current_group))
    col_boundaries = sorted(col_boundaries)

    # For each row, assign words (left to right) to the nearest column anchor;
    # several words landing in the same cell are joined with single spaces.
    table_data = []
    for row_df in rows:
        row_df = row_df.sort_values(by='left', kind='stable')
        cell_words = [[] for _ in col_boundaries]
        for _, word in row_df.iterrows():
            distances = [abs(word['left'] - b) for b in col_boundaries]
            col_idx = int(np.argmin(distances))
            cell_words[col_idx].append(word['text'])
        table_data.append([' '.join(words) for words in cell_words])

    return pd.DataFrame(table_data)

def extract_tables_scanned(pdf_path, dpi=300, tesseract_config=r'--oem 3 --psm 6'):
    """
    Extract tables from a scanned PDF using LayoutParser for table detection and an advanced
    table-structure parser for OCR reconstruction.

    Parameters:
        pdf_path           : Path of the PDF to process.
        dpi                : Resolution used when rasterizing the pages.
        tesseract_config   : Configuration string forwarded to the table-structure parser.

    Returns a list of Pandas DataFrame objects representing the detected tables.
    An empty list is returned (after printing the error) if the PDF cannot be
    converted to images.
    """
    print("Converting PDF pages to images...")
    try:
        images = convert_from_path(pdf_path, dpi=dpi)
    except Exception as e:
        print(f"Error converting PDF to images: {e}")
        return []

    # Initialize LayoutParser's table detection model (PubLayNet pre-trained model).
    # Only class id 3 is given a name, so only tables match the type filter below.
    model = lp.Detectron2LayoutModel(
       config_path='lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
       model_path='lp://PubLayNet/faster_rcnn_R_50_FPN_3x/model',
       extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5],
       label_map={3: "Table"}
    )

    extracted_tables = []
    for page_num, image in enumerate(images, start=1):
        print(f"Processing page {page_num}...")
        image_np = np.array(image)
        page_height, page_width = image_np.shape[:2]
        layout = model.detect(image_np)
        # Filter for detected table regions.
        table_blocks = [block for block in layout if block.type == 'Table']
        print(f"Detected {len(table_blocks)} table region(s) on page {page_num}.")

        for idx, block in enumerate(table_blocks, start=1):
            x1, y1, x2, y2 = map(int, block.coordinates)
            # Clamp the box to the page: a negative bound would be interpreted by
            # NumPy as "count from the end" and produce a wrong or empty crop.
            x1, y1 = max(x1, 0), max(y1, 0)
            x2, y2 = min(x2, page_width), min(y2, page_height)
            if x2 <= x1 or y2 <= y1:
                # An empty crop would make cv2.cvtColor raise; skip it instead.
                print(f"Skipping empty table region {idx} on page {page_num}.")
                continue
            cropped = image_np[y1:y2, x1:x2]

            # Preprocess the cropped region for OCR: convert to grayscale and apply
            # Otsu thresholding. pdf2image yields PIL images, so the array is in
            # RGB channel order (not OpenCV's BGR).
            gray = cv2.cvtColor(cropped, cv2.COLOR_RGB2GRAY)
            _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

            # Use the advanced table-structure parser to reconstruct the table layout.
            table_df = advanced_table_structure_parser(thresh, tesseract_config=tesseract_config)
            if table_df is not None:
                extracted_tables.append(table_df)
                print(f"Extracted advanced table {idx} on page {page_num} with shape {table_df.shape}.")
            else:
                print(f"Advanced parser failed to extract table region {idx} on page {page_num}.")
    return extracted_tables

def save_tables(tables, output_dir, prefix="table"):
    """
    Save the extracted tables as CSV files in the specified output directory.

    Parameters:
        tables     : Iterable of Pandas DataFrames and/or objects exposing a `df`
                     attribute (e.g. Camelot Table objects). Anything else is skipped.
        output_dir : Directory to write into; created (with parents) if missing.
        prefix     : File name prefix; files are named "<prefix>_<n>.csv".

    The number <n> is the 1-based position of the table in `tables`, so skipped
    entries leave gaps in the numbering. CSVs are written without the index column.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(output_dir, exist_ok=True)

    for idx, table in enumerate(tables, start=1):
        # Test for a DataFrame first: a DataFrame that has a column named "df"
        # also satisfies hasattr(table, 'df'), which would select that single
        # column instead of the whole table.
        if isinstance(table, pd.DataFrame):
            df = table
        elif hasattr(table, 'df'):
            # For Camelot table objects, use the 'df' attribute.
            df = table.df
        else:
            continue
        output_path = os.path.join(output_dir, f"{prefix}_{idx}.csv")
        df.to_csv(output_path, index=False)
        print(f"Saved table {idx} to {output_path}")

def main(pdf_path="input/mml-book.pdf", output_dir="output", dpi=300):
    """
    Run the full extraction pipeline on one PDF and write the tables as CSV files.

    Parameters:
        pdf_path   : PDF to process.
        output_dir : Directory that receives the CSV files.
        dpi        : Rasterization resolution, used only for scanned PDFs.

    The defaults reproduce the previously hard-coded values, so calling main()
    without arguments behaves as before. Exits the process with status 1 if
    `pdf_path` does not exist.
    """
    if not os.path.exists(pdf_path):
        print(f"PDF file {pdf_path} does not exist.")
        sys.exit(1)

    # Choose the extraction strategy from whether the PDF has a text layer.
    if is_digital_pdf(pdf_path):
        print("Digital PDF detected. Using Camelot for table extraction.")
        tables = extract_tables_digital(pdf_path)
    else:
        print("Scanned PDF detected. Using LayoutParser and advanced table structure parser for table extraction.")
        tables = extract_tables_scanned(pdf_path, dpi=dpi)

    if not tables:
        print("No tables were extracted.")
    else:
        save_tables(tables, output_dir)
        print("Table extraction completed.")

if __name__ == "__main__":
    main()

In [5]:
result = pd.read_csv('output/table_8.csv')

In [6]:
result.head()  

Unnamed: 0,0,1
0,4,Foreword
1,,Acknowledgments
2,We are grateful to many people who looked at e...,
3,suffered through painful expositions of concep...,
4,their ideas that we did not vehemently disagre...,


In [None]:
from table-transformer-main