In [2]:
# Import necessary libraries
import fitz  # PyMuPDF
import pandas as pd
import pytesseract
import json
import os
import numpy as np
import cv2
from PIL import Image
import re
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows
import uuid
from io import BytesIO
import warnings
warnings.filterwarnings('ignore')


In [3]:
def extract_metadata(pdf_path):
    """Return the document-level metadata of a PDF file.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The ``metadata`` attribute of the opened document.
    """
    # The context manager closes the document even if reading metadata raises.
    with fitz.open(pdf_path) as doc:
        return doc.metadata

In [4]:
def extract_text_by_page(pdf_path, output_path="extracted_text.txt"):
    """Extract the text layer of every page and write it to a text file.

    Each page's text is prefixed with a ``--- Page N ---`` banner (1-based) and
    the per-page chunks are joined with a newline.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Destination of the UTF-8 text file.

    Returns:
        The full extracted text, identical to what was written to
        ``output_path``.
    """
    # Close the document as soon as the pages are read, so that a failure while
    # writing the output file cannot leave it open.
    with fitz.open(pdf_path) as doc:
        page_chunks = [
            f"--- Page {page_number} ---\n{page.get_text('text')}\n"
            for page_number, page in enumerate(doc, start=1)
        ]

    full_text = "\n".join(page_chunks)

    # Save to file
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(full_text)

    return full_text

In [5]:
def has_content(img_array):
    """Decide whether an image array is more than a (nearly) uniform area.

    The image is reduced to a single channel and the standard deviation of its
    pixel values is compared with a fixed threshold of 10.

    Args:
        img_array: NumPy array holding the image; either 2-D (already single
            channel) or 3-D with the channel axis last.

    Returns:
        ``True`` when the standard deviation is above 10, otherwise ``False``.
        An empty array has no content and yields ``False``.
    """
    if img_array.size == 0:
        return False

    if img_array.ndim == 3:
        channels = img_array.shape[2]
        if channels in (1, 2):
            # Single channel, optionally followed by a second one (presumably
            # alpha, as in PIL's "LA" mode -- TODO confirm): the first channel
            # already is the grayscale plane and cvtColor would reject it.
            gray = img_array[:, :, 0]
        elif channels == 4:
            # Four-channel input (e.g. RGBA) needs the 4-channel conversion
            # code; COLOR_BGR2GRAY raises on it.
            gray = cv2.cvtColor(img_array, cv2.COLOR_BGRA2GRAY)
        else:
            # NOTE(review): the callers in this notebook pass RGB arrays from
            # PIL, so the BGR code swaps the red/blue weights. Kept as-is so
            # existing three-channel results do not shift.
            gray = cv2.cvtColor(img_array, cv2.COLOR_BGR2GRAY)
    else:
        gray = img_array

    # Threshold for meaningful content; bool() turns numpy.bool_ into a plain bool.
    return bool(np.std(gray) > 10)

In [6]:
def extract_images_and_diagrams(pdf_path, output_folder="extracted_images"):
    """Save embedded images and diagram-like page regions of a PDF to disk.

    For every page two passes are made:

    1. Each embedded image reported by ``page.get_images`` is decoded, checked
       with ``has_content`` and, if it passes, written unchanged (original
       bytes and extension) as ``image_p<page>_<index>.<ext>``.
    2. The page is rendered at 2x zoom, thresholded, and every external contour
       with an area of at least 10000 px that does not span more than 90% of
       the page in either dimension is cropped and saved as
       ``diagram_p<page>_<contour>.png`` when ``has_content`` accepts it.

    Args:
        pdf_path: Path of the PDF file to read.
        output_folder: Directory that receives the files; created if missing.

    Returns:
        List of the paths of all files that were written, in creation order.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    image_paths = []

    # The context manager closes the document even if a page fails to render.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            print(f"Processing page {page_num+1} for images...")

            # Get images from page
            image_list = page.get_images(full=True)

            # Extract each image
            for img_index, img_info in enumerate(image_list):
                try:
                    xref = img_info[0]
                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]

                    # Convert bytes to numpy array for analysis
                    img = np.array(Image.open(BytesIO(image_bytes)))

                    # Skip if image doesn't have meaningful content
                    if not has_content(img):
                        continue

                    # Save the image; os.path.join keeps the separator
                    # consistent with output_folder on every platform.
                    image_filename = os.path.join(
                        output_folder, f"image_p{page_num+1}_{img_index+1}.{image_ext}"
                    )
                    with open(image_filename, "wb") as img_file:
                        img_file.write(image_bytes)
                    image_paths.append(image_filename)
                    print(f"  - Saved embedded image: {image_filename}")
                except Exception as e:
                    # Best effort: one undecodable image must not abort the page.
                    print(f"  - Error extracting image {img_index} on page {page_num+1}: {e}")

            # For diagrams and other objects not detected as standard images
            print(f"  Looking for diagrams on page {page_num+1}...")
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # Render at 2x zoom
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            img_array = np.array(img)

            # Use contour detection to find potential diagrams: everything
            # darker than 230 after blurring becomes foreground.
            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
            blurred = cv2.GaussianBlur(gray, (5, 5), 0)
            thresh = cv2.threshold(blurred, 230, 255, cv2.THRESH_BINARY_INV)[1]

            contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            print(f"  - Found {len(contours)} potential diagram contours")

            # Filter contours based on size and shape
            diagram_count = 0
            for i, contour in enumerate(contours):
                if cv2.contourArea(contour) < 10000:  # Skip small contours
                    continue

                # Get bounding rectangle
                x, y, w, h = cv2.boundingRect(contour)

                # Skip if rectangle is too large (likely the whole page)
                if w > pix.width * 0.9 or h > pix.height * 0.9:
                    continue

                # Extract potential diagram
                diagram = img_array[y:y+h, x:x+w]

                # Skip if low content variation (likely a solid area)
                if not has_content(diagram):
                    continue

                # Save diagram; the file name carries the contour index, so
                # numbers may have gaps where contours were filtered out.
                diagram_filename = os.path.join(
                    output_folder, f"diagram_p{page_num+1}_{i+1}.png"
                )
                Image.fromarray(diagram).save(diagram_filename)
                image_paths.append(diagram_filename)
                diagram_count += 1

            print(f"  - Saved {diagram_count} diagrams from page {page_num+1}")

        print(f"Total images and diagrams extracted: {len(image_paths)}")

    return image_paths

In [7]:
def is_table_like(text_block):
    """Heuristically decide whether a block of text is laid out like a table.

    A block qualifies when it has at least three lines and either
    (a) more than 70% of its lines contain a column separator (a run of three
    or more whitespace characters, a tab, or a pipe), or
    (b) at least two word-start offsets occur in more than half of the lines.
    """
    rows = text_block.strip().split('\n')
    row_total = len(rows)
    if row_total < 3:  # Too short to be a table
        return False

    # Separator test: whitespace runs, tabs, or pipes
    separators = (re.compile(r'\s{3,}'), re.compile(r'\t+'), re.compile(r'\|'))
    delimited = 0
    for row in rows:
        if any(sep.search(row) for sep in separators):
            delimited += 1
    if delimited / row_total > 0.7:
        return True

    # Alignment test: tally how often each word-start offset occurs
    start_counts = {}
    for row in rows:
        for hit in re.finditer(r'\b\w+', row):
            offset = hit.start()
            start_counts[offset] = start_counts.get(offset, 0) + 1

    # An offset counts as a column when it appears in more than half the lines
    majority = row_total * 0.5
    aligned_columns = 0
    for occurrences in start_counts.values():
        if occurrences > majority:
            aligned_columns += 1

    return aligned_columns >= 2  # Need at least 2 aligned columns

def preprocess_table_text(text):
    """Normalise column separators in table-like text to a single ``|``.

    Runs of three or more horizontal whitespace characters and runs of tabs
    become ``|``; horizontal whitespace directly around a ``|`` is removed.
    Line breaks are never consumed, so the row structure of ``text`` is kept.

    Args:
        text: The raw text block, one table row per line.

    Returns:
        The text with normalised separators and the same number of lines.
    """
    # [^\S\r\n] is "whitespace except line breaks". Plain \s would also match
    # the newline and glue neighbouring rows together (e.g. a row ending in
    # " |" followed by a row starting with "| ").
    # Replace multiple spaces with a single delimiter for parsing
    text = re.sub(r'[^\S\r\n]{3,}', '|', text)
    # Replace tabs with delimiter
    text = re.sub(r'\t+', '|', text)
    # Clean up extra spaces around delimiters
    text = re.sub(r'[^\S\r\n]*\|[^\S\r\n]*', '|', text)
    return text

In [8]:
def _cluster_word_starts(lines):
    """Collapse the word-start offsets of all ``lines`` into column starts."""
    starts = sorted(
        {match.start() for line in lines for match in re.finditer(r'\b\w+', line)}
    )
    columns = []
    for pos in starts:
        # Offsets within 2 characters of the current column belong to it; the
        # column keeps its leftmost offset because offsets arrive in order.
        if not columns or pos - columns[-1] > 2:
            columns.append(pos)
    return columns


def parse_text_to_table(text):
    """Parse table-like text into a pandas DataFrame.

    Lines containing ``|`` are split on it. Other lines are cut at column
    positions derived from the word-start offsets of *all* lines. Short rows
    are padded with empty strings. The first row becomes the header when fewer
    than 30% of its non-empty cells look numeric while more than 30% of the
    remaining non-empty cells do.

    Args:
        text: Table text, one row per line; blank lines are ignored.

    Returns:
        A DataFrame of strings with a fresh 0-based index. Columns are either
        the promoted header cells or the default integer labels.
    """
    def looks_numeric(cell):
        # Digits with at most one decimal point, e.g. "12" or "3.5".
        stripped = cell.strip()
        return bool(stripped) and stripped.replace('.', '', 1).isdigit()

    lines = text.strip().split('\n')
    rows = []
    column_starts = None  # Computed lazily, once, for space-aligned lines

    # Convert text lines to list of values
    for line in lines:
        # Skip empty lines
        if not line.strip():
            continue

        if '|' in line:
            # Pipe-delimited
            row = [cell.strip() for cell in line.split('|')]
        else:
            # Space-aligned table: the column positions depend only on the
            # whole block, so they are derived once instead of per line.
            if column_starts is None:
                column_starts = _cluster_word_starts(lines)

            row = []
            for idx, start in enumerate(column_starts):
                end = column_starts[idx + 1] if idx + 1 < len(column_starts) else len(line)
                if start < len(line):
                    row.append(line[start:end].strip())

        rows.append(row)

    # Ensure all rows have the same number of columns
    max_cols = max((len(row) for row in rows), default=0)
    for row in rows:
        row.extend([""] * (max_cols - len(row)))

    # Convert to DataFrame
    df = pd.DataFrame(rows)

    # Promote the first row to a header when it is textual and the body is not
    if len(rows) > 1:
        header_cells = [cell for cell in rows[0] if cell.strip()]
        body_cells = [cell for row in rows[1:] for cell in row if cell.strip()]

        header_ratio = sum(map(looks_numeric, header_cells)) / max(1, len(header_cells))
        body_ratio = sum(map(looks_numeric, body_cells)) / max(1, len(body_cells))

        if header_ratio < 0.3 and body_ratio > 0.3:
            # Assign a plain list: assigning the row Series would leave its
            # label (0) behind as the name of the columns index.
            df.columns = list(rows[0])
            df = df.iloc[1:]

    # Reset index
    df = df.reset_index(drop=True)
    return df

In [9]:
def extract_tables_from_text(text):
    """Find table-like blocks in text and parse each into a DataFrame.

    The text is split into blocks of consecutive non-blank lines. A block that
    ``is_table_like`` accepts is normalised with ``preprocess_table_text`` and
    parsed with ``parse_text_to_table``; results smaller than 2x2 are dropped.

    Args:
        text: The text to scan.

    Returns:
        A list of dicts with the keys ``'table'`` (the DataFrame) and
        ``'page_range'``. Despite its name, ``'page_range'`` holds
        ``"<start>-<end>"`` as 0-based *line* indices of the block within
        ``text``, with ``end`` exclusive.
    """
    tables = []
    lines = text.split('\n')
    total = len(lines)

    i = 0
    while i < total:
        # Skip empty lines
        while i < total and not lines[i].strip():
            i += 1

        # Record the start only after the blank lines were skipped, so the
        # reported range begins at the block's first real line.
        table_start = i

        table_text = []
        while i < total and lines[i].strip():
            table_text.append(lines[i])
            i += 1

        if table_text:
            text_block = '\n'.join(table_text)
            if is_table_like(text_block):
                processed_text = preprocess_table_text(text_block)
                table_df = parse_text_to_table(processed_text)

                # Only add if it looks like a real table (at least 2x2)
                if table_df.shape[0] >= 2 and table_df.shape[1] >= 2:
                    tables.append({
                        'page_range': f"{table_start}-{i}",
                        'table': table_df
                    })

        i += 1  # Step over the blank line that ended the block

    return tables

In [10]:
def extract_tables_with_ocr(pdf_path):
    """Collect tables from every PDF page, from the text layer and from OCR.

    For each page the text layer is scanned with ``extract_tables_from_text``.
    The page is then rendered at 2x zoom; if the probabilistic Hough transform
    finds more than 10 line segments in its edge image, the rendering is OCR-ed
    with pytesseract and the OCR text is scanned for tables as well.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of table dicts as produced by ``extract_tables_from_text``, each
        extended with ``'page'`` (1-based page number) and ``'source'``
        (``'text'`` or ``'ocr'``). The same table may appear once per source.
    """
    tables = []

    # The context manager closes the document even if rendering or OCR raises.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            print(f"Processing page {page_num+1} for tables...")

            # Get text directly from PDF
            text = page.get_text("text")

            # Try to extract tables from regular text
            text_tables = extract_tables_from_text(text)
            for table_info in text_tables:
                table_info['page'] = page_num + 1
                # Tag the origin so both kinds of entries share the same keys.
                table_info['source'] = 'text'
                tables.append(table_info)

            print(f"  - Found {len(text_tables)} tables from text on page {page_num+1}")

            # Use OCR for additional table detection
            print(f"  - Using OCR to detect additional tables...")
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # Render at 2x zoom
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            img_array = np.array(img)

            # Use OpenCV to detect table-like structures
            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
            edges = cv2.Canny(gray, 50, 150, apertureSize=3)

            # Detect lines that might form table structure
            lines = cv2.HoughLinesP(edges, 1, np.pi/180, 100, minLineLength=100, maxLineGap=10)

            if lines is not None and len(lines) > 10:  # Enough lines might indicate a table
                # Use OCR on the page to get potential table content
                print(f"  - Found {len(lines)} lines, running OCR for tables...")
                ocr_text = pytesseract.image_to_string(img)
                ocr_tables = extract_tables_from_text(ocr_text)

                for table_info in ocr_tables:
                    table_info['page'] = page_num + 1
                    table_info['source'] = 'ocr'
                    tables.append(table_info)

                print(f"  - Found {len(ocr_tables)} additional tables with OCR")
            else:
                print(f"  - No significant table structure detected with OpenCV")

        print(f"Total tables extracted: {len(tables)}")

    return tables

In [19]:
import openpyxl
import re

def clean_excel_value(value):
    """Strip control characters that cannot be stored in an Excel cell.

    Only strings are altered: ASCII control characters other than tab, line
    feed and carriage return are removed. Any other value is returned as is.
    """
    if not isinstance(value, str):
        return value
    return re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]', "", value)



def save_tables_to_excel(tables, output_file="extracted_tables.xlsx"):
    """Write every table to its own worksheet of one Excel workbook.

    Sheets are named ``Page<page>_Table<n>``, where ``n`` counts the tables of
    that page from 1. A name longer than 31 characters is replaced by a short
    form with a random suffix. Returns the number of tables written (0, with
    nothing written, when ``tables`` is empty).
    """
    if not tables:
        print("No tables to save.")
        return 0

    workbook = openpyxl.Workbook()
    # Drop the sheet that openpyxl creates by default
    if "Sheet" in workbook.sheetnames:
        del workbook["Sheet"]

    # Bucket the tables by page, keeping first-seen page order
    grouped = {}
    for entry in tables:
        grouped.setdefault(entry.get('page', 'unknown'), []).append(entry)

    # One worksheet per table
    for page, entries in grouped.items():
        for table_no, entry in enumerate(entries, start=1):
            title = f"Page{page}_Table{table_no}"
            if len(title) > 31:  # Excel has 31 char limit for sheet names
                title = f"P{page}_T{table_no}_{uuid.uuid4().hex[:5]}"

            sheet = workbook.create_sheet(title)

            # Header row first, then the data rows; cells are 1-based
            frame_rows = dataframe_to_rows(entry['table'], index=False, header=True)
            for row_no, cells in enumerate(frame_rows, start=1):
                for col_no, raw_value in enumerate(cells, start=1):
                    sheet.cell(row=row_no, column=col_no, value=clean_excel_value(raw_value))

    workbook.save(output_file)
    saved = len(tables)
    print(f"Saved {saved} tables to {output_file}")
    return saved

In [13]:
def apply_ocr_to_images(image_paths, output_file="ocr_from_images.txt"):
    """Run OCR over image files and collect the recognised text.

    Each image is OCR-ed with pytesseract. Non-blank results are prefixed with
    a ``--- OCR from <file name> ---`` banner and all pieces are joined by
    blank lines. Images that fail to open or to OCR are reported and skipped.

    Args:
        image_paths: Paths of the image files to process.
        output_file: UTF-8 text file that receives the combined text. It is
            only written when some text was recognised.

    Returns:
        The combined OCR text, or ``""`` when there were no images or no text.
    """
    if not image_paths:
        print("No images to process for OCR.")
        return ""

    ocr_text = []
    for img_path in image_paths:
        try:
            print(f"Applying OCR to {os.path.basename(img_path)}...")
            # The context manager releases the file handle after OCR; without
            # it every processed image stays open.
            with Image.open(img_path) as img:
                text = pytesseract.image_to_string(img)
            if text.strip():
                ocr_text.append(f"--- OCR from {os.path.basename(img_path)} ---")
                ocr_text.append(text)
                print(f"  - Extracted {len(text.split())} words")
            else:
                print(f"  - No text found")
        except Exception as e:
            # Best effort: one unreadable image must not abort the batch.
            print(f"  - Error OCR-ing image {img_path}: {e}")

    # Save OCR text
    full_ocr_text = "\n\n".join(ocr_text)
    if full_ocr_text:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(full_ocr_text)
        print(f"Saved OCR text to {output_file}")
    else:
        print("No OCR text to save")

    return full_ocr_text

In [14]:
def combine_text_sources(pdf_text, ocr_text, output_file="combined_text.txt"):
    """Join the PDF text and the OCR text under a banner and save the result.

    The merged string is written to ``output_file`` as UTF-8 and also returned.
    """
    merged = "".join([pdf_text, "\n\n--- OCR EXTRACTED TEXT ---\n\n", ocr_text])
    with open(output_file, "w", encoding="utf-8") as handle:
        handle.write(merged)
    print(f"Combined text saved to {output_file}")
    return merged

In [15]:
def process_pdf(pdf_path, output_folder="pdf_extraction_results"):
    """Run the whole extraction pipeline on one PDF.

    Steps: metadata, page text, embedded images and diagrams, OCR of those
    images, combined text file, table extraction, and Excel export. Progress
    and a final summary are printed.

    Args:
        pdf_path: Path of the PDF file to process.
        output_folder: Directory that receives all output files; created if
            missing. Images go into its ``images`` sub-folder.

    Returns:
        A dict with the keys ``'metadata'``, ``'text_path'``,
        ``'combined_text_path'``, ``'images_count'`` and ``'tables_count'``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    # Set output paths
    text_output = os.path.join(output_folder, "extracted_text.txt")
    ocr_output = os.path.join(output_folder, "ocr_text.txt")
    combined_output = os.path.join(output_folder, "combined_text.txt")
    tables_output = os.path.join(output_folder, "extracted_tables.xlsx")
    images_output_folder = os.path.join(output_folder, "images")

    # Step 1: Extract metadata
    print("\n1. Extracting metadata...")
    metadata = extract_metadata(pdf_path)
    print(f"Metadata: {metadata}")

    # Step 2: Extract text
    print("\n2. Extracting text...")
    pdf_text = extract_text_by_page(pdf_path, text_output)
    print(f"Extracted {len(pdf_text.split())} words, saved to {text_output}")

    # Step 3: Extract images and diagrams
    print("\n3. Extracting images and diagrams...")
    image_paths = extract_images_and_diagrams(pdf_path, images_output_folder)

    # Step 4: Apply OCR to images
    print("\n4. Applying OCR to extracted images...")
    images_ocr_text = apply_ocr_to_images(image_paths, ocr_output)

    # Step 5: Combine text sources (called for the file it writes; the
    # returned string is not needed here)
    print("\n5. Combining text sources...")
    combine_text_sources(pdf_text, images_ocr_text, combined_output)

    # Step 6: Extract tables
    print("\n6. Extracting tables...")
    tables = extract_tables_with_ocr(pdf_path)

    # Step 7: Save tables to Excel
    print("\n7. Saving tables to Excel...")
    table_count = save_tables_to_excel(tables, tables_output)

    # Read the page count from a document that is closed again right away,
    # instead of leaving an anonymous open handle behind.
    with fitz.open(pdf_path) as doc:
        page_count = doc.page_count

    # Print summary
    print("\n=== EXTRACTION SUMMARY ===")
    print(f"Total PDF pages: {page_count}")
    print(f"Text extraction: {len(pdf_text.split())} words")
    print(f"Images extracted: {len(image_paths)}")
    print(f"OCR text: {len(images_ocr_text.split()) if images_ocr_text else 0} words")
    print(f"Tables extracted: {table_count}")
    print(f"All results saved to: {output_folder}")

    return {
        "metadata": metadata,
        "text_path": text_output,
        "combined_text_path": combined_output,
        "images_count": len(image_paths),
        "tables_count": table_count
    }

In [20]:
# Path of the PDF to process.
# NOTE(review): this is a machine-specific absolute Windows path; point it at
# your own file before running this cell.
pdf_path = r"E:\Btech_AI\Intern\ocrpro\Phable CAM Final.pdf"  # Replace with your actual PDF path

# Run the full extraction pipeline. No output folder is passed, so process_pdf
# uses its default; the returned summary dict is kept in `results`.
results = process_pdf(pdf_path)


1. Extracting metadata...
Metadata: {'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'iOS Version 15.1 (Build 19B74) Quartz PDFContext', 'creationDate': "D:20211206105506Z00'00'", 'modDate': "D:20211206105506Z00'00'", 'trapped': '', 'encryption': None}

2. Extracting text...
Extracted 10618 words, saved to pdf_extraction_results\extracted_text.txt

3. Extracting images and diagrams...
Processing page 1 for images...
  - Saved embedded image: pdf_extraction_results\images/image_p1_1.png
  - Saved embedded image: pdf_extraction_results\images/image_p1_2.png
  Looking for diagrams on page 1...
  - Found 13 potential diagram contours
  - Saved 0 diagrams from page 1
Processing page 2 for images...
  - Saved embedded image: pdf_extraction_results\images/image_p2_1.png
  - Saved embedded image: pdf_extraction_results\images/image_p2_2.png
  - Saved embedded image: pdf_extraction_results\images/image_p2_3.jpeg
  Looking for diagrams o