In [1]:
import fitz  # PyMuPDF
import pytesseract
import camelot
import tabula
import cv2
import os

In [2]:
# Set up Tesseract executable path if needed
# pytesseract.pytesseract.tesseract_cmd = r'<path_to_your_tesseract_executable>'

# Helper function to save images from PDF
def extract_images_from_pdf(pdf_path, output_dir=None):
    """Write every image embedded in a PDF to disk and return the file paths.

    Args:
        pdf_path: Path of the PDF file, opened with PyMuPDF (``fitz``).
        output_dir: Optional directory to write the image files into; it is
            created when missing. With the default ``None`` the files go to
            the current working directory and the returned entries are bare
            file names.

    Returns:
        list[str]: Paths of the written files, in page order and then image
        order, named ``page_<page>_img_<index>.<ext>`` (both 1-based).
    """
    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)

    images = []
    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            image_list = page.get_images(full=True)

            for img_index, img in enumerate(image_list, start=1):
                xref = img[0]  # first field of each image record is the xref
                base_image = doc.extract_image(xref)
                if not base_image:
                    # No image data came back for this xref; skip it rather
                    # than fail on the key lookups below.
                    continue

                image_filename = (
                    f"page_{page_num+1}_img_{img_index}.{base_image['ext']}"
                )
                if output_dir is not None:
                    image_filename = os.path.join(output_dir, image_filename)

                with open(image_filename, "wb") as img_file:
                    img_file.write(base_image["image"])
                images.append(image_filename)
    finally:
        # Release the document handle even if extraction or writing fails.
        doc.close()

    return images

In [3]:
# Extract tables from PDF using Camelot (the Tabula path is currently commented out)
def extract_tables_from_pdf(pdf_path):
    """Return the tables Camelot finds on all pages of the PDF as a list.

    Camelot is called with ``pages='all'`` and ``flavor='stream'``. The
    result is copied into a plain list; an empty result gives ``[]``.
    Tabula extraction is not performed here.
    """
    camelot_result = camelot.read_pdf(pdf_path, pages='all', flavor='stream')

    # Only Camelot contributes tables, so the combined list is just a copy of
    # its result (or an empty list when nothing was detected).
    return list(camelot_result) if camelot_result else []


In [4]:
# OCR for chart images
def extract_text_from_image(image_path):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image file to load with OpenCV.

    Returns:
        The string produced by ``pytesseract.image_to_string`` for the
        grayscale version of the image.

    Raises:
        ValueError: If OpenCV could not load ``image_path``.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising;
        # without this check cvtColor fails with an opaque assertion error.
        raise ValueError(f"Could not read image file: {image_path!r}")

    # Convert to grayscale before handing the image to Tesseract.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return pytesseract.image_to_string(gray)

In [5]:
import pandas as pd
# Main function to process PDF
def process_pdf(pdf_path):
    """Print a preview of each extracted table, then the OCR text of each image.

    The PDF is first passed to ``extract_tables_from_pdf`` and the head of
    every table is printed. It is then passed to ``extract_images_from_pdf``
    and each saved image is run through ``extract_text_from_image``.
    Nothing is returned; all results go to stdout.
    """
    # Stage 1: tables.
    extracted_tables = extract_tables_from_pdf(pdf_path)
    print("Extracted Tables:")
    for entry in extracted_tables:
        # Use the entry's ``df`` attribute when it has one; otherwise the
        # entry itself is previewed.
        frame = getattr(entry, 'df', entry)
        print(frame.head())

    # Stage 2: embedded images (charts), each followed by its OCR text.
    image_files = extract_images_from_pdf(pdf_path)
    print("\nExtracted Images (for charts):")
    for image_file in image_files:
        print(f"Processing {image_file}...")
        ocr_text = extract_text_from_image(image_file)
        print(f"Extracted data from chart image {image_file}:\n{ocr_text}")

In [6]:
# Example usage: run the whole pipeline on a single PDF. process_pdf prints
# its results instead of returning them, so this cell's output is the report.
pdf_path = 'test5.pdf'  # Replace with your PDF file (relative to the working directory)
process_pdf(pdf_path)

Extracted Tables:
         0 1        2       3 4
0             Table 1          
1  Country     Amount          
2    India             150123  
3    China             123494  
4   Russia              43948  

Extracted Images (for charts):
Processing page_1_img_1.png...
Extracted data from chart image page_1_img_1.png:
Amount (%)

17500

15000

12500

10000

7500

5000

2500

Budget vs Expenditure by Category

Type

Mmmm Budget
@@m_ Expenditure

Category


