In [97]:
import fitz  # PyMuPDF
#import pytesseract
import cv2
import numpy as np
import os
import google.generativeai as genai
import glob

In [98]:
# Directly configure the API key here
API_KEY = "AIzaSyAOVRcmchMjVpUxZqH21bPvmOuhxePW3Ps"  # Replace with your Gemini API key

# Configure the Gemini API with the provided API key
genai.configure(api_key=API_KEY)

In [99]:
def extract_images_from_pdf(pdf_path, output_folder):
    """Extract embedded images and render pages with optimized bounding boxes."""
    doc = fitz.open(pdf_path)

    # Create separate folders for different types of images
    embedded_images_folder = os.path.join(output_folder, "embedded_images")
    rendered_images_folder = os.path.join(output_folder, "rendered_images")
    region_images_folder = os.path.join(output_folder, "region_images")

    os.makedirs(embedded_images_folder, exist_ok=True)
    os.makedirs(rendered_images_folder, exist_ok=True)
    os.makedirs(region_images_folder, exist_ok=True)

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        print(f"Found {len(image_list)} embedded images on page {page_num + 1}")

        # Extract and save embedded images
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            img_extension = base_image["ext"]
            img_path = os.path.join(embedded_images_folder, f"page_{page_num + 1}_img_{img_index + 1}.{img_extension}")

            with open(img_path, "wb") as f:
                f.write(image_bytes)
            print(f"Saved embedded image: {img_path}")

        # Render page and detect regions
        page_image_path = os.path.join(rendered_images_folder, f"page_{page_num + 1}_rendered.png")
        render_page_as_image(page, page_image_path)

        detect_and_save_regions(page_image_path, region_images_folder, page_num)

    doc.close()

In [100]:
def render_page_as_image(page, output_path, zoom=2):
    """Renders a PDF page to a PNG image."""
    mat = fitz.Matrix(zoom, zoom)  # Higher zoom for better resolution
    pix = page.get_pixmap(matrix=mat)
    pix.save(output_path)
    print(f"Rendered page saved: {output_path}")

In [101]:
def render_page_as_image(page, output_path, zoom=2):
    """Renders a PDF page to a PNG image."""
    mat = fitz.Matrix(zoom, zoom)  # Higher zoom for better resolution
    pix = page.get_pixmap(matrix=mat)
    pix.save(output_path)
    print(f"Rendered page saved: {output_path}")

def detect_and_save_regions(image_path, output_folder, page_num):
    """Detects tables/charts, saves them as separate images, and annotates the page."""
    img = cv2.imread(image_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Adaptive thresholding to highlight tables/charts
    thresh = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2
    )

    # Use dilation to connect close components and remove noise
    kernel = np.ones((10, 10), np.uint8)
    dilated = cv2.dilate(thresh, kernel, iterations=2)

    # Find contours
    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Prepare the annotated image
    annotated_img = img.copy()

    for idx, contour in enumerate(contours):
        x, y, w, h = cv2.boundingRect(contour)

        # Adjust the size of the bounding box to capture nearby text
        padding = 10  # Increase bounding box size slightly
        x, y = max(x - padding, 0), max(y - padding, 0)
        w, h = min(w + 2 * padding, img.shape[1] - x), min(h + 2 * padding, img.shape[0] - y)

        # Filter out small regions that are likely noise
        if w > 100 and h > 100:  # Adjust size thresholds as needed
            # Draw red bounding box
            cv2.rectangle(annotated_img, (x, y), (x + w, y + h), (0, 0, 255), 2)

            # Save each detected region as a separate image
            detected_region = img[y:y+h, x:x+w]
            region_path = os.path.join(output_folder, f"page_{page_num + 1}_region_{idx + 1}.png")
            cv2.imwrite(region_path, detected_region)
            print(f"Saved detected region: {region_path}")

    # Save the annotated image with all bounding boxes
    annotated_images_folder = os.path.join(output_folder, "annotated_images")
    os.makedirs(annotated_images_folder, exist_ok=True)
    annotated_path = os.path.join(annotated_images_folder, f"page_{page_num + 1}_annotated.png")
    cv2.imwrite(annotated_path, annotated_img)
    print(f"Annotated image saved: {annotated_path}")

In [102]:
def analyze_chart_with_gemini(image_path, api_key="your_actual_api_key"):
    """
    Analyzes a chart image using the Gemini model.
    
    Args:
        image_path (str): The path to the image file to be analyzed.
        api_key (str): Your Gemini API key (default is a placeholder).
        
    Returns:
        str: The description and data points from the chart.
    """
    # Configure the Gemini API with the provided API key
    genai.configure(api_key=api_key)

    def upload_to_gemini(path, mime_type="image/png"):
        """Uploads the specified file to the Gemini model.
        
        Args:
            path (str): The path to the file to be uploaded.
            mime_type (str): The MIME type of the file. Default is 'image/png'.
            
        Returns:
            file: The uploaded file object.
        """
        file = genai.upload_file(path, mime_type=mime_type)
        print(f"Uploaded file '{file.display_name}' as: {file.uri}")
        return file

    # Configuration for the generative model
    generation_config = {
        "temperature": 1,
        "top_p": 0.95,
        "top_k": 64,
        "max_output_tokens": 8192,
        "response_mime_type": "text/plain",
    }

    # Create the generative model instance
    model = genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        generation_config=generation_config,
    )

    # Upload the image file to Gemini
    image_file = upload_to_gemini(image_path, mime_type="image/png")

    # Start a chat session with the model
    chat_session = model.start_chat(
        history=[
            {
                "role": "user",
                "parts": [
                    image_file,  # The uploaded image file
                ],
            },
        ]
    )

    # Send a message to the model for analysis
    response = chat_session.send_message("Describe what the chart is about in one line and provide all the data points.")
    
    return response.text

def process_images_in_folder(folder_path, api_key="your_actual_api_key"):
    """
    Processes all images in the specified folder using the Gemini model.
    
    Args:
        folder_path (str): The path to the folder containing images.
        api_key (str): Your Gemini API key (default is a placeholder).
        
    Returns:
        None
    """
    # Ensure the output folder exists
    if not os.path.exists(folder_path):
        print(f"The folder {folder_path} does not exist.")
        return

    # Loop through all files in the specified folder
    for filename in os.listdir(folder_path):
        # Check if the file is an image (PNG format)
        if filename.lower().endswith('.png'):
            image_path = os.path.join(folder_path, filename)
            print(f"Processing image: {image_path}")
            result = analyze_chart_with_gemini(image_path, api_key=api_key)
            print(f"Result for {filename}: {result}\n")
            with open("gemini_response.txt", "a") as file:
                file.write(f"Result for {filename}: {result}\n")


In [103]:
if __name__ == "__main__":
    pdf_path = "test5.pdf"  # Replace with your PDF path
    output_folder = "extracted_images"

    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Extract and annotate images from the PDF
    extract_images_from_pdf(pdf_path, output_folder)
    # Specify the folder containing the extracted images
    output_folder2 = os.path.join(output_folder, "region_images")

    # Process all images in the region_images folder
    with open("key.txt", "r") as key_file:
        api_key = key_file.read().strip()
    process_images_in_folder(output_folder2, api_key=api_key)

Found 1 embedded images on page 1
Saved embedded image: extracted_images/embedded_images/page_1_img_1.png
Rendered page saved: extracted_images/rendered_images/page_1_rendered.png
Saved detected region: extracted_images/region_images/page_1_region_2.png
Saved detected region: extracted_images/region_images/page_1_region_7.png
Saved detected region: extracted_images/region_images/page_1_region_11.png
Annotated image saved: extracted_images/region_images/annotated_images/page_1_annotated.png
Processing image: extracted_images/region_images/page_1_region_2.png
Uploaded file 'page_1_region_2.png' as: https://generativelanguage.googleapis.com/v1beta/files/gswvz5y6uo2j
Result for page_1_region_2.png: The chart compares the budget and expenditure for different spending categories, with data points as follows: Auto - Budget: 200, Expenditure: 200; Entertainment - Budget: 2000, Expenditure: 2000; Food - Budget: 4000, Expenditure: 1000; Home - Budget: 18000, Expenditure: 18000; Medical - Budget: