# Prerequisites to check:
1. Enable these APIs:

• Vertex AI API

• Cloud Vision API


| Package                   | Install Command                                 |
| ------------------------- | ----------------------------------------------- |
| `google-cloud-vision`     | `pip install --upgrade google-cloud-vision`     |
| `google-cloud-aiplatform` | `pip install --upgrade google-cloud-aiplatform` |
| `pdf2image`               | `pip install pdf2image`                         |
| `poppler-utils` (Linux)   | `sudo apt-get install poppler-utils`            |
| `poppler` (Mac)           | `brew install poppler`                          |
| `PIL` (Pillow)            | Comes with `pdf2image`                          |


| Use case                | Input to Gemini        | When to use                                                       |
| ----------------------- | ---------------------- | ----------------------------------------------------------------- |
| Text-based extraction   | OCR text only          | When OCR text is clean, simple layout, or you only have text      |
| Layout-aware extraction | OCR text + image bytes | When visual layout/formatting is crucial for structure extraction |


# Method 1: Using Gemini 1.0 Pro

In [None]:
# Update the following things in the code below:
# Replace "your-gcp-project-id" and "us-central1" with your GCP project and region

# Replace "your-gcs-bucket-name" and "path/to/your_file.pdf" with your GCS bucket and PDF path

In [None]:
# Step 1: Install Required Libraries
!pip install --upgrade google-cloud-vision google-cloud-aiplatform pdf2image
!sudo apt-get update
!sudo apt-get install -y poppler-utils

# Step 2: Set GCP Configuration
from google.cloud import aiplatform, storage
from google.cloud import vision
from vertexai.language_models import TextGenerationModel
from pdf2image import convert_from_bytes
from PIL import Image
import io
import json

# GCP configuration for this cell.
# Replace both placeholder values with your actual GCP project ID and region.
GCP_PROJECT = "your-gcp-project-id"
GCP_REGION = "us-central1"
# Initialise the Vertex AI SDK with the project and region chosen above.
aiplatform.init(project=GCP_PROJECT, location=GCP_REGION)

# Step 3: Download PDF from GCS and convert to images (in-memory)
def pdf_to_images_from_gcs(bucket_name, blob_name, dpi=300):
    """Fetch a PDF object from GCS and render it to page images.

    Args:
        bucket_name: Name of the GCS bucket that holds the PDF.
        blob_name: Object path of the PDF inside the bucket.
        dpi: Resolution forwarded to ``convert_from_bytes``.

    Returns:
        The result of ``convert_from_bytes`` for the downloaded bytes
        (its length is printed as the number of images).
    """
    # Walk client -> bucket -> blob and pull the object's bytes into memory.
    pdf_bytes = (
        storage.Client()
        .bucket(bucket_name)
        .blob(blob_name)
        .download_as_bytes()
    )
    print(f"Downloaded PDF bytes from gs://{bucket_name}/{blob_name}")

    # Hand the raw bytes to pdf2image for rendering.
    images = convert_from_bytes(pdf_bytes, dpi=dpi)
    print(f"Converted PDF to {len(images)} images")
    return images

# Step 4: Extract OCR text from each image
def extract_ocr_from_image(image: Image.Image):
    """Run Cloud Vision document text detection on one page image.

    Args:
        image: PIL image of the page; it is PNG-encoded before being sent.

    Returns:
        The ``full_text_annotation`` of the Vision response.

    Raises:
        Exception: If the response carries a non-empty error message.
    """
    ocr_client = vision.ImageAnnotatorClient()

    # Serialise the page to PNG bytes for the request payload.
    png_buffer = io.BytesIO()
    try:
        image.save(png_buffer, format='PNG')
        png_bytes = png_buffer.getvalue()
    finally:
        png_buffer.close()

    response = ocr_client.document_text_detection(
        image=vision.Image(content=png_bytes)
    )

    # Guard clause: the error is reported on the response object, so check it.
    if not response.error.message:
        return response.full_text_annotation
    raise Exception(f"Vision API Error: {response.error.message}")

# Step 5: Flatten OCR annotation to plain text
def flatten_ocr_text(annotation):
    """Collapse an OCR annotation into newline-separated paragraph text.

    Walks pages -> blocks -> paragraphs. Each paragraph becomes one output
    line: every word is rebuilt by concatenating its symbols' ``text``, words
    are separated by single spaces, and the line is stripped of surrounding
    whitespace.

    Args:
        annotation: Object exposing ``pages[].blocks[].paragraphs[].words[]
            .symbols[].text``.

    Returns:
        One string with a line per paragraph (empty string if there are none).
    """
    all_paragraphs = (
        para
        for page in annotation.pages
        for block in page.blocks
        for para in block.paragraphs
    )
    return "\n".join(
        " ".join(
            "".join(symbol.text for symbol in word.symbols)
            for word in para.words
        ).strip()
        for para in all_paragraphs
    )

# Step 6: Call Gemini 1.0 Pro text-only model to extract structured data
def extract_entities_with_gemini_text_model(ocr_text, model_name="gemini-1.0-pro"):
    """Ask a Gemini model to turn one page of OCR text into structured JSON.

    Gemini models are served through the Vertex AI ``GenerativeModel`` class.
    ``TextGenerationModel`` is the wrapper for the PaLM text models (such as
    ``text-bison``), so it is not used to load a Gemini model ID here.

    Args:
        ocr_text: Plain text extracted from a single page.
        model_name: Vertex AI Gemini model ID. Defaults to the ID this
            notebook was written for; pass a different ID if that one is not
            available in your project/region.

    Returns:
        The model's raw text reply. The prompt asks for JSON, but the reply
        is returned unparsed, so callers must handle non-JSON text.
    """
    # Imported locally so this cell does not rely on imports from another cell.
    from vertexai.preview.generative_models import GenerativeModel

    prompt = f"""
You are given OCR-extracted text from a PDF page.

Your task:
1. Identify all headers, sub-headers, and any dates.
2. Extract the content that appears under each.
3. Return a structured JSON in this format:

[
  {{
    "header": "Header text",
    "sub_headers": [
      {{
        "sub_header": "Sub-header text",
        "date": "Date (if any)",
        "content": "Associated content"
      }}
    ]
  }}
]

OCR Text:
\"\"\"
{ocr_text}
\"\"\"
"""
    model = GenerativeModel(model_name)
    # Low temperature keeps the structured output as repeatable as possible.
    response = model.generate_content(
        prompt,
        generation_config={"temperature": 0.2, "max_output_tokens": 2048},
    )
    return response.text

# Step 7: Full pipeline that processes PDF from GCS bucket
def process_pdf_gcs(bucket_name, blob_name):
    """Run the OCR + Gemini text pipeline over every page of a PDF in GCS.

    Args:
        bucket_name: Name of the GCS bucket that holds the PDF.
        blob_name: Object path of the PDF inside the bucket.

    Returns:
        A list with one dict per page, holding the 1-based ``page`` number,
        the flattened ``ocr_text`` and the model's ``structured_output``.
    """
    images = pdf_to_images_from_gcs(bucket_name, blob_name)

    results = []
    for i, image in enumerate(images):
        print(f" Processing page {i + 1}/{len(images)}...")

        # OCR the page first, then pass the flattened text to the model.
        ocr_text = flatten_ocr_text(extract_ocr_from_image(image))
        page_record = {
            "page": i + 1,
            "ocr_text": ocr_text,
            "structured_output": extract_entities_with_gemini_text_model(ocr_text),
        }
        results.append(page_record)

    return results

# Step 8: Run the pipeline with your GCS bucket and file path
# Replace these placeholders with your GCS bucket name and the PDF's object path.
bucket_name = "your-gcs-bucket-name"
blob_name = "path/to/your_file.pdf"

results = process_pdf_gcs(bucket_name, blob_name)

# Step 9: Display the output neatly
for r in results:
    print(f"\n--- Page {r['page']} ---")
    try:
        # The model reply is free text, so it may not be valid JSON.
        structured = json.loads(r['structured_output'])
    except (json.JSONDecodeError, TypeError):
        # Best effort: show whatever the model returned when it cannot be parsed.
        print("Could not parse JSON. Raw output:")
        print(r['structured_output'])
    else:
        print(json.dumps(structured, indent=2))


# Method 2: Using the Multimodal LLM

In [None]:
# Alternative approach:
# Use Gemini 1.5 Pro multimodal (page image + OCR text).
# This lets the Gemini LLM process:

# The original image/PDF page

# The Vision OCR result (the code below passes the flattened OCR text; bounding boxes are not sent)

# Instructions to identify headers, sub-headers, dates, and associated content

In [None]:
# Update the following things in the code below:
# Replace "your-gcp-project-id" and "us-central1" with your GCP project and region

# Replace "your-gcs-bucket-name" and "path/to/your_file.pdf" with your GCS bucket and PDF path

In [None]:
# Step 0: Install dependencies (run once in your environment)
!pip install --upgrade google-cloud-storage google-cloud-vision google-cloud-aiplatform pdf2image
!sudo apt-get update && sudo apt-get install -y poppler-utils

from google.cloud import storage, vision
from google.cloud import aiplatform
from vertexai.preview.generative_models import GenerativeModel, Part
from pdf2image import convert_from_bytes
from PIL import Image
import io
import json

# ----------- GCP PROJECT CONFIG -----------
# Placeholder values: both must be replaced before running this cell.
GCP_PROJECT = "your-gcp-project-id"   # Replace with your project id
GCP_LOCATION = "us-central1"           # Replace with your region

# Initialize the Vertex AI SDK with the project and location set above
aiplatform.init(project=GCP_PROJECT, location=GCP_LOCATION)

# Module-level clients, created once here and used by the helper functions
# below (storage_client in pdf_gcs_to_images, vision_client in
# extract_ocr_from_image).
storage_client = storage.Client()
vision_client = vision.ImageAnnotatorClient()

# ----------- Helper functions -----------

def pdf_gcs_to_images(bucket_name: str, blob_name: str, dpi: int = 300):
    """
    Fetch a PDF object from GCS as bytes and render it to page images.

    Uses the module-level ``storage_client``. ``dpi`` is forwarded to
    ``convert_from_bytes``, whose result is returned unchanged.
    """
    # Resolve bucket -> blob on the shared client and download in one chain.
    pdf_bytes = (
        storage_client
        .bucket(bucket_name)
        .blob(blob_name)
        .download_as_bytes()
    )
    print(f"Downloaded PDF from gs://{bucket_name}/{blob_name}")

    images = convert_from_bytes(pdf_bytes, dpi=dpi)
    print(f"Converted PDF to {len(images)} images")

    return images


def extract_ocr_from_image(image: Image.Image):
    """
    PNG-encode a PIL image, run Vision document text detection on it via the
    module-level ``vision_client``, and return ``(full_text_annotation,
    png_bytes)``.

    Raises ``Exception`` when the response carries a non-empty error message.
    """
    # Encode the page once; the same bytes are sent to Vision and returned.
    png_buffer = io.BytesIO()
    try:
        image.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    finally:
        png_buffer.close()

    response = vision_client.document_text_detection(
        image=vision.Image(content=png_bytes)
    )

    # Guard clause: the error is reported on the response object, so check it.
    if not response.error.message:
        return response.full_text_annotation, png_bytes
    raise Exception(f"Vision API Error: {response.error.message}")


def flatten_ocr_text(annotation):
    """
    Turn a Vision OCR annotation into a plain text string.

    Each paragraph (pages -> blocks -> paragraphs) yields one line: its words
    are rebuilt from their symbols, joined by single spaces and stripped.
    Lines are joined with newlines.
    """
    text_lines = []
    for page in annotation.pages:
        for block in page.blocks:
            for paragraph in block.paragraphs:
                # Rebuild every word from its symbols before joining them.
                words = [
                    "".join(sym.text for sym in word.symbols)
                    for word in paragraph.words
                ]
                text_lines.append(" ".join(words).strip())
    return "\n".join(text_lines)


def extract_entities_with_gemini(image_bytes, ocr_text, model_name="gemini-1.5-pro-preview-0409"):
    """
    Send a page image and its OCR text to a Gemini multimodal model and return
    the model's text reply.

    Args:
        image_bytes: PNG-encoded bytes of the page (sent as ``image/png``).
        ocr_text: Plain OCR text for the same page.
        model_name: Vertex AI Gemini model ID. Defaults to the preview ID this
            notebook was written for; pass a different ID if that one is not
            available in your project/region.

    Returns:
        The raw text of the model response. The prompt asks for JSON, but the
        reply is returned unparsed, so callers must handle non-JSON text.
    """
    prompt = """
You are given a scanned page of a PDF and its OCR-extracted text.

Your task:
1. Identify all headers, sub-headers, and any dates.
2. Extract the text content that appears under each section.
3. Use layout structure (like spacing, font size, boldness) to group content.
4. Return structured JSON like:

[
  {
    "header": "Header text",
    "sub_headers": [
      {
        "sub_header": "Sub-header text",
        "date": "Date (if any)",
        "content": "Associated content text"
      }
    ]
  }
]
"""

    model = GenerativeModel(model_name)

    # Request parts, in order: instructions, the page image, then the OCR text.
    response = model.generate_content(
        [
            prompt,
            Part.from_data(data=image_bytes, mime_type="image/png"),
            Part.from_text(ocr_text)
        ],
        generation_config={"temperature": 0.2, "max_output_tokens": 2048}
    )
    return response.text


def process_pdf_from_gcs(bucket_name, blob_name):
    """
    End-to-end pipeline for one PDF in GCS: render its pages, OCR each page,
    send the page image plus OCR text to Gemini, and collect the replies.

    Returns a list with one dict per page containing the 1-based ``page``
    number, the flattened ``ocr_text`` and the model's ``structured_output``.
    """
    images = pdf_gcs_to_images(bucket_name, blob_name)

    results = []
    for i, image in enumerate(images):
        print(f"Processing page {i + 1} of {len(images)}")

        # OCR returns both the annotation and the PNG bytes sent to Vision;
        # the same bytes are passed on to the multimodal model.
        annotation, png_bytes = extract_ocr_from_image(image)
        ocr_text = flatten_ocr_text(annotation)
        page_record = {
            "page": i + 1,
            "ocr_text": ocr_text,
            "structured_output": extract_entities_with_gemini(png_bytes, ocr_text),
        }
        results.append(page_record)

    return results


# ----------- Run Example -----------

if __name__ == "__main__":
    # Placeholders: point these at your own bucket and the PDF's object path.
    bucket_name = "your-gcs-bucket-name"
    blob_name = "path/to/your_file.pdf"

    all_results = process_pdf_from_gcs(bucket_name, blob_name)

    for result in all_results:
        print(f"\n--- 📄 Page {result['page']} ---")
        raw_output = result['structured_output']
        try:
            # The model reply is free text, so it may not be valid JSON.
            parsed_json = json.loads(raw_output)
        except json.JSONDecodeError:
            print("Could not parse JSON output, raw text:")
            print(raw_output)
        else:
            print(json.dumps(parsed_json, indent=2))
