In [None]:
# Install system dependencies

# Install Tesseract and Poppler
!sudo apt update
!sudo apt install -y tesseract-ocr poppler-utils


In [None]:
# Optional: configure the Tesseract path if needed
# import pytesseract
# pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'


In [None]:
!pip install pytesseract pdf2image pillow

In [None]:
!pip install google-cloud-aiplatform

In [None]:
import pytesseract
from pdf2image import convert_from_path
import re
from collections import defaultdict
import json
import os
# Core Vertex AI SDK
from google.cloud import aiplatform
# For working with Gemini (Text Generation) models
from vertexai.language_models import TextGenerationModel

# Initialize Vertex AI client
# Both values are placeholders: set `project` to your actual GCP project ID
# and `location` to a supported region such as "us-central1".
aiplatform.init(project="your-gcp-project-id", location="us-central1")


def extract_text_from_pdf(pdf_path):
    """OCR every page of a PDF and return the recognised text grouped by line.

    Pages are rasterised with ``convert_from_path`` and passed to
    ``pytesseract.image_to_data``. Words are grouped by Tesseract's
    ``(block_num, par_num, line_num)`` triple, one group per page.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A list with one dict per OCR line, in page order then first-seen
        order. Keys: ``text`` (words joined by single spaces), ``words``
        (per-word dicts with ``word``, ``bbox``, ``conf``, ``index`` and
        ``superscript``), ``bbox`` (a list of per-word
        ``(left, top, right, bottom)`` tuples, not one merged box),
        ``page`` (1-based), ``block_num``, ``par_num`` and ``line_num``.
    """
    all_lines = []

    for page_num, page in enumerate(convert_from_path(pdf_path), start=1):
        ocr_data = pytesseract.image_to_data(page, output_type=pytesseract.Output.DICT)
        # Keyed by (block_num, par_num, line_num); dict order is first-seen order.
        line_map = {}

        for i, raw_text in enumerate(ocr_data['text']):
            word = raw_text.strip()
            # The confidence may arrive as an int, a float, or a string such as
            # "96.53" depending on the Tesseract/pytesseract versions, so go
            # through float() before truncating; a bare int() rejects "96.53".
            conf = int(float(ocr_data['conf'][i]))
            if not word or conf < 0:
                # Skip empty cells and negative-confidence rows.
                continue

            block_num = ocr_data['block_num'][i]
            par_num = ocr_data['par_num'][i]
            line_num = ocr_data['line_num'][i]
            line_id = (block_num, par_num, line_num)

            left = ocr_data['left'][i]
            top = ocr_data['top'][i]
            bbox = (left, top, left + ocr_data['width'][i], top + ocr_data['height'][i])

            line = line_map.get(line_id)
            if line is None:
                line = line_map[line_id] = {
                    "text": "",
                    "words": [],
                    "bbox": [],
                    "page": page_num,
                    "block_num": block_num,
                    "par_num": par_num,
                    "line_num": line_num
                }

            line["text"] += word + ' '
            line["bbox"].append(bbox)
            line["words"].append({
                "word": word,
                "bbox": bbox,
                "conf": conf,
                "index": i,  # row index in the pytesseract output for this page
                "superscript": None  # filled in below once the whole line is known
            })

        for line in line_map.values():
            # A word is flagged as superscript when its top edge sits above the
            # line's mean top edge by more than 30% of the word's own height.
            avg_top = sum(box[1] for box in line["bbox"]) / len(line["bbox"])
            for word_data in line["words"]:
                word_top = word_data["bbox"][1]
                word_height = word_data["bbox"][3] - word_top
                word_data["superscript"] = word_top < (avg_top - word_height * 0.3)

            line["text"] = line["text"].strip()
            all_lines.append(line)

    return all_lines

def extract_dates(line_text):
    """Return every date-like substring of ``line_text``, left to right.

    Recognised shapes: ``D/M/Y``-style numeric dates separated by ``-`` or
    ``/``, ``YYYY-MM-DD`` / ``YYYY/MM/DD``, and an English month name or
    abbreviation followed by a day and a four-digit year.
    """
    date_pattern = r'\b(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{4}[-/]\d{2}[-/]\d{2}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4})\b'
    # The pattern has no capturing groups, so each match's full text is the date.
    return [found.group(0) for found in re.finditer(date_pattern, line_text)]

def is_probable_footnote(line_text):
    """Heuristically decide whether a line looks like a footnote or citation.

    A line is treated as a footnote when any of these hold:
      * it starts with a one- or two-digit marker, optionally parenthesised,
        followed by a dot or whitespace;
      * it is shorter than 30 characters (after stripping) and contains a
        run of four digits;
      * it contains one of the keywords ``note:``, ``ref.``, ``copyright``,
        ``source:`` or ``doi`` (case-insensitive substring match).

    Returns:
        ``True`` or ``False`` (always a real bool, never a match object).
    """
    starts_with_marker = re.match(r'^\(?\d{1,2}\)?[\.\s]', line_text) is not None
    short_with_year = (
        len(line_text.strip()) < 30
        and re.search(r'\d{4}', line_text) is not None
    )
    lowered = line_text.lower()
    has_keyword = any(kw in lowered for kw in ['note:', 'ref.', 'copyright', 'source:', 'doi'])
    return starts_with_marker or short_with_year or has_keyword

def is_header_like(line_text):
    """Heuristically decide whether a line looks like a section header.

    Blank lines and lines longer than 150 characters are never headers.
    Otherwise a line qualifies when it starts with section numbering
    (``1``, ``1.2``, ``3)`` ... followed by whitespace), starts with Unicode
    superscript digits, is entirely upper case, or is title-cased.

    Returns:
        ``True`` or ``False`` (always a real bool, never a match object).
    """
    line_text = line_text.strip()
    if not line_text or len(line_text) > 150:
        return False
    has_numbering = re.match(r'^(\d+(\.\d+)*[\)\.]?)\s+', line_text) is not None
    has_superscript = re.match(r'^[¹²³⁴⁵⁶⁷⁸⁹⁰]+\s*', line_text) is not None
    return has_numbering or has_superscript or line_text.isupper() or line_text.istitle()

def group_entities_and_context(lines):
    """Group OCR lines into Date/Header entities, each followed by its context.

    Lines are scanned in order. A line containing a date starts a ``"Date"``
    block (its value is the first date found); otherwise a header-like line
    starts a ``"Header"`` block. Every other line is appended to the
    ``context`` of the most recent block; lines seen before the first block
    are dropped. Blank lines are ignored, and footnote-like lines are ignored
    unless they contain a date.

    Args:
        lines: Line dicts with ``text``, ``words``, ``bbox``, ``page``,
            ``block_num``, ``par_num`` and ``line_num`` keys.

    Returns:
        A list of block dicts, or an explanatory string when no date or
        header was found.
    """
    def _shared_fields(line_data):
        # Location and word-level details carried by both blocks and context entries.
        return {
            "bbox": line_data["bbox"],
            "page": line_data["page"],
            "line_meta": {
                "block_num": line_data["block_num"],
                "par_num": line_data["par_num"],
                "line_num": line_data["line_num"]
            },
            "words": line_data["words"]
        }

    blocks = []
    current_block = None

    for line_data in lines:
        stripped = line_data["text"].strip()
        if not stripped:
            continue

        # Look for dates before applying the footnote filter: that filter
        # rejects short lines containing four digits, which would otherwise
        # discard a date standing alone on its line (e.g. "January 10, 2023").
        dates = extract_dates(stripped)
        if not dates and is_probable_footnote(stripped):
            continue

        if dates:
            entity = ("Date", dates[0])
        elif is_header_like(stripped):
            entity = ("Header", stripped)
        else:
            entity = None

        if entity is not None:
            current_block = {
                "type": entity[0],
                "value": entity[1],
                "raw": stripped,
                "context": [],
                **_shared_fields(line_data)
            }
            blocks.append(current_block)
        elif current_block:
            current_block["context"].append({
                "text": stripped,
                **_shared_fields(line_data)
            })

    if not blocks:
        return "No valid dates, headers, or subheaders found in input."

    return blocks

def analyze_icr_pdf(pdf_path):
    """Run the pipeline on a local PDF: OCR it, then group lines into entities.

    Returns whatever ``group_entities_and_context`` returns for the OCR lines
    of ``pdf_path``.
    """
    return group_entities_and_context(extract_text_from_pdf(pdf_path))

if __name__ == "__main__":
    pdf_path = "your_icr_file.pdf"  # Replace with actual PDF path
    # The reporting code below reads `ocr_result`, so bind the result to that
    # name; binding it to a different name raises NameError on a fresh kernel.
    ocr_result = analyze_icr_pdf(pdf_path)

    if isinstance(ocr_result, str):
        # A string result is the "nothing found" message rather than a block list.
        print(ocr_result)
    else:
        print("\nSample Output: Showing 5 Identified Entities with Bounding Boxes, Indices, Superscripts, and Metadata:")
        for block in ocr_result[:5]:  # Only show first 5
            print(f"\n[{block['type']}] {block['value']} (Page {block['page']})")
            print(f"Bounding Box: {block['bbox']}")
            print(f"Line Meta: {block['line_meta']}")
            print("Words:")
            for w in block["words"]:
                print(f"  - Word: '{w['word']}', Index: {w['index']}, Superscript: {w['superscript']}, BBox: {w['bbox']}, Conf: {w['conf']}")

            if block["context"]:
                print("Context:")
                for ctx in block["context"]:
                    print(f"  - (Page {ctx['page']}) {ctx['text']}")
                    print(f"    BBox: {ctx['bbox']}")
                    print(f"    Meta: {ctx['line_meta']}")
                    print("    Words:")
                    for w in ctx["words"]:
                        print(f"      * Word: '{w['word']}', Index: {w['index']}, Superscript: {w['superscript']}, BBox: {w['bbox']}, Conf: {w['conf']}")
            else:
                print("No meaningful context found.")
            print("=" * 60)


In [None]:
# [
#     {
#         "type": "Header",
#         "value": "RESUME",
#         "raw": "RESUME",
#         "bbox": [(50, 100, 150, 130)],  # bounding box for "RESUME"
#         "page": 1,
#         "line_meta": {
#             "block_num": 1,
#             "par_num": 1,
#             "line_num": 1
#         },
#         "words": [
#             {
#                 "word": "RESUME",
#                 "bbox": (50, 100, 150, 130),
#                 "conf": 96,      # OCR confidence
#                 "index": 5,      # word index from pytesseract output
#                 "superscript": False
#             }
#         ],
#         "context": [
#             {
#                 "text": "John Doe is a software engineer with 5 years of experience.",
#                 "bbox": [(50, 140, 500, 160)],  # bounding box of this line
#                 "page": 1,
#                 "line_meta": {
#                     "block_num": 1,
#                     "par_num": 2,
#                     "line_num": 2
#                 },
#                 "words": [
#                     {"word": "John", "bbox": (50, 140, 90, 160), "conf": 95, "index": 6, "superscript": False},
#                     {"word": "Doe", "bbox": (95, 140, 130, 160), "conf": 94, "index": 7, "superscript": False},
#                     {"word": "is", "bbox": (135, 140, 160, 160), "conf": 96, "index": 8, "superscript": False},
#                     {"word": "a", "bbox": (165, 140, 175, 160), "conf": 93, "index": 9, "superscript": False},
#                     {"word": "software", "bbox": (180, 140, 250, 160), "conf": 97, "index": 10, "superscript": False},
#                     {"word": "engineer", "bbox": (255, 140, 320, 160), "conf": 96, "index": 11, "superscript": False},
#                     {"word": "with", "bbox": (325, 140, 360, 160), "conf": 95, "index": 12, "superscript": False},
#                     {"word": "5", "bbox": (365, 140, 375, 160), "conf": 90, "index": 13, "superscript": False},
#                     {"word": "years", "bbox": (380, 140, 420, 160), "conf": 96, "index": 14, "superscript": False},
#                     {"word": "of", "bbox": (425, 140, 450, 160), "conf": 94, "index": 15, "superscript": False},
#                     {"word": "experience.", "bbox": (455, 140, 520, 160), "conf": 97, "index": 16, "superscript": False}
#                 ]
#             },
#             {
#                 "text": "He specializes in Python and Machine Learning.",
#                 "bbox": [(50, 170, 450, 190)],
#                 "page": 1,
#                 "line_meta": {
#                     "block_num": 1,
#                     "par_num": 3,
#                     "line_num": 3
#                 },
#                 "words": [
#                     {"word": "He", "bbox": (50, 170, 70, 190), "conf": 95, "index": 17, "superscript": False},
#                     {"word": "specializes", "bbox": (75, 170, 150, 190), "conf": 94, "index": 18, "superscript": False},
#                     {"word": "in", "bbox": (155, 170, 180, 190), "conf": 95, "index": 19, "superscript": False},
#                     {"word": "Python", "bbox": (185, 170, 230, 190), "conf": 96, "index": 20, "superscript": False},
#                     {"word": "and", "bbox": (235, 170, 260, 190), "conf": 94, "index": 21, "superscript": False},
#                     {"word": "Machine", "bbox": (265, 170, 315, 190), "conf": 96, "index": 22, "superscript": False},
#                     {"word": "Learning.", "bbox": (320, 170, 390, 190), "conf": 95, "index": 23, "superscript": False}
#                 ]
#             }
#         ]
#     }
# ]


In [None]:
# Using data from a GCS bucket

In [None]:
import pytesseract
from pdf2image import convert_from_path
import re
from collections import defaultdict
import json
import os

# Vertex AI imports
from google.cloud import aiplatform
from vertexai.language_models import TextGenerationModel

# For reading files from GCS
import gcsfs

# Initialize Vertex AI
# `project` is a placeholder: replace it with your GCP project ID.
aiplatform.init(project="your-gcp-project-id", location="us-central1")

def download_pdf_from_gcs(gcs_path, local_path="temp_icr_file.pdf"):
    """Copy a file from Google Cloud Storage to the local filesystem.

    Args:
        gcs_path: Location of the object, opened through ``gcsfs``.
        local_path: Destination path; overwritten if it already exists.

    Returns:
        ``local_path``, unchanged.
    """
    remote_fs = gcsfs.GCSFileSystem()

    # Read the whole object first; the local file is only created once the
    # remote read has succeeded.
    with remote_fs.open(gcs_path, 'rb') as remote_file:
        payload = remote_file.read()

    with open(local_path, 'wb') as local_file:
        local_file.write(payload)

    return local_path

def extract_text_from_pdf(pdf_path):
    """OCR every page of a PDF and return the recognised text grouped by line.

    Pages are rasterised with ``convert_from_path`` and passed to
    ``pytesseract.image_to_data``. Words are grouped by Tesseract's
    ``(block_num, par_num, line_num)`` triple, one group per page.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A list with one dict per OCR line, in page order then first-seen
        order. Keys: ``text`` (words joined by single spaces), ``words``
        (per-word dicts with ``word``, ``bbox``, ``conf``, ``index`` and
        ``superscript``), ``bbox`` (a list of per-word
        ``(left, top, right, bottom)`` tuples, not one merged box),
        ``page`` (1-based), ``block_num``, ``par_num`` and ``line_num``.
    """
    all_lines = []

    for page_num, page in enumerate(convert_from_path(pdf_path), start=1):
        ocr_data = pytesseract.image_to_data(page, output_type=pytesseract.Output.DICT)
        # Keyed by (block_num, par_num, line_num); dict order is first-seen order.
        line_map = {}

        for i, raw_text in enumerate(ocr_data['text']):
            word = raw_text.strip()
            # The confidence may arrive as an int, a float, or a string such as
            # "96.53" depending on the Tesseract/pytesseract versions, so go
            # through float() before truncating; a bare int() rejects "96.53".
            conf = int(float(ocr_data['conf'][i]))
            if not word or conf < 0:
                # Skip empty cells and negative-confidence rows.
                continue

            block_num = ocr_data['block_num'][i]
            par_num = ocr_data['par_num'][i]
            line_num = ocr_data['line_num'][i]
            line_id = (block_num, par_num, line_num)

            left = ocr_data['left'][i]
            top = ocr_data['top'][i]
            bbox = (left, top, left + ocr_data['width'][i], top + ocr_data['height'][i])

            line = line_map.get(line_id)
            if line is None:
                line = line_map[line_id] = {
                    "text": "",
                    "words": [],
                    "bbox": [],
                    "page": page_num,
                    "block_num": block_num,
                    "par_num": par_num,
                    "line_num": line_num
                }

            line["text"] += word + ' '
            line["bbox"].append(bbox)
            line["words"].append({
                "word": word,
                "bbox": bbox,
                "conf": conf,
                "index": i,  # row index in the pytesseract output for this page
                "superscript": None  # filled in below once the whole line is known
            })

        for line in line_map.values():
            # A word is flagged as superscript when its top edge sits above the
            # line's mean top edge by more than 30% of the word's own height.
            avg_top = sum(box[1] for box in line["bbox"]) / len(line["bbox"])
            for word_data in line["words"]:
                word_top = word_data["bbox"][1]
                word_height = word_data["bbox"][3] - word_top
                word_data["superscript"] = word_top < (avg_top - word_height * 0.3)

            line["text"] = line["text"].strip()
            all_lines.append(line)

    return all_lines

def extract_dates(line_text):
    """Return every date-like substring of ``line_text``, left to right.

    Recognised shapes: ``D/M/Y``-style numeric dates separated by ``-`` or
    ``/``, ``YYYY-MM-DD`` / ``YYYY/MM/DD``, and an English month name or
    abbreviation followed by a day and a four-digit year.
    """
    date_pattern = r'\b(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{4}[-/]\d{2}[-/]\d{2}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4})\b'
    # The pattern has no capturing groups, so each match's full text is the date.
    return [found.group(0) for found in re.finditer(date_pattern, line_text)]

def is_probable_footnote(line_text):
    """Heuristically decide whether a line looks like a footnote or citation.

    A line is treated as a footnote when any of these hold:
      * it starts with a one- or two-digit marker, optionally parenthesised,
        followed by a dot or whitespace;
      * it is shorter than 30 characters (after stripping) and contains a
        run of four digits;
      * it contains one of the keywords ``note:``, ``ref.``, ``copyright``,
        ``source:`` or ``doi`` (case-insensitive substring match).

    Returns:
        ``True`` or ``False`` (always a real bool, never a match object).
    """
    starts_with_marker = re.match(r'^\(?\d{1,2}\)?[\.\s]', line_text) is not None
    short_with_year = (
        len(line_text.strip()) < 30
        and re.search(r'\d{4}', line_text) is not None
    )
    lowered = line_text.lower()
    has_keyword = any(kw in lowered for kw in ['note:', 'ref.', 'copyright', 'source:', 'doi'])
    return starts_with_marker or short_with_year or has_keyword

def is_header_like(line_text):
    """Heuristically decide whether a line looks like a section header.

    Blank lines and lines longer than 150 characters are never headers.
    Otherwise a line qualifies when it starts with section numbering
    (``1``, ``1.2``, ``3)`` ... followed by whitespace), starts with Unicode
    superscript digits, is entirely upper case, or is title-cased.

    Returns:
        ``True`` or ``False`` (always a real bool, never a match object).
    """
    line_text = line_text.strip()
    if not line_text or len(line_text) > 150:
        return False
    has_numbering = re.match(r'^(\d+(\.\d+)*[\)\.]?)\s+', line_text) is not None
    has_superscript = re.match(r'^[¹²³⁴⁵⁶⁷⁸⁹⁰]+\s*', line_text) is not None
    return has_numbering or has_superscript or line_text.isupper() or line_text.istitle()

def group_entities_and_context(lines):
    """Group OCR lines into Date/Header entities, each followed by its context.

    Lines are scanned in order. A line containing a date starts a ``"Date"``
    block (its value is the first date found); otherwise a header-like line
    starts a ``"Header"`` block. Every other line is appended to the
    ``context`` of the most recent block; lines seen before the first block
    are dropped. Blank lines are ignored, and footnote-like lines are ignored
    unless they contain a date.

    Args:
        lines: Line dicts with ``text``, ``words``, ``bbox``, ``page``,
            ``block_num``, ``par_num`` and ``line_num`` keys.

    Returns:
        A list of block dicts, or an explanatory string when no date or
        header was found.
    """
    def _shared_fields(line_data):
        # Location and word-level details carried by both blocks and context entries.
        return {
            "bbox": line_data["bbox"],
            "page": line_data["page"],
            "line_meta": {
                "block_num": line_data["block_num"],
                "par_num": line_data["par_num"],
                "line_num": line_data["line_num"]
            },
            "words": line_data["words"]
        }

    blocks = []
    current_block = None

    for line_data in lines:
        stripped = line_data["text"].strip()
        if not stripped:
            continue

        # Look for dates before applying the footnote filter: that filter
        # rejects short lines containing four digits, which would otherwise
        # discard a date standing alone on its line (e.g. "January 10, 2023").
        dates = extract_dates(stripped)
        if not dates and is_probable_footnote(stripped):
            continue

        if dates:
            entity = ("Date", dates[0])
        elif is_header_like(stripped):
            entity = ("Header", stripped)
        else:
            entity = None

        if entity is not None:
            current_block = {
                "type": entity[0],
                "value": entity[1],
                "raw": stripped,
                "context": [],
                **_shared_fields(line_data)
            }
            blocks.append(current_block)
        elif current_block:
            current_block["context"].append({
                "text": stripped,
                **_shared_fields(line_data)
            })

    return blocks if blocks else "No valid dates, headers, or subheaders found."

def analyze_icr_pdf_from_gcs(gcs_pdf_path):
    """Download a PDF from GCS, OCR it, and group its lines into entities.

    Args:
        gcs_pdf_path: Location of the PDF in Google Cloud Storage.

    Returns:
        Whatever ``group_entities_and_context`` returns for the OCR lines.
    """
    local_path = download_pdf_from_gcs(gcs_pdf_path)
    try:
        ocr_lines = extract_text_from_pdf(local_path)
        return group_entities_and_context(ocr_lines)
    finally:
        # Remove the temporary copy even when OCR or grouping raises, so a
        # failed run does not leave the downloaded file behind.
        os.remove(local_path)

# Usage:
if __name__ == "__main__":
    gcs_path = "gs://your-bucket-name/path/to/your.pdf"  # Replace with your GCS PDF path
    ocr_result = analyze_icr_pdf_from_gcs(gcs_path)

    # A list means entities were found; a string is the "nothing found" message.
    if not isinstance(ocr_result, str):
        print("\nSample Output: Showing 5 Identified Entities with Bounding Boxes, Indices, Superscripts, and Metadata:")
        for block in ocr_result[:5]:  # Only show first 5
            # Entity line itself: type, value, location and per-word details.
            print(f"\n[{block['type']}] {block['value']} (Page {block['page']})")
            print(f"Bounding Box: {block['bbox']}")
            print(f"Line Meta: {block['line_meta']}")
            print("Words:")
            for w in block["words"]:
                print(f"  - Word: '{w['word']}', Index: {w['index']}, Superscript: {w['superscript']}, BBox: {w['bbox']}, Conf: {w['conf']}")

            # Lines attached to this entity, if any.
            if not block["context"]:
                print("No meaningful context found.")
            else:
                print("Context:")
                for ctx in block["context"]:
                    print(f"  - (Page {ctx['page']}) {ctx['text']}")
                    print(f"    BBox: {ctx['bbox']}")
                    print(f"    Meta: {ctx['line_meta']}")
                    print("    Words:")
                    for w in ctx["words"]:
                        print(f"      * Word: '{w['word']}', Index: {w['index']}, Superscript: {w['superscript']}, BBox: {w['bbox']}, Conf: {w['conf']}")
            print("=" * 60)
    else:
        print(ocr_result)


In [None]:
import pytesseract
from pdf2image import convert_from_path
import re
from collections import defaultdict
import json
import os

# Vertex AI imports
from google.cloud import aiplatform
from vertexai.language_models import TextGenerationModel

# For reading files from GCS
import gcsfs

# Initialize Vertex AI
# `project` is a placeholder: replace it with your GCP project ID.
aiplatform.init(project="your-gcp-project-id", location="us-central1")

def download_pdf_from_gcs(gcs_path, local_path="temp_icr_file.pdf"):
    """Copy a file from Google Cloud Storage to the local filesystem.

    Args:
        gcs_path: Location of the object, opened through ``gcsfs``.
        local_path: Destination path; overwritten if it already exists.

    Returns:
        ``local_path``, unchanged.
    """
    remote_fs = gcsfs.GCSFileSystem()

    # Read the whole object first; the local file is only created once the
    # remote read has succeeded.
    with remote_fs.open(gcs_path, 'rb') as remote_file:
        payload = remote_file.read()

    with open(local_path, 'wb') as local_file:
        local_file.write(payload)

    return local_path

def extract_text_from_pdf(pdf_path):
    """OCR every page of a PDF and return the recognised text grouped by line.

    Pages are rasterised with ``convert_from_path`` and passed to
    ``pytesseract.image_to_data``. Words are grouped by Tesseract's
    ``(block_num, par_num, line_num)`` triple, one group per page.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A list with one dict per OCR line, in page order then first-seen
        order. Keys: ``text`` (words joined by single spaces), ``words``
        (per-word dicts with ``word``, ``bbox``, ``conf``, ``index`` and
        ``superscript``), ``bbox`` (a list of per-word
        ``(left, top, right, bottom)`` tuples, not one merged box),
        ``page`` (1-based), ``block_num``, ``par_num`` and ``line_num``.
    """
    all_lines = []

    for page_num, page in enumerate(convert_from_path(pdf_path), start=1):
        ocr_data = pytesseract.image_to_data(page, output_type=pytesseract.Output.DICT)
        # Keyed by (block_num, par_num, line_num); dict order is first-seen order.
        line_map = {}

        for i, raw_text in enumerate(ocr_data['text']):
            word = raw_text.strip()
            # The confidence may arrive as an int, a float, or a string such as
            # "96.53" depending on the Tesseract/pytesseract versions, so go
            # through float() before truncating; a bare int() rejects "96.53".
            conf = int(float(ocr_data['conf'][i]))
            if not word or conf < 0:
                # Skip empty cells and negative-confidence rows.
                continue

            block_num = ocr_data['block_num'][i]
            par_num = ocr_data['par_num'][i]
            line_num = ocr_data['line_num'][i]
            line_id = (block_num, par_num, line_num)

            left = ocr_data['left'][i]
            top = ocr_data['top'][i]
            bbox = (left, top, left + ocr_data['width'][i], top + ocr_data['height'][i])

            line = line_map.get(line_id)
            if line is None:
                line = line_map[line_id] = {
                    "text": "",
                    "words": [],
                    "bbox": [],
                    "page": page_num,
                    "block_num": block_num,
                    "par_num": par_num,
                    "line_num": line_num
                }

            line["text"] += word + ' '
            line["bbox"].append(bbox)
            line["words"].append({
                "word": word,
                "bbox": bbox,
                "conf": conf,
                "index": i,  # row index in the pytesseract output for this page
                "superscript": None  # filled in below once the whole line is known
            })

        for line in line_map.values():
            # A word is flagged as superscript when its top edge sits above the
            # line's mean top edge by more than 30% of the word's own height.
            avg_top = sum(box[1] for box in line["bbox"]) / len(line["bbox"])
            for word_data in line["words"]:
                word_top = word_data["bbox"][1]
                word_height = word_data["bbox"][3] - word_top
                word_data["superscript"] = word_top < (avg_top - word_height * 0.3)

            line["text"] = line["text"].strip()
            all_lines.append(line)

    return all_lines

def extract_dates(line_text):
    """Return every date-like substring of ``line_text``, left to right.

    Recognised shapes: ``D/M/Y``-style numeric dates separated by ``-`` or
    ``/``, ``YYYY-MM-DD`` / ``YYYY/MM/DD``, and an English month name or
    abbreviation followed by a day and a four-digit year.
    """
    date_pattern = r'\b(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{4}[-/]\d{2}[-/]\d{2}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4})\b'
    # The pattern has no capturing groups, so each match's full text is the date.
    return [found.group(0) for found in re.finditer(date_pattern, line_text)]

def is_probable_footnote(line_text):
    """Heuristically decide whether a line looks like a footnote or citation.

    A line is treated as a footnote when any of these hold:
      * it starts with a one- or two-digit marker, optionally parenthesised,
        followed by a dot or whitespace;
      * it is shorter than 30 characters (after stripping) and contains a
        run of four digits;
      * it contains one of the keywords ``note:``, ``ref.``, ``copyright``,
        ``source:`` or ``doi`` (case-insensitive substring match).

    Returns:
        ``True`` or ``False`` (always a real bool, never a match object).
    """
    starts_with_marker = re.match(r'^\(?\d{1,2}\)?[\.\s]', line_text) is not None
    short_with_year = (
        len(line_text.strip()) < 30
        and re.search(r'\d{4}', line_text) is not None
    )
    lowered = line_text.lower()
    has_keyword = any(kw in lowered for kw in ['note:', 'ref.', 'copyright', 'source:', 'doi'])
    return starts_with_marker or short_with_year or has_keyword

def is_header_like(line_text):
    """Heuristically decide whether a line looks like a section header.

    Blank lines and lines longer than 150 characters are never headers.
    Otherwise a line qualifies when it starts with section numbering
    (``1``, ``1.2``, ``3)`` ... followed by whitespace), starts with Unicode
    superscript digits, is entirely upper case, or is title-cased.

    Returns:
        ``True`` or ``False`` (always a real bool, never a match object).
    """
    line_text = line_text.strip()
    if not line_text or len(line_text) > 150:
        return False
    has_numbering = re.match(r'^(\d+(\.\d+)*[\)\.]?)\s+', line_text) is not None
    has_superscript = re.match(r'^[¹²³⁴⁵⁶⁷⁸⁹⁰]+\s*', line_text) is not None
    return has_numbering or has_superscript or line_text.isupper() or line_text.istitle()

def group_entities_and_context(lines):
    """Group OCR lines into Date/Header entities, each followed by its context.

    Lines are scanned in order. A line containing a date starts a ``"Date"``
    block (its value is the first date found); otherwise a header-like line
    starts a ``"Header"`` block. Every other line is appended to the
    ``context`` of the most recent block; lines seen before the first block
    are dropped. Blank lines are ignored, and footnote-like lines are ignored
    unless they contain a date.

    Args:
        lines: Line dicts with ``text``, ``words``, ``bbox``, ``page``,
            ``block_num``, ``par_num`` and ``line_num`` keys.

    Returns:
        A list of block dicts, or an explanatory string when no date or
        header was found.
    """
    def _shared_fields(line_data):
        # Location and word-level details carried by both blocks and context entries.
        return {
            "bbox": line_data["bbox"],
            "page": line_data["page"],
            "line_meta": {
                "block_num": line_data["block_num"],
                "par_num": line_data["par_num"],
                "line_num": line_data["line_num"]
            },
            "words": line_data["words"]
        }

    blocks = []
    current_block = None

    for line_data in lines:
        stripped = line_data["text"].strip()
        if not stripped:
            continue

        # Look for dates before applying the footnote filter: that filter
        # rejects short lines containing four digits, which would otherwise
        # discard a date standing alone on its line (e.g. "January 10, 2023").
        dates = extract_dates(stripped)
        if not dates and is_probable_footnote(stripped):
            continue

        if dates:
            entity = ("Date", dates[0])
        elif is_header_like(stripped):
            entity = ("Header", stripped)
        else:
            entity = None

        if entity is not None:
            current_block = {
                "type": entity[0],
                "value": entity[1],
                "raw": stripped,
                "context": [],
                **_shared_fields(line_data)
            }
            blocks.append(current_block)
        elif current_block:
            current_block["context"].append({
                "text": stripped,
                **_shared_fields(line_data)
            })

    return blocks if blocks else "No valid dates, headers, or subheaders found."

def ocr_blocks_to_text(blocks):
    """Render entity blocks as plain text.

    Each block becomes a ``"<type>: <value>"`` line, followed by one
    ``"  - <text>"`` line per context entry (if any) and a 40-dash separator.
    The lines are joined with newlines; an empty input gives an empty string.
    """
    separator = "-" * 40
    rendered = []
    for block in blocks:
        rendered.append(f"{block['type']}: {block['value']}")
        # A missing or empty "context" contributes no bullet lines.
        rendered.extend(f"  - {ctx['text']}" for ctx in (block.get("context") or []))
        rendered.append(separator)
    return "\n".join(rendered)

def call_gemini_llm(ocr_text, model_name="gemini-1.0-pro", temperature=0.3, max_output_tokens=2048):
    """Ask a Gemini model on Vertex AI to reformat the OCR blocks.

    Args:
        ocr_text: Plain-text rendering of the OCR blocks, embedded in the prompt.
        model_name: Gemini model identifier to call.
        temperature: Sampling temperature passed to the model.
        max_output_tokens: Upper bound on the length of the generated reply.

    Returns:
        The text of the model's response.
    """
    # Gemini models are served through the generative-models API;
    # TextGenerationModel is the PaLM text API and does not load Gemini model
    # names. Imported locally so this function carries its own dependency.
    from vertexai.generative_models import GenerativeModel

    prompt = f"""
You are given OCR extracted blocks from a document.
Each block has a 'Header' or 'Date' followed by associated context lines.

Please organize the output in the following format:
Each Header or Date should be printed as a title, and the context lines under it with bullet points.

Example format:
Header: <Header Name>
- <Context Line 1>
- <Context Line 2>

Date: <Date>
- <Context Line 1>
- <Context Line 2>

If no context is available, just print the header/date without bullets.

Here is the OCR text:
\"\"\"
{ocr_text}
\"\"\"
"""
    model = GenerativeModel(model_name)
    response = model.generate_content(
        prompt,
        generation_config={
            "temperature": temperature,
            "max_output_tokens": max_output_tokens,
        },
    )
    return response.text

def analyze_icr_pdf_from_gcs(gcs_pdf_path):
    """Download a PDF from GCS, OCR it, and group its lines into entities.

    Args:
        gcs_pdf_path: Location of the PDF in Google Cloud Storage.

    Returns:
        Whatever ``group_entities_and_context`` returns for the OCR lines.
    """
    local_path = download_pdf_from_gcs(gcs_pdf_path)
    try:
        ocr_lines = extract_text_from_pdf(local_path)
        return group_entities_and_context(ocr_lines)
    finally:
        # Remove the temporary copy even when OCR or grouping raises, so a
        # failed run does not leave the downloaded file behind.
        os.remove(local_path)

# ======= Main Execution =======
if __name__ == "__main__":
    gcs_path = "gs://your-bucket-name/path/to/your.pdf"  # Update this path
    ocr_result = analyze_icr_pdf_from_gcs(gcs_path)

    # A list means entities were found; a string is the "nothing found" message.
    if not isinstance(ocr_result, str):
        print("\nSample Output: Showing 5 Identified Entities")
        for block in ocr_result[:5]:
            print(f"[{block['type']}] {block['value']} (Page {block['page']})")
            print(f"Context lines: {len(block.get('context', []))}")
            print("-" * 30)

        # Flatten all blocks to text and show what will be sent to the model.
        ocr_text = ocr_blocks_to_text(ocr_result)
        print("\n=== OCR Text for LLM ===")
        print(ocr_text)

        # Ask the model to reformat the text and show its reply.
        llm_response = call_gemini_llm(ocr_text)
        print("\n=== LLM Response ===")
        print(llm_response)
    else:
        print("OCR Error or no valid content:", ocr_result)


In [None]:
# Expected output

# Header: Executive Summary
# - The report highlights key trends in revenue growth.
# - Focus areas include marketing ROI and customer retention.

# Date: January 10, 2023
# - Annual performance review conducted.
# - Strategy planning sessions initiated.

# Header: Recommendations
# - Improve data integration workflows.
# - Reevaluate vendor contracts in Q2.
