In [1]:
#Cell 1 — Install dependencies (run once per Colab runtime)
# Python packages: pymupdf (imported as `fitz` by the helper cell), pillow and
# pytesseract are used by the helper functions below. python-docx and openpyxl
# are installed here but are not imported by the cells visible in this notebook.
!pip install --upgrade pip
!pip install pymupdf pillow pytesseract python-docx openpyxl

# Install the system Tesseract binary (needed for OCR of scanned pages/images).
# `-qq` keeps apt output quiet.
!apt-get update -qq
!apt-get install -y -qq tesseract-ocr

# Optional: LibreOffice (uncomment if you need .docx/.xlsx -> .pdf conversion inside Colab).
# convert_office_to_pdf() raises EnvironmentError when `soffice` is not on PATH.
# Note: LibreOffice is large; install only if needed.
# !apt-get install -y -qq libreoffice


Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.3
Collecting pymupdf
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m67.7 MB/s[0m  [33m0:00:00[0m
[?25hDownloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading 

In [2]:
#Cell 2 — Helper functions (core implementation)
# Colab-adapted implementation: searching + annotating PDF/images and optional office conversion
from pathlib import Path
import tempfile, shutil, subprocess
import fitz                   # PyMuPDF
from PIL import Image, ImageDraw
import pytesseract
from google.colab import files

def convert_office_to_pdf(input_path: Path, out_dir: Path) -> Path:
    """Convert an office document to PDF with headless LibreOffice.

    Args:
        input_path: Document to convert (the notebook passes .docx/.xlsx files).
        out_dir: Directory that receives the PDF; created if it does not exist.

    Returns:
        Path of the generated PDF, ``out_dir / (input_path.stem + ".pdf")``.

    Raises:
        EnvironmentError: ``soffice`` is not on PATH.
        subprocess.CalledProcessError: ``soffice`` exited with a non-zero status.
        FileNotFoundError: ``soffice`` exited cleanly but the expected PDF is
            missing; the message includes whatever soffice wrote to stderr.
    """
    if not shutil.which("soffice"):
        raise EnvironmentError("LibreOffice (soffice) not found. Install it or convert .docx/.xlsx to PDF externally.")

    # Make sure the target directory exists instead of relying on soffice to create it.
    out_dir.mkdir(parents=True, exist_ok=True)

    command = [
        "soffice",
        "--headless",
        "--convert-to",
        "pdf",
        "--outdir",
        str(out_dir),
        str(input_path),
    ]
    result = subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    out_pdf = out_dir / (input_path.stem + ".pdf")
    if not out_pdf.exists():
        # soffice can exit with status 0 without producing output; surface its
        # captured stderr so the failure is diagnosable.
        detail = (result.stderr or b"").decode(errors="replace").strip()
        message = f"Conversion failed; expected: {out_pdf}"
        if detail:
            message += f" (soffice stderr: {detail})"
        raise FileNotFoundError(message)
    return out_pdf

def annotate_image_pil(img_pil: Image.Image, search_text: str, case_sensitive=False, stroke_width=3, max_phrase_words=12):
    """OCR an image and draw a red rectangle around each occurrence of a phrase.

    The image is OCR'd with ``pytesseract.image_to_data``; consecutive
    recognised words are joined with single spaces and compared for equality
    with ``search_text``.

    Args:
        img_pil: Image to search. It is drawn on in place.
        search_text: Phrase to look for. Runs of whitespace are collapsed to a
            single space before comparing; an empty/blank phrase matches nothing.
        case_sensitive: Compare exactly when True, otherwise compare lower-cased.
        stroke_width: Outline width, in pixels, of the drawn rectangles.
        max_phrase_words: Maximum number of consecutive OCR words tried from
            each starting word.

    Returns:
        The same ``img_pil`` object, with the rectangles drawn on it.
    """
    # Candidates are built by joining tokens with one space, so normalise the
    # needle the same way.
    target = " ".join(search_text.split())
    if not target:
        return img_pil
    if not case_sensitive:
        target = target.lower()

    ocr_data = pytesseract.image_to_data(img_pil, output_type=pytesseract.Output.DICT)
    texts = ocr_data.get("text", [])

    # Keep only entries that carry text. image_to_data also returns rows with
    # empty text (page/block/paragraph/line entries) whose boxes cover whole
    # regions; letting them take part in a match would stretch the rectangle
    # over the entire line or block. Each blank row starts a new segment so a
    # phrase still cannot match across one, as before.
    words = []
    segment = 0
    for i, raw in enumerate(texts):
        token = (raw or "").strip()
        if not token:
            segment += 1
            continue
        left = ocr_data["left"][i]
        top = ocr_data["top"][i]
        words.append({
            "segment": segment,
            "text": token,
            "left": left,
            "top": top,
            "right": left + ocr_data["width"][i],
            "bottom": top + ocr_data["height"][i],
        })

    draw = ImageDraw.Draw(img_pil)
    for start in range(len(words)):
        start_segment = words[start]["segment"]
        pieces = []
        for end in range(start, min(start + max_phrase_words, len(words))):
            if words[end]["segment"] != start_segment:
                break
            pieces.append(words[end]["text"])
            candidate = " ".join(pieces)
            if not case_sensitive:
                candidate = candidate.lower()
            if candidate == target:
                # Union of the boxes of every word in the matched run.
                run = words[start:end + 1]
                bbox = (
                    min(w["left"] for w in run),
                    min(w["top"] for w in run),
                    max(w["right"] for w in run),
                    max(w["bottom"] for w in run),
                )
                draw.rectangle(bbox, outline="red", width=stroke_width)
                break
    return img_pil


def annotate_image_file(input_path: Path, output_path: Path, search_text: str, case_sensitive=False):
    """Annotate occurrences of ``search_text`` in an image file and save the result.

    Args:
        input_path: Image file to read.
        output_path: Where the annotated image is written; the format is chosen
            by PIL from this path's extension.
        search_text: Phrase to outline (found via OCR in ``annotate_image_pil``).
        case_sensitive: Forwarded to ``annotate_image_pil``.

    Returns:
        ``output_path``.
    """
    # Image.open keeps the file open until the image is closed; convert() yields
    # an independent in-memory copy, so the handle can be released right away.
    with Image.open(str(input_path)) as source:
        img = source.convert("RGB")
    annotated = annotate_image_pil(img, search_text, case_sensitive=case_sensitive)
    annotated.save(str(output_path))
    return output_path


def _find_text_rects(page, search_text: str, case_sensitive: bool) -> list:
    """Return the rectangles to outline for ``search_text`` on a page that has a text layer."""
    import warnings

    if not search_text.strip():
        # An empty needle would otherwise match every span in the fallback below.
        return []

    try:
        rects = list(page.search_for(search_text))
    except Exception as exc:
        # Best effort: a search failure on one page should not abort the whole
        # document, but it should not go unnoticed either.
        warnings.warn(f"Text search failed on page {page.number}: {exc}")
        rects = []

    if case_sensitive:
        # PyMuPDF's search_for matches regardless of case, so confirm each hit
        # against the text actually found inside its rectangle. A hit that is
        # split over several lines yields one rectangle per fragment, hence the
        # second (fragment-in-phrase) test.
        needle = " ".join(search_text.split())
        confirmed = []
        for rect in rects:
            snippet = " ".join(page.get_textbox(rect).split())
            if needle in snippet or (snippet and snippet in needle):
                confirmed.append(rect)
        return confirmed

    if not rects:
        # Fallback: outline every span whose text contains the phrase.
        lowered = search_text.lower()
        for block in page.get_text("dict").get("blocks", []):
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    bbox = span.get("bbox")
                    if bbox and lowered in span.get("text", "").lower():
                        rects.append(fitz.Rect(bbox))
    return rects


def _append_scanned_page(out_doc, page, search_text: str, case_sensitive: bool, zoom: float = 2.0) -> None:
    """Render ``page`` to an image, OCR-annotate it and append it to ``out_doc`` at the page's original size."""
    import io

    # Render above 1x so OCR has more pixels to work with.
    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    annotated = annotate_image_pil(img, search_text, case_sensitive=case_sensitive, stroke_width=3)

    buffer = io.BytesIO()
    annotated.save(buffer, format="PNG")
    # Place the zoomed rendering on a page with the source page's dimensions so
    # the output page does not grow by the zoom factor.
    new_page = out_doc.new_page(width=page.rect.width, height=page.rect.height)
    new_page.insert_image(new_page.rect, stream=buffer.getvalue())


def annotate_pdf_search_colab(input_pdf: Path, output_pdf: Path, search_text: str, case_sensitive=False, stroke_width=1.5):
    """Write a copy of a PDF with every occurrence of ``search_text`` outlined in red.

    Pages that have extractable text are copied as-is and receive rectangle
    annotations. Pages without extractable text are treated as scans: they are
    rendered to an image, searched via OCR (``annotate_image_pil``) and written
    back as image pages. Pages keep their original order in the output.

    Args:
        input_pdf: PDF to read.
        output_pdf: Path the annotated PDF is written to.
        search_text: Phrase to outline.
        case_sensitive: Require an exact-case match. When False, text pages
            that yield no direct hit fall back to outlining whole spans that
            contain the phrase.
        stroke_width: Border width of the rectangle annotations on text pages.

    Returns:
        ``output_pdf``.
    """
    doc = fitz.open(str(input_pdf))
    out_doc = fitz.open()
    try:
        for p_idx in range(len(doc)):
            page = doc[p_idx]
            if page.get_text("text").strip():
                rects = _find_text_rects(page, search_text, case_sensitive)
                out_doc.insert_pdf(doc, from_page=p_idx, to_page=p_idx)
                dest_page = out_doc[-1]
                for rect in rects:
                    annot = dest_page.add_rect_annot(rect)
                    annot.set_colors(stroke=(1, 0, 0))
                    annot.set_border(width=stroke_width)
                    annot.update()
            else:
                _append_scanned_page(out_doc, page, search_text, case_sensitive)

        if len(out_doc):
            out_doc.save(str(output_pdf), deflate=True, garbage=3)
        else:
            # Nothing was produced (input has no pages): write the input through unchanged.
            doc.save(str(output_pdf))
    finally:
        out_doc.close()
        doc.close()
    return output_pdf


def download(path_like):
    """Hand ``path_like`` (converted to ``str``) to ``files.download``."""
    target = str(path_like)
    files.download(target)


In [3]:
#Cell 3 — Upload the file you want to process
from google.colab import files
uploaded = files.upload()
if not uploaded:
    # A cancelled upload dialog yields an empty mapping; fail with a clear
    # message instead of a bare StopIteration from next(iter(...)).
    raise RuntimeError("No file was uploaded. Re-run this cell and choose a file.")
# Only the first uploaded file is processed by the cells below.
input_filename = next(iter(uploaded))
if len(uploaded) > 1:
    print("Multiple files uploaded; only the first one will be processed.")
print("Uploaded:", input_filename)
input_path = Path(input_filename)


Saving DOCTotalAmount.pdf to DOCTotalAmount.pdf
Uploaded: DOCTotalAmount.pdf


In [5]:
#Cell 4 — Set search phrase and run the annotation
SEARCH_TEXT = "Total Amount"
CASE_SENSITIVE = False

# File types handled by each branch below (compared against the lower-cased suffix).
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp"}
OFFICE_EXTENSIONS = {".docx", ".xlsx"}

ext = input_path.suffix.lower()
# PDFs and converted office files both end up as "<stem>_annotated.pdf" next to the input.
annotated_pdf_path = input_path.with_name(input_path.stem + "_annotated.pdf")

if ext == ".pdf":
    output_path = annotated_pdf_path
    print("Processing PDF...")
    annotate_pdf_search_colab(input_path, output_path, SEARCH_TEXT, case_sensitive=CASE_SENSITIVE)
    print("Annotated PDF saved to:", output_path)
elif ext in IMAGE_EXTENSIONS:
    # Images keep their original extension.
    output_path = input_path.with_name(input_path.stem + "_annotated" + ext)
    print("Processing image...")
    annotate_image_file(input_path, output_path, SEARCH_TEXT, case_sensitive=CASE_SENSITIVE)
    print("Annotated image saved to:", output_path)
elif ext in OFFICE_EXTENSIONS:
    # The intermediate PDF lives in a temporary directory that is removed when
    # the block exits; only the annotated result is kept.
    with tempfile.TemporaryDirectory() as tmpdir:
        print("Converting office file to PDF (requires LibreOffice)...")
        converted = convert_office_to_pdf(input_path, Path(tmpdir))
        output_path = annotated_pdf_path
        annotate_pdf_search_colab(converted, output_path, SEARCH_TEXT, case_sensitive=CASE_SENSITIVE)
        print("Converted + annotated PDF saved to:", output_path)
else:
    # A regular exception reads better in a notebook than SystemExit, which
    # IPython treats as an attempt to exit the kernel.
    raise ValueError("Unsupported file type: " + (ext or "<no extension>"))


Processing PDF...
Annotated PDF saved to: DOCTotalAmount_annotated.pdf


In [6]:
#Cell 5 — Download resulting file
# `output_path` is set by the annotation cell; announce it, then hand it to the download helper.
print(f"Attempting to download: {output_path}")
download(output_path)


Attempting to download: DOCTotalAmount_annotated.pdf


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>