# R&D Section Navigation

In [2]:
import os

from src.doc_ai.processors import DOC_AI_PROCESSOR
from src.doc_ai.processor import DocAIProcessor

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = '/Users/mkaczo/Desktop/Projects/iliOS/iliOS-DocAI/notebooks/prj-ilios-ai.json'
PROJECT_ID="602280418311"
DOC_AI_LOCATION="us"

processor: DocAIProcessor = DocAIProcessor(
    location=DOC_AI_LOCATION,
    project_id=PROJECT_ID,
    processor_id=DOC_AI_PROCESSOR["PROCESSOR"],
)

# Define the document to process
file_path = "gs://doc_ai_storage/site-lease/documents/Brixmor-Blue Sky Felicita Plaza Lease - 7_17_19 (BSU Sig_ Deed Included).pdf"
mime_type = "application/pdf"
field_mask = None  # Optional
pages = None  # Optional

# Process the document
processed_file = processor.process_document(
    file_path=file_path,
    mime_type=mime_type,
    field_mask=field_mask,
    pages=pages,
)


In [17]:
processed_file.get_all_text()[37605:39303]

'shall include specifications, repair and replacement details for the Roof and/or Building Repairs\nand the proposed materials and methods for the Roof and/or Building Repairs and which\nStructural Estimate shall be subject to the review and approval of the Landlord, which approval\nshall be in Landlord\'s sole and absolute discretion. The date of Landlord\'s written approval of\nthe Structural Estimate, provided to Tenant in accordance with Paragraph X.B of this Lease shall\nconstitute the "Structural Approval Date." Landlord\'s approvals of the Structural Analysis and\nStructural Estimate do not constitute a representation by Landlord that same are true and accurate\nor relieve Tenant from any of its obligations under this Lease, including but not limited to\nParagraph VIII.E of this Lease. If the Structural Approval Date does not occur within one-\nhundred and twenty (120) days of the Effective Date, either Landlord or Tenant may terminate\nthis Lease by written notice to the other.

In [30]:
processed_file.doc_ai_repr[0].pages[0].blocks[1].layout.bounding_poly

vertices {
  x: 661
  y: 904
}
vertices {
  x: 1094
  y: 904
}
vertices {
  x: 1094
  y: 928
}
vertices {
  x: 661
  y: 928
}
normalized_vertices {
  x: 0.375995457
  y: 0.397362649
}
normalized_vertices {
  x: 0.622298062
  y: 0.397362649
}
normalized_vertices {
  x: 0.622298062
  y: 0.407912076
}
normalized_vertices {
  x: 0.375995457
  y: 0.407912076
}

In [27]:
processed_file.get_all_text()[11:32]

'SITE LEASE AGREEMENT\n'

In [None]:
661, 904
1094,904
1094, 928
661, 928

In [79]:
from typing import Sequence
from google.cloud import documentai


def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Document AI identifies text in different parts of the document by their
    offsets in the entirety of the document"s text. This function converts
    offsets to a string.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    return "".join(
        text[int(segment.start_index) : int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )


def print_paragraphs(
        paragraphs: Sequence[documentai.Document.Page.Paragraph], text: str
) -> None:
    print(f"    {len(paragraphs)} paragraphs detected:")
    first_paragraph_text = layout_to_text(paragraphs[0].layout, text)
    print(f"        First paragraph text: {repr(first_paragraph_text)}")
    last_paragraph_text = layout_to_text(paragraphs[-1].layout, text)
    print(f"        Last paragraph text: {repr(last_paragraph_text)}")


def print_blocks(blocks: Sequence[documentai.Document.Page.Block], text: str) -> None:
    print(f"    {len(blocks)} blocks detected:")
    first_block_text = layout_to_text(blocks[0].layout, text)
    print(f"        First text block: {repr(first_block_text)}")
    last_block_text = layout_to_text(blocks[-1].layout, text)
    print(f"        Last text block: {repr(last_block_text)}")

In [80]:
from typing import Tuple, Dict


def get_text_coordinates(processed_file) -> Dict[str, Tuple[int, int, int, int]]:
    text = processed_file.get_all_text()
    blocks = []
    polygons = []
    pages = []
    for i, page in enumerate(processed_file.doc_ai_repr[0].pages):
        for block in page.blocks:
            blocks.append(layout_to_text(block.layout, text))
            pages.append(i)
            polygons.append(
                (
                    block.layout.bounding_poly.vertices[0].x,
                    block.layout.bounding_poly.vertices[0].y,
                    block.layout.bounding_poly.vertices[2].x,
                    block.layout.bounding_poly.vertices[2].y,
                )
            )
    return {"blocks": blocks, "polygons": polygons, "pages": pages}

In [81]:
coords = get_text_coordinates(processed_file)

In [83]:
print(len(coords['blocks']))
print(len(coords['polygons']))
print(len(coords['pages']))

187
187
187


In [84]:
from typing import Any


def map_text_to_coords(text: str, coords: Dict[str, Any]) -> Any:
    for i, block in enumerate(coords["blocks"]):
        if text in block:
            return coords["polygons"][i], coords["pages"][i]
    return None
    

In [85]:
map_text_to_coords("LEASE AGREEMENT", coords)

((661, 904, 1094, 928), 0)

## Draw bboxes

In [41]:
import fitz  # PyMuPDF
from PIL import Image, ImageDraw

# Load the PDF file
pdf_path = "example.pdf"
doc = fitz.open(pdf_path)

# Example bounding box coordinates (replace with actual coordinates)
bounding_boxes = [
    {"page": 1, "bbox": [661, 1094, 904, 928]},
    # Add more bounding boxes as needed
]

# Iterate through the bounding boxes and draw them on the PDF
for box in bounding_boxes:
    page = doc[box["page"]]
    rect = fitz.Rect(box["bbox"])
    page.insert(rect, color=(1, 0, 0), width=2)  # Red color with width 2

# Save the modified PDF
output_path = "output.pdf"
doc.save(output_path)
doc.close()

AttributeError: 'Page' object has no attribute 'insert'

In [49]:
doc[0].insert_textbox(fitz.Rect(100, 100, 200, 200), "Hello, World!", fontsize=12, color=(0, 0, 1))

79.92399954795837

In [55]:
type(doc[0])

pymupdf.Page

In [62]:
import fitz  # PyMuPDF
from PIL import Image, ImageDraw

# Load the PDF file
pdf_path = "example.pdf"
doc = fitz.open(pdf_path)
doc[0].add_rect_annot(fitz.Rect(361,304,694, 628))
output_path = "output.pdf"
doc.save(output_path)
doc.close()

In [63]:
x, x, y,y
[661, 1094, 904, 928]

NameError: name 'x' is not defined

In [None]:
vertices {
    x: 661
    y: 904
}
vertices {
    x: 1094
    y: 904
}
vertices {
    x: 1094
    y: 928
}
vertices {
    x: 661
    y: 928
}
normalized_vertices {
    x: 0.375995457
    y: 0.397362649
}
normalized_vertices {
    x: 0.622298062
    y: 0.397362649
}
normalized_vertices {
    x: 0.622298062
    y: 0.407912076
}
normalized_vertices {
    x: 0.375995457
    y: 0.407912076
}

In [68]:
x1 = 661*0.375
x2 = 1094*0.622
y1= 904*0.397
y2 = 928*0.407

In [69]:
x1

247.875

In [70]:
import fitz  # PyMuPDF
from PIL import Image, ImageDraw

# Load the PDF file
pdf_path = "example.pdf"
doc = fitz.open(pdf_path)
doc[0].add_rect_annot(fitz.Rect(x1,y1,x2, y2))
output_path = "output.pdf"
doc.save(output_path)
doc.close()