In [None]:
from src.preprocessing.doc_ai.processor import DocAIProcessor
import os
from google.cloud import documentai
from itertools import chain

# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = 'prj-ilios-ai.json'

# Simple processor 

In [None]:
# Send the lease PDF through the first ("simple") Document AI processor and
# preview the second paragraph (index 1) of the result.
# NOTE(review): file_path is an absolute path on one developer's machine, so this
# cell will not run elsewhere as-is — consider a path relative to the repository.
processor = DocAIProcessor(
    location="us",
    project_id="602280418311",
    processor_id="e977fdd46ee23308",
)
doc_sequence = processor.process_document(
    file_path="/Users/odeine/PycharmProjects/ilios-DocAI/data/documents/Site Lease - Novel - Bartel (ES).pdf",
)

print(f"Extracted text: \n\n {doc_sequence.get_paragraphs()[1]}")

In [None]:
# Keep the paragraph list in a variable so later cells can index into it
# without calling get_paragraphs() again.
all_paragraphs = doc_sequence.get_paragraphs()

In [None]:
# Length of whatever get_all_text() returns — presumably the character count of
# the full extracted text (TODO confirm the return type in DocAIProcessor's result class).
len(doc_sequence.get_all_text())

In [None]:
# Display the third entry (index 2) of the paragraph list from the simple processor.
all_paragraphs[2]

# Table processor 

In [None]:
# Second processor, pinned to an explicit processor version (the version id names
# a pretrained form parser). Its result is what the table-handling cells below use.
# The input path is relative, so this cell depends on the kernel's working directory.
table_processor = DocAIProcessor(
    location="us",
    project_id="602280418311",
    processor_id="d89e8046e872374",
    processor_version_id="pretrained-form-parser-v2.1-2023-06-26",
)
doc_sequence_table = table_processor.process_document(
    file_path="data/Site Lease - Novel - Bartel.pdf",
)

# Preview the second paragraph (index 1) of the result.
print(f"Extracted text: \n\n {doc_sequence_table.get_paragraphs()[1]}")

In [None]:
def concat_strings(strings, string_limit=700, paragraph_limit=200):
    """Greedily merge consecutive strings into larger chunks.

    Walks ``strings`` in order while maintaining a running buffer. An item is
    appended to the buffer (with no separator) only when BOTH hold:

    * the item itself is shorter than ``paragraph_limit`` characters, and
    * the buffer plus the item is at most ``string_limit`` characters.

    Otherwise the buffer is emitted (if non-empty) and the item becomes the
    start of a new buffer. Note that an item of ``paragraph_limit`` characters
    or more therefore never joins an existing buffer, but later short items may
    still be appended to it.

    Args:
        strings: Iterable of strings to merge, in order.
        string_limit: Maximum length a buffer may reach by appending.
        paragraph_limit: Items at least this long always start a new buffer.

    Returns:
        List of merged, non-empty strings in input order.
    """
    chunks = []
    buffer = ""

    for piece in strings:
        is_short = len(piece) < paragraph_limit
        still_fits = len(buffer) + len(piece) <= string_limit

        if is_short and still_fits:
            buffer += piece
            continue

        # The piece cannot be merged: flush what we have (never an empty
        # string) and let the piece open the next chunk.
        if buffer:
            chunks.append(buffer)
        buffer = piece

    # Flush the trailing chunk, if any.
    if buffer:
        chunks.append(buffer)

    return chunks

In [None]:
def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """Resolve a layout element to the text it covers.

    The layout does not carry its own text; it carries segments whose
    ``start_index`` / ``end_index`` are offsets into ``text``. Each segment is
    sliced out of ``text`` and the slices are concatenated in segment order.

    Args:
        layout: Layout whose ``text_anchor.text_segments`` are read.
        text: The full text the segment offsets refer to.

    Returns:
        The concatenated slices, or ``""`` when there are no segments.
    """
    pieces = []
    for segment in layout.text_anchor.text_segments:
        # Offsets are coerced with int() before slicing.
        start = int(segment.start_index)
        stop = int(segment.end_index)
        pieces.append(text[start:stop])
    return "".join(pieces)

def parse_table_to_lines(table: documentai.Document.Page.Table, text: str) -> str:
    """Render a table as plain text, one line per body row.

    The output starts with a ``"\\nTABLE:"`` marker line. Each body row becomes
    one line in which every cell is written as ``"<column name> <cell text>"``
    and the cells are joined with ``": "``. Column names come from the first
    header row only, with surrounding whitespace, colons and newlines removed.

    Tables without any header row are handled instead of raising
    ``IndexError``: their cells are written as bare values. Likewise, body
    cells that have no matching header cell (row wider than the header) are
    written as bare values rather than being silently dropped.

    Args:
        table: Table whose ``header_rows`` and ``body_rows`` are read.
        text: Full document text that the cell layouts index into.

    Returns:
        The rendered table, terminated by a newline.
    """
    lines = ["\nTABLE:"]

    # Only the first header row supplies column names; a table may have none.
    header_rows = list(table.header_rows)
    header_cells = header_rows[0].cells if header_rows else []
    column_names = [
        layout_to_text(cell.layout, text).strip().replace(':', '').replace("\n", '')
        for cell in header_cells
    ]

    for row in table.body_rows:
        parts = []
        for index, cell in enumerate(row.cells):
            cell_text = layout_to_text(cell.layout, text).replace("\n", '').strip()
            if index < len(column_names):
                # Prefix the value with its column name (kept even when the
                # name is empty, to match the established output format).
                parts.append(f"{column_names[index]} {cell_text}")
            else:
                # No header for this position: keep the value, unlabeled.
                parts.append(cell_text)
        lines.append(": ".join(parts))

    return "\n".join(lines) + "\n"

In [None]:
# Print a text rendering of every table found on the first page of the first
# document. Cell contents are resolved against that document's full text, so
# `text` must come from the same document as the tables.
text = doc_sequence_table.documents[0].text
for table in doc_sequence_table.documents[0].pages[0].tables:
    print(parse_table_to_lines(table, text))

In [None]:
def get_all_text(documents) -> str:
    """Return the text of all documents followed by their rendered tables.

    The ``text`` of every document is joined with newlines. Then every table
    on every page of every document is rendered with ``parse_table_to_lines``
    (using the owning document's text), the renderings are joined with
    newlines, and the result is appended directly after the document text.

    Args:
        documents: Documents exposing ``text`` and ``pages``; each page
            exposes ``tables``. Iterated twice, in the same order.

    Returns:
        The combined string.
    """
    body_text = "\n".join(document.text for document in documents)

    # Flatten document -> page -> table in a single pass, preserving order.
    table_blocks = [
        parse_table_to_lines(table, document.text)
        for document in documents
        for page in document.pages
        for table in page.tables
    ]

    return body_text + "\n".join(table_blocks)

In [None]:
# Build the combined text (document text followed by the rendered tables) for the
# table-processor result; the returned string is shown as this cell's output.
get_all_text(doc_sequence_table.documents)