In [18]:
import openai
import pymupdf4llm
import pymupdf
import json
from fuzzysearch import find_near_matches
from langchain.text_splitter import MarkdownTextSplitter

In [34]:
# Shared OpenAI client; get_comments_for_doc uses it for chat-completion requests.
client = openai.Client()

In [29]:
# Read the number of pages in the PDF. The document is opened in a `with`
# block so its file handle is released as soon as the count is known, instead
# of leaving an anonymous open document behind.
with pymupdf.open("sample.pdf") as pdf:
    page_count = pdf.page_count

In [31]:
# Step 1: Convert PDF to markdown
# The PDF is converted in batches of PAGES_PER_CHUNK pages, producing one
# markdown string per batch.
PAGES_PER_CHUNK = 50
doc_chunks = [
    pymupdf4llm.to_markdown(
        "sample.pdf",
        # Clamp the final batch to the last page so that no page number beyond
        # the end of the document is requested.
        pages=range(first_page, min(first_page + PAGES_PER_CHUNK, page_count)),
    )
    # Iterate using the `page_count` value computed earlier; no `doc` object
    # has been created by any preceding cell.
    for first_page in range(0, page_count, PAGES_PER_CHUNK)
]

In [32]:
# Number of markdown chunks produced by the conversion above.
len(doc_chunks)

6

In [33]:
# System prompt for the risk-factor review. The model is asked to reply with a
# JSON object holding a "comments" list; get_comments_for_doc reads the
# "quote", "risk_factor" and "comment" keys of every entry in that list.
# The format example must not contain a trailing comma inside an object,
# because the prompt itself demands valid JSON.
PROMPT = """
The following is an Environmental Impact Statement (EIS).

Read it carefully and assess the following risk factors that may pose regulatory hurdles for the project:
1. Endangered Species
2. Tribal Lands
3. National Historic Sites

Cite and comment on specific text in the document that is most relevant to the risk factors above. Respond with the following format:

```json
{
    "comments" : [
        {
            "quote": "Some exact text from the document",
            "risk_factor": "Which of the above risk factors does this quote relate to?",
            "comment": "Describe how the quoted text could introduce regulatory burden related to the risk factor"
        },
        ...
    ]
}
```

Return only valid JSON. DO NOT include any other text in your response.
""".strip()

In [39]:
def get_comments_for_doc(doc, context=None, max_l_dist=5):
    """Ask the model for risk-factor comments on ``doc`` and anchor each quote.

    The document text is sent to the chat model together with ``PROMPT``. Every
    quote in the reply is then located in ``doc`` with a fuzzy search, so that
    the comment can be attached to a character span of the original text.

    Args:
        doc: Text of the document (or document chunk) to review.
        context: Currently unused. It is optional so that the function can be
            called with the document alone.
        max_l_dist: Maximum Levenshtein distance allowed between a quote
            returned by the model and the span it is matched to in ``doc``.

    Returns:
        A dict with two keys: ``"markdown"`` (the ``doc`` argument, unchanged)
        and ``"comments"``, a list of dicts. Each entry holds ``"quote"``
        (``start``, ``end`` and the matched ``text`` taken from ``doc``),
        ``"comment"`` and ``"metadata"`` (containing ``risk_factor``).
        Comments whose quote is empty or cannot be located in ``doc`` are
        left out.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
        KeyError: If the reply has no ``"comments"`` key, or an entry lacks
            ``"quote"``, ``"comment"`` or ``"risk_factor"``.
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": PROMPT},
            {"role": "user", "content": doc},
        ],
        response_format={"type": "json_object"},
    )

    response_object = json.loads(response.choices[0].message.content)

    comments = []
    for c in response_object["comments"]:
        quote = c["quote"]
        if not quote:
            # Nothing to anchor; the fuzzy search also rejects an empty pattern.
            continue

        # Perform fuzzy search to find approximate matches
        matches = find_near_matches(quote, doc, max_l_dist=max_l_dist)
        if not matches:
            # Best effort: drop comments whose quote cannot be located in doc.
            continue

        # Several spans may fall within the distance budget; keep the one with
        # the fewest edits rather than whichever happens to be listed first.
        closest_match = min(matches, key=lambda m: m.dist)

        comments.append(
            {
                "quote": {
                    "start": closest_match.start,
                    "end": closest_match.end,
                    # Store the text as it appears in doc, not the model's
                    # (possibly slightly different) rendition of it.
                    "text": doc[closest_match.start : closest_match.end],
                },
                "comment": c["comment"],
                "metadata": {
                    "risk_factor": c["risk_factor"],
                },
            }
        )

    return {
        "markdown": doc,
        "comments": comments,
    }

In [43]:
# Try the extraction on a single chunk (index 1) rather than on every chunk.
response_obj = get_comments_for_doc(doc_chunks[1])

In [44]:
# Number of comments whose quote could be located in the chunk.
len(response_obj["comments"])

5

In [45]:
# Save the result as JSON. The `with` block closes the file (flushing the data
# to disk) instead of leaving an anonymous file object open.
with open("sample_comments_2.json", "w") as f:
    chars_written = f.write(json.dumps(response_obj, indent=4))
chars_written  # number of characters written, shown as the cell output

138086

# Construct Markdown Output

In [103]:
# Build a {"markdown", "comments"} structure with the same keys that
# get_comments_for_doc returns, here holding a single comment entry.
# NOTE(review): md_text, start, end, document and response_object are not
# assigned at top level in any cell shown above, so this cell relies on state
# from elsewhere — confirm where they come from before re-running on a fresh
# kernel.
comments = {
    "markdown": md_text,
    "comments": [
        {
            "quote": {
                "start": start,
                "end": end,
                # Text of the quoted span, sliced out of `document`.
                "text": document[start:end],
            },
            "comment": response_object["comment"],
            "metadata": {
                "risk_factor": response_object["risk_factor"],
            },
        }
    ],
}

In [105]:
# Save the structure as JSON. The `with` block closes the file (flushing the
# data to disk) instead of leaving an anonymous file object open.
with open("sample_comments.json", "w") as f:
    chars_written = f.write(json.dumps(comments, indent=2))
chars_written  # number of characters written, shown as the cell output

838513

# Scratch (exploratory cells)

In [None]:
# Ask the model to summarize the first half of md_text, limiting the reply to
# 512 tokens.
# NOTE(review): md_text is not assigned in any cell shown in this notebook —
# confirm where it comes from.
client = openai.Client()
response = client.chat.completions.create(
  model="gpt-4o",
  messages=[
    {"role": "system", "content": "Summarize the provided document"},
    {"role": "user", "content": md_text[:len(md_text)//2]},
    ],
  max_tokens=512
)

In [None]:
# Show the message text of the first choice in the response.
print(response.choices[0].message.content)

In [None]:
# Size of md_text (presumably a string, so this is its character count).
len(md_text)

In [None]:
# Print the text of the page at index 12.
# NOTE(review): `doc` is only assigned in the cell that follows this one, so
# that cell has to be run first.
print(doc[12].get_text())

In [None]:
import pymupdf

# Open the sample PDF and print the plain text of its first two pages.
doc = pymupdf.open("sample.pdf")  # open a document

for page in doc[:2]:  # iterate the document pages
    text = page.get_text()  # get plain text (is in UTF-8)
    print(text)