In [13]:
import psycopg
from llama_index.core import Document as LIDocument
from docling_core.types.doc.document import DoclingDocument
from llama_index.llms.openai_like import OpenAILike
from llama_index.core.response_synthesizers import TreeSummarize
from llama_index.core.text_splitter import TokenTextSplitter



In [14]:
import os

# Read the connection string from the environment so real credentials never
# have to be written into the notebook. The fallback is the local development
# database (default postgres/postgres on localhost) used when DATABASE_URL is
# not set.
conn_str = os.environ.get(
    "DATABASE_URL",
    "postgresql://postgres:postgres@localhost:5432/postgres",
)
# NOTE: echoing the value is only harmless for the local default; clear this
# cell's output before sharing if DATABASE_URL points at a real database.
conn_str

'postgresql://postgres:postgres@localhost:5432/postgres'

In [15]:
# Load the stored document payload for one document id from the `documents`
# table. The id is kept in a variable so it is easy to change and can be
# reported in the error below.
document_id = "2b763efe-6812-4881-9900-c37208deb8ea"

with psycopg.connect(conn_str) as conn:
    with conn.cursor() as cur:
        # Parameterized query: the id is bound by the driver, not interpolated.
        cur.execute("SELECT document FROM documents WHERE id = %s", (document_id,))
        rows = cur.fetchone()

# fetchone() yields None when no row matches; fail with a clear message rather
# than the opaque "'NoneType' object is not subscriptable" from rows[0].
if rows is None:
    raise LookupError(f"No document found with id {document_id!r}")

# Single-column SELECT, so the payload is the first (only) field of the row.
document_obj = rows[0]
document_obj

{'id_': '2b763efe-6812-4881-9900-c37208deb8ea',
 'text': '{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "The Lumberjack April 23 1975", "origin": {"mimetype": "application/pdf", "binary_hash": 15593282138383937568, "filename": "The Lumberjack April 23 1975.pdf"}, "furniture": {"self_ref": "#/furniture", "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "children": [{"$ref": "#/texts/0"}, {"$ref": "#/texts/1"}, {"$ref": "#/texts/2"}, {"$ref": "#/pictures/0"}, {"$ref": "#/texts/3"}, {"$ref": "#/texts/4"}, {"$ref": "#/texts/5"}, {"$ref": "#/texts/6"}, {"$ref": "#/texts/7"}, {"$ref": "#/texts/8"}, {"$ref": "#/texts/9"}, {"$ref": "#/texts/10"}, {"$ref": "#/texts/11"}, {"$ref": "#/texts/12"}, {"$ref": "#/pictures/1"}, {"$ref": "#/texts/13"}, {"$ref": "#/texts/14"}, {"$ref": "#/texts/15"}, {"$ref": "#/pictures/2"}, {"$ref": "#/texts/18"}, {"$ref": "#/texts/19"}, {"$ref": "#/texts/20"}, {"$ref": "#/texts/21"

In [16]:
import json

# Rebuild the LlamaIndex document from the stored dict. Its text field carries
# a JSON-serialized Docling document, which is decoded and validated here.
li_doc = LIDocument.from_dict(document_obj)
docling_payload = json.loads(li_doc.text)
dl_doc = DoclingDocument.model_validate(docling_payload)

# Markdown export of the Docling document
content = dl_doc.export_to_markdown()
content

',;\n\nThe Arcata Plaza was the  site  of  the Inaugural Fair, \'billed as a \'historic event\'\' to initiate the bus system into  the  community  last -Sunday. Council members donned their Sunday  best, the Sweet Adelines sang and the buses provided free rides to eager takers throughout the afternoon. Students  can ride the buses for 10 cents.\n\n## \'Juggling becomes  habit  fovrninn\'\n\n<!-- image -->\n\nby Tim  Heyne\n\n.\n\nThe  art  of  juggling  requires  far more  than  the  ability  to  balance  a checkbook. -\n\nTwo  HSU  students, Doug  Barnard and Mitch Craig, began a skills  exchange  course  to  instruct interested people in the art of keeping  more  than  one  object in the  air  at the  same  time.\n\n\'At  first  we  didn\'t  think people would take us seriously,\' explained Barnard. \'\'\'However, 12 students attended the first meeting  ready  to  go.\'\'\n\nCK  , California 9552) a  ball  from  one  hand  to  the  other. CONSEQUENTLY, the instructors decided to teac

In [None]:
# Token budget for the served model. A chunk must share the context window
# with the summarization prompt/query and with the tokens reserved for the
# generated answer, so the chunk size is derived from the window instead of
# being set to almost all of it. The margin also absorbs differences between
# the splitter's token counts and the model's own tokenizer (the splitter is
# not given the model's tokenizer here).
context_window = 32000
max_output_tokens = 1000
prompt_margin_tokens = 3000
max_tokens = context_window - max_output_tokens - prompt_margin_tokens

splitter = TokenTextSplitter(
    chunk_size=max_tokens,
)

llm = OpenAILike(
    model="meta-llama/Llama-3.2-3B-Instruct",
    api_key="fake",  # placeholder value passed to the local server configured below
    api_base="http://0.0.0.0:8000/v1",
    context_window=context_window,
    max_tokens=max_output_tokens,
    # OpenAILike treats the model as a plain completion model unless told
    # otherwise; an Instruct model should be called through the chat endpoint
    # so the server applies its chat template to the prompt.
    is_chat_model=True,
)

query_str = "Summarize the chunks listed above in a few sentences, capturing the main points and core meaning of the text. Be concise and clear in your summary."

# Split the content into manageable chunks
chunks = splitter.split_text(content)

chunks

[',;\n\nThe Arcata Plaza was the  site  of  the Inaugural Fair, \'billed as a \'historic event\'\' to initiate the bus system into  the  community  last -Sunday. Council members donned their Sunday  best, the Sweet Adelines sang and the buses provided free rides to eager takers throughout the afternoon. Students  can ride the buses for 10 cents.\n\n## \'Juggling becomes  habit  fovrninn\'\n\n<!-- image -->\n\nby Tim  Heyne\n\n.\n\nThe  art  of  juggling  requires  far more  than  the  ability  to  balance  a checkbook. -\n\nTwo  HSU  students, Doug  Barnard and Mitch Craig, began a skills  exchange  course  to  instruct interested people in the art of keeping  more  than  one  object in the  air  at the  same  time.\n\n\'At  first  we  didn\'t  think people would take us seriously,\' explained Barnard. \'\'\'However, 12 students attended the first meeting  ready  to  go.\'\'\n\nCK  , California 9552) a  ball  from  one  hand  to  the  other. CONSEQUENTLY, the instructors decided to tea

In [18]:
# Summarize the chunks recursively
summarizer = TreeSummarize(
    llm=llm,
)

# With no chunks there is nothing for the model to summarize; stop here with
# a clear error instead of sending an empty request to the LLM.
if not chunks:
    raise ValueError("No text chunks to summarize; the document content is empty.")

try:
    # Summarize the document content
    summary = summarizer.get_response(query_str, chunks)
except Exception as e:
    print(f"Error summarizing document: {e}")
    # Report the number and size of the chunks rather than their full text,
    # so a failure does not flood the cell output with the whole document.
    print("Chunk count:", len(chunks), "| chunk lengths (chars):", [len(c) for c in chunks])
    raise

summary

" Unfortunately, I don't see any text provided for me to summarize. Please provide the text you'd like me to summarize, and I'll be happy to assist you."