In [81]:
from unstructured.partition.auto import partition
from unstructured.partition.pdf import partition_pdf
from google import genai
from google.genai import types
import numpy as np
import chromadb
from dotenv import load_dotenv, dotenv_values

### Data Extraction from PDF

In [82]:
file_path = 'examples/District-Improvement-Plan-2023-2026.pdf'

elements = partition(filename=file_path, strategy='hi_res')



In [83]:
# TODO images are not always extracted correctly  (maybe extract images from pdf and get text using dedicated process)

for el in elements:
    print(type(el))
    print(el.to_dict())

<class 'unstructured.documents.elements.Title'>
{'type': 'Title', 'element_id': 'b0a8edddcabe28a95b9f8c899d8f3c1f', 'text': 'Haverhill Public Schools District Improvement Plan 2024-2026', 'metadata': {'detection_class_prob': 0.4401351809501648, 'coordinates': {'points': ((np.float64(655.5390014648438), np.float64(341.9986265157064)), (np.float64(655.5390014648438), np.float64(455.46569734903966)), (np.float64(1540.025634765625), np.float64(455.46569734903966)), (np.float64(1540.025634765625), np.float64(341.9986265157064))), 'system': 'PixelSpace', 'layout_width': 2200, 'layout_height': 1700}, 'last_modified': '2025-07-29T14:43:36', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'file_directory': 'examples', 'filename': 'District-Improvement-Plan-2023-2026.pdf'}}
<class 'unstructured.documents.elements.Image'>
{'type': 'Image', 'element_id': 'f0a22290a168232bd400af9e6df7a6f4', 'text': 'Vision Statement In partnership with our community the Haverhill Public Schoo

In [84]:
# Tables
for el in elements:
    if(el.to_dict()['type'] == "Table"):
        print(el.text)
        print("*" * 20)

. . H igh fay Ren ct pe . Collaboration ___Growth Mindset__]
********************
1. Engagement Build strong relationships with each student, and provide each student with the academic, social, emotional, and behavioral health supports necessary for success Maintain a safe, supportive learning community in every school, promoting health and safety, a culture of learning, and respect for leaders and peers. Build out the Student Support Team (SST) meeting system. Engage student families and the community in promoting student success. Provide Social and Emotional Learning (SEL) support at level 1,2,3 Explore and implement restorative justice practices in our schools . Foster a diverse and inclusive learning community with active engagement and a sense of belonging for everyone. Implement the mental health support and referral system 2. Equitable Literacy 3. Academic Excellence Implement curriculum, teaching Work as a flexible and adaptable learning methods, and support resources organizat

### Embed Data

In [85]:
load_dotenv()

gemini_client = genai.Client()

text = [el.text for el in elements]
ids = [el.to_dict()['element_id'] for el in elements]
metadatas = [el.metadata for el in elements]


embeddings = [
    np.array(el.values) for el in gemini_client.models.embed_content(
        model="gemini-embedding-001",
        contents=text[0:100]).embeddings
]

print(embeddings)

[array([-0.01190747,  0.03718229,  0.00629802, ...,  0.00041152,
        0.03714312,  0.00088733], shape=(3072,)), array([-0.01955111,  0.03578507, -0.00951432, ..., -0.00501047,
        0.02567012, -0.00814625], shape=(3072,)), array([-0.02290774,  0.03115959, -0.00417629, ...,  0.00600593,
        0.01982936,  0.01385298], shape=(3072,)), array([-0.03708896,  0.01483862,  0.01611459, ..., -0.00824163,
        0.01788407,  0.00806787], shape=(3072,)), array([ 0.01690046, -0.00258094,  0.01746005, ...,  0.00046592,
        0.00349645, -0.00182337], shape=(3072,)), array([ 0.00621846,  0.01853588,  0.00786256, ..., -0.02499064,
       -0.00030855, -0.00268315], shape=(3072,)), array([ 0.00719599,  0.03422022,  0.00465892, ..., -0.00472026,
        0.00510955, -0.00553408], shape=(3072,)), array([-0.01283999,  0.03597717,  0.01117079, ...,  0.01298519,
       -0.00492289,  0.0107029 ], shape=(3072,)), array([ 0.00600577,  0.01859021, -0.00174446, ..., -0.01359437,
        0.00182832,  0.

### Store Embeddings in Chroma DB

In [88]:
chroma_client = chromadb.Client()
collection = chroma_client.create_collection(
    name="demo_collection",
    )
collection.add(
    embeddings=embeddings,
    ids=ids[0:100],
    documents=text[0:100]
)


### Retrieve from Chroma DB

In [97]:
query = "What are the five strategic priorities?"

embedded_query = [
    np.array(el.values) for el in gemini_client.models.embed_content(
    model="gemini-embedding-001",
    contents=query).embeddings
]

print(embedded_query)

results = collection.query(
    query_embeddings=embedded_query,
    n_results=10
)

print(results["documents"])

[array([-0.00184434,  0.01903366, -0.00155508, ...,  0.00888711,
        0.01316905,  0.02143721], shape=(3072,))]
[['Strategic Objective #2-Equity/Access/DEI- Priority 1, 2, 3, 4', 'Strategic Objective #1 Whole Student - Priority 1,2, 3', 'Strategic Objective #3 Research Based Data Driven - Priority 2 and 3', 'Haverhill Public Schools Strategic Priorities as developed by the Strategic Planning Subcommittee Haverhill School Committee', '1. Engagement Build strong relationships with each student, and provide each student with the academic, social, emotional, and behavioral health supports necessary for success Maintain a safe, supportive learning community in every school, promoting health and safety, a culture of learning, and respect for leaders and peers. Build out the Student Support Team (SST) meeting system. Engage student families and the community in promoting student success. Provide Social and Emotional Learning (SEL) support at level 1,2,3 Explore and implement restorative ju

### Generate Response

In [98]:
print(f"""
    Based on the following original documents, answer the provided question.

    Original Document Context:
    {results["documents"]}

    Question:
    {query}
    """)

response = gemini_client.models.generate_content(
    model="gemini-2.5-flash",
    contents=f"""
    Based on the following original documents, answer the provided question.

    Original Document Context:
    {results["documents"]}

    Question:
    {query}
    """
)

print(response.text)


    Based on the following original documents, answer the provided question.

    Original Document Context:
    [['Strategic Objective #2-Equity/Access/DEI- Priority 1, 2, 3, 4', 'Strategic Objective #1 Whole Student - Priority 1,2, 3', 'Strategic Objective #3 Research Based Data Driven - Priority 2 and 3', 'Haverhill Public Schools Strategic Priorities as developed by the Strategic Planning Subcommittee Haverhill School Committee', '1. Engagement Build strong relationships with each student, and provide each student with the academic, social, emotional, and behavioral health supports necessary for success Maintain a safe, supportive learning community in every school, promoting health and safety, a culture of learning, and respect for leaders and peers. Build out the Student Support Team (SST) meeting system. Engage student families and the community in promoting student success. Provide Social and Emotional Learning (SEL) support at level 1,2,3 Explore and implement restorative jus