In [1]:
# #%pip install -qU langchain-unstructured
# !pip install -qU "langchain-chroma>=0.1.2"
# import numpy as np
# import pandas as pd
# import bs4
import chromadb
from langchain import hub
from langchain_chroma import Chroma
# from langchain_community.document_loaders import WebBaseLoader
from uuid import uuid4
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
# from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
import fitz
import pickle
from langchain_openai import ChatOpenAI
import getpass
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Document as llamaDoc
from langchain.retrievers import BM25Retriever, EnsembleRetriever


In [2]:

# Reuse a key that is already present in the environment so the notebook can
# be re-run (Restart & Run All) without prompting; otherwise ask for it with
# getpass so the key is never echoed into the cell output.
openai_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("Enter API key for OpenAI: ")
os.environ["OPENAI_API_KEY"] = openai_key

In [3]:
# Embedding model shared by the index build (build_vector_db) and by the
# vector store that is re-opened for querying.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
)

In [5]:
def build_vector_db(persist_directory, collection_name, file_path, doc_chunks):
    '''Build a persistent Chroma vector database from a PDF and cache its chunks.

    Every page of the PDF at ``file_path`` is split into sentence-aware chunks.
    The chunks are embedded with the module-level ``embeddings`` model and
    written to the Chroma collection ``collection_name`` stored under
    ``persist_directory``.  The same chunk list is pickled to ``doc_chunks`` so
    that a keyword retriever can be rebuilt later without re-parsing the PDF.

    Args:
        persist_directory: Directory in which Chroma persists its data.
        collection_name: Name of the Chroma collection to create or reuse.
        file_path: Path of the PDF file to index.
        doc_chunks: Path of the pickle file that receives the list of chunks.

    Returns:
        None.
    '''
    # The client decides where data is stored, so it must be given the target
    # directory explicitly; a bare PersistentClient() uses its default path.
    persistent_client = chromadb.PersistentClient(path=persist_directory)

    # Chroma creates the collection on the supplied client if it is missing.
    vector_store = Chroma(
        client=persistent_client,
        collection_name=collection_name,
        embedding_function=embeddings,
    )

    # One splitter is enough; its configuration does not depend on the page.
    splitter = SentenceSplitter(chunk_size=256, chunk_overlap=20)

    # Reading each page and splitting into chunks
    documents = []
    idx = 0
    # The context manager closes the PDF handle even if splitting fails.
    with fitz.open(file_path) as pdf:
        for page_number, page in enumerate(pdf):
            nodes = splitter.get_nodes_from_documents(
                [llamaDoc(text=page.get_text())], show_progress=False)

            for j, node in enumerate(nodes):
                # Blank chunks (e.g. from image-only pages) carry nothing to
                # embed or to match on, so they are left out.
                if not node.text.strip():
                    continue

                idx += 1
                documents.append(Document(
                    page_content=node.text,
                    metadata={"source": "DSM5", 'page': page_number, 'chunk': j},
                    # Document ids are strings; a stable, position-based id
                    # lets a rebuild overwrite entries instead of duplicating.
                    id=str(idx),
                ))

    # Embed and store the chunks in moderately sized batches so a single
    # request to the embedding service / Chroma never grows unbounded.
    batch_size = 1000
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        vector_store.add_documents(
            documents=batch, ids=[doc.id for doc in batch])

    with open(doc_chunks, 'wb') as f:
        pickle.dump(documents, f)


In [6]:
# Locations of the source PDF and of the artefacts derived from it.
file_path = "../data/DSM5.pdf"
persist_directory = "../data/chroma"
doc_chunks = "../data/DSM_chunks"
collection_name = "DSM5_collection"

# Parse the PDF and write the chunk cache.
build_vector_db(
    persist_directory=persist_directory,
    collection_name=collection_name,
    file_path=file_path,
    doc_chunks=doc_chunks,
)

In [7]:
# Re-open the persisted store.  collection_name must match the name passed to
# build_vector_db; without it Chroma opens its default collection instead.
vector_store = Chroma(
    collection_name=collection_name,
    persist_directory=persist_directory,
    embedding_function=embeddings,
)

# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load a chunk cache that was produced locally by build_vector_db.
with open(doc_chunks, 'rb') as f:
    documents = pickle.load(f)

In [8]:
# Hybrid retrieval over the DSM-5 chunks: dense similarity search combined
# with BM25 keyword matching.
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k = 3  # BM25 contributes its top 3 chunks

vector_retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},  # the vector store contributes its top 4 chunks
)

# Merge both result lists, weighting the dense retriever 0.7 and BM25 0.3.
retriever = EnsembleRetriever(
    retrievers=[vector_retriever, keyword_retriever],
    weights=[0.7, 0.3],
)

# Chat model that writes the answer, and the RAG prompt pulled from the hub.
llm = ChatOpenAI(model="gpt-4o-mini")
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Return the page_content of every document, separated by blank lines."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


# Pipeline: build the prompt inputs, fill the prompt, call the chat model and
# reduce the model reply to a plain string.
rag_chain = (
    {
        "context": retriever | format_docs,  # retrieved chunks joined into one string
        "question": RunnablePassthrough(),   # the incoming question, unchanged
    }
    | prompt
    | llm
    | StrOutputParser()
)



  prompt = loads(json.dumps(prompt_object.manifest))


In [9]:
# Query that is sent through the RAG chain.
question = "What is the difference between bipolar disorder 1 and 2?"

In [10]:
# Run the RAG chain on `question` and show the generated answer.
# Alternative query to try: question = "What is ADHD?"
response = rag_chain.invoke(question)
print(response)

Bipolar disorder 1 is characterized by at least one manic episode, which may be preceded or followed by hypomanic or major depressive episodes. In contrast, bipolar disorder 2 involves at least one major depressive episode and at least one hypomanic episode, but does not include full-blown manic episodes. The key difference lies in the severity and presence of manic symptoms.


In [11]:
import fitz
import matplotlib.patches as patches
import matplotlib.pyplot as plt
from PIL import Image


def plot_pdf_with_boxes(pdf_page, segments):
    """Show a rendered PDF page with an outlined polygon per layout segment.

    Args:
        pdf_page: Page object whose ``get_pixmap()`` yields the page raster.
        segments: Iterable of dicts; each needs a "category" entry and a
            "coordinates" dict with "points", "layout_width" and
            "layout_height".

    The outline colour depends on the segment category ("Title", "Image",
    "Table"; anything else is drawn as "Text"), and a legend lists the
    categories that were drawn.
    """
    pixmap = pdf_page.get_pixmap()
    page_image = Image.frombytes(
        "RGB", [pixmap.width, pixmap.height], pixmap.samples
    )

    fig, ax = plt.subplots(1, figsize=(10, 10))
    ax.imshow(page_image)

    category_to_color = dict(Title="orchid", Image="forestgreen", Table="tomato")
    seen_categories = set()

    for segment in segments:
        coords = segment["coordinates"]
        points = coords["points"]
        layout_width = coords["layout_width"]
        layout_height = coords["layout_height"]

        # Rescale from the layout's coordinate system to pixmap pixels.
        scaled_points = []
        for x, y in points:
            scaled_points.append(
                (x * pixmap.width / layout_width, y * pixmap.height / layout_height)
            )

        category = segment["category"]
        seen_categories.add(category)
        outline = patches.Polygon(
            scaled_points,
            linewidth=1,
            edgecolor=category_to_color.get(category, "deepskyblue"),
            facecolor="none",
        )
        ax.add_patch(outline)

    # Legend: "Text" is always listed, the special categories only when present.
    legend_handles = [patches.Patch(color="deepskyblue", label="Text")]
    legend_handles.extend(
        patches.Patch(color=color, label=name)
        for name, color in category_to_color.items()
        if name in seen_categories
    )

    ax.axis("off")
    ax.legend(handles=legend_handles, loc="upper right")
    plt.tight_layout()
    plt.show()


def render_page(doc_list: list, page_number: int, print_text=True, pdf_path=None) -> None:
    """Plot one PDF page with its segment boxes and optionally print its text.

    Args:
        doc_list: Documents whose metadata holds a "page_number" entry plus
            the segment fields read by plot_pdf_with_boxes.
        page_number: 1-based page number to render.
        print_text: When true, print the page_content of each matching
            document after the plot.
        pdf_path: PDF to open; defaults to the notebook-level ``file_path``.

    NOTE(review): the chunks produced by build_vector_db store the page under
    the key 'page' (0-based) and carry no "coordinates"/"category" metadata,
    so this helper presumably expects documents from a layout-aware loader —
    confirm before passing those chunks in.
    """
    if pdf_path is None:
        pdf_path = file_path

    page_docs = [
        doc for doc in doc_list if doc.metadata.get("page_number") == page_number
    ]
    segments = [doc.metadata for doc in page_docs]

    # Keep the document open while the page is rendered, then release the
    # file handle instead of leaving it to the garbage collector.
    with fitz.open(pdf_path) as pdf:
        plot_pdf_with_boxes(pdf.load_page(page_number - 1), segments)

    if print_text:
        for doc in page_docs:
            print(f"{doc.page_content}\n")

INFO: Failed to extract font properties from /usr/share/fonts/google-noto-sans-mono-cjk-vf-fonts/NotoSansMonoCJK-VF.ttc: In FT2Font: Can not load face (SFNT font table missing; error code 0x8e)
INFO: Failed to extract font properties from /usr/share/fonts/google-noto-sans-cjk-vf-fonts/NotoSansCJK-VF.ttc: In FT2Font: Can not load face (SFNT font table missing; error code 0x8e)
INFO: Failed to extract font properties from /usr/share/fonts/google-noto-color-emoji-fonts/NotoColorEmoji.ttf: In FT2Font: Can not load face (unknown file format; error code 0x2)
INFO: Failed to extract font properties from /usr/share/fonts/google-noto-serif-cjk-vf-fonts/NotoSerifCJK-VF.ttc: In FT2Font: Can not load face (SFNT font table missing; error code 0x8e)
INFO: Failed to extract font properties from /usr/share/fonts/abattis-cantarell-vf-fonts/Cantarell-VF.otf: In FT2Font: Can not load face (SFNT font table missing; error code 0x8e)
INFO: generated new fontManager


In [10]:
!pip install -qU matplotlib PyMuPDF pillow

# Determining ideal chunk size

In [18]:
# NOTE(review): `text` is not assigned in any visible cell — presumably the
# extracted text of one PDF page from an earlier session; assign it explicitly
# before this cell so the notebook runs on a fresh kernel.
print(text)

38
Neurodevelopmental Disorders
factors (e.g., sensory impairment, severe problem behavior), the individual may be diag-
nosed with unspecified intellectual disability. Adaptive functioning may be difficult to
assess in a controlled setting (e.g., prisons, detention centers); if possible, corroborative in-
formation reflecting functioning outside those settings should be obtained.
Criterion B is met when at least one domain of adaptive functioning—conceptual, so-
cial, or practical—is sufficiently impaired that ongoing support is needed in order for the
person to perform adequately in one or more life settings at school, at work, at home, or in
the community. To meet diagnostic criteria for intellectual disability, the deficits in adap-
tive functioning must be directly related to the intellectual impairments described in Cri-
terion A. Criterion C, onset during the developmental period, refers to recognition that
intellectual and adaptive deficits are present during childhood or adole

In [31]:
# Document-level metadata of the PDF.
# NOTE(review): in the visible cells `docs` exists only as a local variable
# inside build_vector_db — presumably a fitz document opened in an earlier
# session; open it explicitly before this cell.
docs.metadata

{'format': 'PDF 1.6',
 'title': '',
 'author': '',
 'subject': '',
 'keywords': '',
 'creator': '',
 'producer': '',
 'creationDate': '',
 'modDate': '',
 'trapped': '',
 'encryption': None}

In [32]:
# Title entry of the PDF metadata.
docs.metadata['title']

''

In [33]:
# Number of pages, assuming `docs` is the opened PyMuPDF document — TODO confirm.
len(docs)

992

In [28]:
# Table of contents (outline) stored in the PDF, if any.
docs.get_toc()

[]

In [34]:
# First page of the document (index 0).
docs[0]

page 0 of DSM5_chatbot/data/DSM5.pdf

In [38]:
# Load the page at index 100 for a closer look.
p = docs.load_page(100)

In [41]:
# Text extracted from the page held in `p`.
p.get_text()

'56\nNeurodevelopmental Disorders\nsecond birthday (see also Rett syndrome in the section “Differential Diagnosis” for this\ndisorder).\nFirst symptoms of autism spectrum disorder frequently involve delayed language de-\nvelopment, often accompanied by lack of social interest or unusual social interactions (e.g.,\npulling individuals by the hand without any attempt to look at them), odd play patterns\n(e.g., carrying toys around but never playing with them), and unusual communication\npatterns (e.g., knowing the alphabet but not responding to own name). Deafness may be\nsuspected but is typically ruled out. During the second year, odd and repetitive behaviors\nand the absence of typical play become more apparent. Since many typically developing\nyoung children have strong preferences and enjoy repetition (e.g., eating the same foods,\nwatching the same video multiple times), distinguishing restricted and repetitive behav-\niors that are diagnostic of autism spectrum disorder can be dif

In [44]:
# NOTE(review): `i` is not assigned at notebook scope in any visible cell —
# presumably a leftover loop index; define it or remove this cell.
i

0

In [49]:
# NOTE(review): `text` is not assigned in any visible cell — presumably the
# result of a page's get_text() call; define it explicitly or remove this cell.
text

'56\nNeurodevelopmental Disorders\nsecond birthday (see also Rett syndrome in the section “Differential Diagnosis” for this\ndisorder).\nFirst symptoms of autism spectrum disorder frequently involve delayed language de-\nvelopment, often accompanied by lack of social interest or unusual social interactions (e.g.,\npulling individuals by the hand without any attempt to look at them), odd play patterns\n(e.g., carrying toys around but never playing with them), and unusual communication\npatterns (e.g., knowing the alphabet but not responding to own name). Deafness may be\nsuspected but is typically ruled out. During the second year, odd and repetitive behaviors\nand the absence of typical play become more apparent. Since many typically developing\nyoung children have strong preferences and enjoy repetition (e.g., eating the same foods,\nwatching the same video multiple times), distinguishing restricted and repetitive behav-\niors that are diagnostic of autism spectrum disorder can be dif

In [47]:
# NOTE(review): `doc` is not assigned at notebook scope in any visible cell;
# define it explicitly or remove this cell.
doc

{'format': 'PDF 1.6',
 'title': '',
 'author': '',
 'subject': '',
 'keywords': '',
 'creator': '',
 'producer': '',
 'creationDate': '',
 'modDate': '',
 'trapped': '',
 'encryption': None}