In [1]:
import argparse
import pathlib
from typing import List

import fitz  # PyMuPDF
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.vectorstores import Chroma

from langchain.text_splitter import MarkdownTextSplitter, MarkdownHeaderTextSplitter

import pymupdf4llm

from unstructured.partition.pdf import partition_pdf

from langchain_unstructured import UnstructuredLoader

from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title

from langchain_chroma import Chroma


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Source PDF used by the partitioning cell below. The commented-out line is an
# alternative document; swap which assignment is active to switch inputs.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
# pdf_path = "/Users/l.yao/Documents/MedDoc/local/Leave-Policy-Chapter-2-Annual-Leave-Procedure-V4-Rev-Jan2025.pdf"
pdf_path = "/Users/l.yao/Documents/MedDoc/local/Leave-Policy-Framework-W19-PAG-Nov24-Final-v3.pdf"

In [None]:
# loader = UnstructuredLoader(
#     file_path=[pdf_path],
#     strategy="auto",
#     partition_via_api=False
# )
# docs = loader.load()
# 
# [doc.page_content for doc in docs]

In [None]:
# Partition the PDF at `pdf_path` with unstructured's "hi_res" strategy.
# The result is passed to `chunk_by_title` in the next cell.
elements = partition_pdf(filename=pdf_path, strategy="hi_res")

In [None]:
# ocr_elements = partition_pdf(filename=pdf_path, strategy="ocr_only")
# fast_elements = partition_pdf(filename=pdf_path, strategy="fast")

In [None]:
# Chunk the partitioned elements with `chunk_by_title`.
# The tuning knobs are collected in one mapping so they are easy to find and tweak.
chunking_kwargs = {
    "max_characters": 1000,
    "combine_text_under_n_chars": 500,
    "multipage_sections": False,
    "overlap": 200,
}

chunks = chunk_by_title(elements, **chunking_kwargs)

In [None]:
# Inspect the first chunk's metadata to see which keys are available
# (the indexing cell below reads "filename" and "page_number").
chunks[0].metadata.to_dict()

In [None]:
# Parse chunks into parallel lists of texts and metadata, then index them in Chroma.

# Defined in this cell so that it runs on a fresh kernel (Restart & Run All);
# previously these names only existed if the later query cell had been run first.
# The values match the ones the query cell uses, so both point at the same store
# and embed with the same model.
CHROMA_PATH = "/Users/l.yao/Documents/MedDoc/backend/ingestion/chroma_db"
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Only these metadata fields are stored alongside each chunk.
METADATA_KEYS = ("filename", "page_number")

texts = []
metadatas = []

for chunk in chunks:
    chunk_metadata = chunk.metadata.to_dict()
    texts.append(chunk.text)
    # Copy only the keys that are actually present: a chunk without a page number
    # (or filename) no longer aborts the whole ingestion with a KeyError, and no
    # None values are handed to the vector store.
    metadatas.append(
        {
            key: chunk_metadata[key]
            for key in METADATA_KEYS
            if chunk_metadata.get(key) is not None
        }
    )

db = Chroma.from_texts(
    texts=texts,
    metadatas=metadatas,
    embedding=embeddings,
    persist_directory=CHROMA_PATH  # Optional: folder to save the DB
)

In [None]:
# Quick retrieval check against the freshly built index: fetch the top 3 matches.
query = "How many days in advance to take adoption leave?"
docs = db.similarity_search(query, k=3)

In [None]:
# Print each retrieved document, then its metadata, then a divider line.
divider = "\n\n" + "-"*80

for doc in docs:
    # One print with a newline separator emits the same text as three separate prints.
    print(doc.page_content, doc.metadata, divider, sep="\n")

In [None]:
from langchain.schema import HumanMessage, SystemMessage
import textwrap
import os

from langchain_core.messages.utils import count_tokens_approximately

from openai import OpenAI


# Open the persisted Chroma store and retrieve the chunks most similar to the question.
# NOTE(review): absolute, machine-specific path — consider making this configurable.
CHROMA_PATH = "/Users/l.yao/Documents/MedDoc/backend/ingestion/chroma_db"

# Must be the same embedding model that was used when the store was populated.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)
question = "How many days in advance to ask for adoption leave?"

docs = db.similarity_search(question, k=3)

# One block per retrieved chunk: its text followed by its metadata (filename / page
# number) so the model can cite the source. The previous f-string contained a literal
# " + " left over from string concatenation, which ended up inside the prompt.
context = "\n\n".join(
    f"{doc.page_content}\nSource: {doc.metadata}" for doc in docs
)

# System instructions for the model. The pieces are joined by implicit string-literal
# concatenation, so every sentence except the last must end with a space; otherwise
# sentences run together (e.g. "inventing one.When you have ...").
system_prompt = (
    "You are an HR assistant for hospital staff. "
    "Answer the user question based solely on the provided HR policy context. "
    "If the context does not contain the answer, reply that you do not know instead of inventing one. "
    "When you have the answer, please state the answer by adhering closely to the original text, and do not add any additional information. "
    "At the end of your answer, cite the page number and document name of the text that you used to answer the question."
)

# Build the user turn explicitly instead of dedenting an indented f-string template.
# `textwrap.dedent` ran *after* `context` was interpolated; because the retrieved text
# contains lines starting at column 0, the common margin was empty and the template's
# 8-space indentation was left in front of "Context:" and "Question:".
user_prompt = f"Context:\n{context}\n\nQuestion: {question}"

messages = [
    SystemMessage(content=system_prompt),
    HumanMessage(content=user_prompt),
]

# Approximate number of tokens in the full prompt (system + user message).
print(count_tokens_approximately(messages))

from langchain.chat_models import init_chat_model

# API key and model name are read from the environment; the model name falls back
# to "gpt-3.5-turbo" when OPENAI_MODEL is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-3.5-turbo")

# Chat model handle used by the `model.invoke(...)` cell below.
model = init_chat_model(OPENAI_MODEL, model_provider="openai", api_key=OPENAI_API_KEY)

In [None]:
# Display the retrieved documents for inspection.
docs

In [None]:
# Send the system + user messages to the chat model; the reply is kept in `response`.
response = model.invoke(messages)

In [None]:
# Same request as `model.invoke(messages)`, issued through the raw OpenAI SDK.
# The client was never instantiated before; create it here from the key read above.
client = OpenAI(api_key=OPENAI_API_KEY)

# The SDK expects plain {"role", "content"} dicts rather than LangChain message
# objects, so translate each message's LangChain type to the matching OpenAI role.
ROLE_BY_MESSAGE_TYPE = {"system": "system", "human": "user", "ai": "assistant"}
openai_messages = [
    {"role": ROLE_BY_MESSAGE_TYPE[message.type], "content": message.content}
    for message in messages
]

completion = client.chat.completions.create(
    model=OPENAI_MODEL,
    messages=openai_messages,
    max_tokens=None,
)

completion

In [None]:
# Old experiment: convert the PDF to Markdown and split it on headers.

# Open the PDF in a context manager so the document handle is released once the
# Markdown has been extracted.
with fitz.open(pdf_path) as doc:
    md_text = pymupdf4llm.to_markdown(doc)

# print(md_text)

# Split on Markdown heading prefixes. The earlier markers ("*", "**") are list/bold
# markers rather than heading prefixes, so the splitter keyed on bullet items instead
# of section titles — presumably why this test "didn't work very well".
# NOTE(review): assumes the generated Markdown marks headings with "#"/"##" — confirm
# against the actual `md_text` output.
splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#", "Header 1"), ("##", "Header 2")]
)

split_text = splitter.split_text(md_text)