In [12]:
print("testing")

testing


In [13]:
%pwd

'/Users/georgelin/Desktop/Projects/medical_chat_LLM/research'

In [14]:
import os
from pathlib import Path

# Resolve the project root so that relative paths (such as "data") work
# no matter which folder the kernel was started in.
current_dir = Path.cwd()
print(f"Current directory: {current_dir}")

if current_dir.name == 'research':
    # The notebook sits in <root>/research, so the root is one level up.
    project_root = current_dir.parent
else:
    # Otherwise pick the nearest folder (the current one first, then each
    # ancestor) that contains both `data` and `app.py`. When none qualifies
    # we stay where we are.
    search_path = (current_dir, *current_dir.parents)
    project_root = next(
        (
            candidate
            for candidate in search_path
            if (candidate / 'data').exists() and (candidate / 'app.py').exists()
        ),
        current_dir,
    )

print(f"Project root: {project_root}")
os.chdir(str(project_root))

Current directory: /Users/georgelin/Desktop/Projects/medical_chat_LLM/research
Project root: /Users/georgelin/Desktop/Projects/medical_chat_LLM


In [15]:
%pwd

'/Users/georgelin/Desktop/Projects/medical_chat_LLM'

In [16]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [17]:
# extract text from PDF files
def load_pdf_files(data):
    """
    Load the PDF files of a directory through LangChain's DirectoryLoader.

    Args:
        data: Directory to scan; passed to DirectoryLoader together with the
            "*.pdf" glob pattern.

    Returns:
        The result of DirectoryLoader.load(), with PyPDFLoader used as the
        per-file loader class.
    """
    pdf_directory_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_directory_loader.load()

In [18]:
# Verify data folder exists and load PDFs
data_path = "data"
if not os.path.exists(data_path):
    # Nothing to load: report where we looked so the working directory can be fixed.
    # NOTE(review): `extracted_data` is not assigned on this path, so cells that
    # use it will fail until the data folder is found.
    print(f"❌ Data directory not found at: {os.path.abspath(data_path)}")
    print(f"Current directory: {os.getcwd()}")
    print(f"Available items: {os.listdir('.')}")
else:
    print(f"✅ Found data directory: {os.path.abspath(data_path)}")
    # Informational listing only; the actual loading is done by load_pdf_files.
    pdf_files = list(filter(lambda name: name.endswith('.pdf'), os.listdir(data_path)))
    print(f"PDF files in data: {pdf_files}")
    extracted_data = load_pdf_files(data_path)
    print(f"✅ Successfully loaded {len(extracted_data)} documents")

✅ Found data directory: /Users/georgelin/Desktop/Projects/medical_chat_LLM/data
PDF files in data: ['5812IFU.pdf']
✅ Successfully loaded 500 documents


In [19]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Reduce each document to its text plus its 'source' metadata entry.

    Args:
        docs: Documents to reduce; they are read but not modified.

    Returns:
        A new list holding one freshly built Document per input, in the same
        order. Each keeps the original page_content and a metadata dict whose
        only key is "source" (None when the input metadata lacks that key).
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [20]:
minimal_docs = filter_to_minimal_docs(extracted_data)

In [21]:
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """
    Split documents into smaller, slightly overlapping chunks.

    Args:
        minimal_docs: Documents to split; passed to split_documents().
        chunk_size: Upper bound on chunk length given to
            RecursiveCharacterTextSplitter. Defaults to 500, the value that
            was previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks given to the
            splitter. Defaults to 20, the value that was previously hard-coded.

    Returns:
        The chunks produced by RecursiveCharacterTextSplitter.split_documents().
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [22]:
texts_chunk = text_split(minimal_docs)
print(f"num of chunks: {len(texts_chunk)}")

num of chunks: 1611


In [23]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Create the HuggingFace embeddings object used by this notebook.

    Returns:
        A HuggingFaceEmbeddings instance configured for the
        "sentence-transformers/all-MiniLM-L6-v2" model.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

embedding = download_embeddings()

In [24]:
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [25]:
# embed_query is the synchronous call and returns the embedding directly.
# aembed_query is its async counterpart: called without `await` it leaves
# `vector` holding an un-awaited coroutine instead of a list of floats.
vector = embedding.embed_query("Hello world")

In [26]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [27]:
def _require_env(name: str) -> str:
    """
    Return the value of the environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty. Without this check a
            missing key only surfaces as an opaque TypeError when None is
            assigned into os.environ.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"{name} is not set. Add it to your .env file or export it before running this cell."
        )
    return value


PINECONE_API_KEY = _require_env("PINECONE_API_KEY")
OPENAI_API_KEY = _require_env("OPENAI_API_KEY")

# Write the keys back so libraries that read them straight from the
# environment see the same values.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [28]:
from pinecone import Pinecone
pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)

In [29]:
from pinecone import ServerlessSpec

index_name = "medical-lab-chatbot"

# Create the serverless index only when it does not exist yet; later runs reuse it.
index_exists = pc.has_index(index_name)
if not index_exists:
    pc.create_index(
        name=index_name,
        dimension=384,  # matches the 'word_embedding_dimension': 384 reported by the embedding model
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle to the index, whether it was just created or already present.
index = pc.Index(index_name)

In [30]:
from langchain_pinecone import PineconeVectorStore

# from_documents embeds and upserts every chunk each time it runs, so re-running
# this cell against an already populated index stores the same chunks again and
# retrieval starts returning duplicates. Upload only when the index is empty;
# otherwise attach to what is already stored.
# To refresh the contents, delete the index first or use docsearch.add_documents().
if index.describe_index_stats().total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=texts_chunk,
        embedding=embedding,
        index_name=index_name
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding
    )

  return forward_call(*args, **kwargs)


In [31]:
from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

In [None]:
# option to add more data to existing data index
"""
dswith = Document(
    page_content="placeholder for future documents",
    metadata={"source": "lab"}
)

docsearch.add_documents(documents=[dswith])
"""


In [32]:
# `search_kwargs` is the retriever option that carries `k`. The previous spelling
# (`search_kwarts`) is not a recognised option, so the top-3 limit never took effect.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [33]:
retrived_docs = retriever.invoke("How do I order QC?")
retrived_docs

  return forward_call(*args, **kwargs)


[Document(id='1f238c1f-589d-42ea-a466-e15a2b046a06', metadata={'source': 'data/5812IFU.pdf'}, page_content='when there is no order (requisition) available for a sample, for example with a\nsample ID read error. In the QC tab, you can program default QC profiles\n(numbers 87 to 98) for each sample type and Group. The default QC profile is\nthe automatic QC order (requisition) made after a reagent check.\nd. In Profile Name, select a profile.\ne. Select the test. The system displays the selected tests in blue.\nf. Confirm that the information is correct, and then select Confirm (F1).'),
 Document(id='62c42e98-896c-436b-bc01-eb3334ae397c', metadata={'source': 'data/5812IFU.pdf'}, page_content='Figure 2.34 Rack Requisition: QC Screen\n2 In Type, select the sample type.\nThe system displays the tests automatically ordered (requisitioned) for QC in blue.\nNOTE\nTests are automatically ordered (requisitioned) for QC after the following:\n— You perform a reagent check. This orders (requisition

In [34]:
from langchain_openai import ChatOpenAI

chatModel = ChatOpenAI(model="gpt-5-mini")



In [35]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [36]:
# System message for the question-answering chain. "{context}" is a template
# placeholder; presumably it is filled with the retrieved document text by the
# documents chain — confirm against the chain's configuration.
_system_prompt_parts = [
    "You are an expert in medical laboratory analyzers for question-answering tasks. ",
    "Use the following pieces of retrieved context to answer ",
    "the question. If you don't know the answer, say that you ",
    "don't know. Use bullets or complete sentences depending on the question and keep the ",
    "answer concise.",
    "\n\n",
    "{context}",
]
system_prompt = "".join(_system_prompt_parts)


# Two-message chat template: the system instructions, then the user's question
# supplied through the "{input}" placeholder.
chat_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)

In [37]:
# Build the answering step from the chat model and the prompt template.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
# Combine the retriever with the answering step; the result is invoked with an
# {"input": ...} dict and its "answer" key is read in the next cell.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [41]:
response = rag_chain.invoke({"input": "How do I program a custom reagent?"})

print(response["answer"])

  return forward_call(*args, **kwargs)


Brief step-by-step (based on the analyzer reagent-management functions and reagent constraints):

- Open the Reagent Management / Reagent Setting screen (Reagent Management main tab).
- Create a new reagent entry (or edit an existing one).
- Set the Reagent Setting Method to "Turn table method."
- Select reagent Type: Normal concentration reagent or Highly concentrated reagent.
- Set the Number of Reagent Steps (1–3).
- Enter reagent volumes for each step in 1 µL increments. Observe these limits:
  - Normal dispensing range: 10–170 µL per dispense.
  - R1 ≤ 170 µL; R2 ≤ 170 µL.
  - Total R1 + R2 ≤ 270 µL.
  - For 3‑step reagents: R1‑1 + R1‑2 ≤ 170 µL.
- Assign the physical bottle position on the turn table and enter ID/lot/expiry information.
- Save the reagent definition.
- Place the bottle in the assigned position and run Reagent Check (F5) — the system will detect the bottle, read the reagent ID and calculate remaining volume.
- Note: the system dispenses with a micro‑syringe that h