# Information
- Milvus running on local host. Need to change it when placing on cloud
- Only one PDF was loaded into milvus, so need to update it


### Information on architecture built
- MILVUS STORAGE name: `medical_knowledge_base_v2`
- embedding model: `sentence-transformers/all-MiniLM-L6-v2`

# ignore

In [None]:
# Disabled draft cell: the triple-quoted string below is never executed, so none of
# these imports, the logger, or the constants exist at runtime.
# NOTE(review): MILVUS_COLLECTION_NAME here is "medical_knowledge_base_v2", while the
# live cells further down use collection_name = "medical_knowledge_base_v1" — confirm
# which collection is intended before re-enabling this draft.
'''from pathlib import Path
import os
import logging
from typing import List, Set, Dict, Any
import time

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Document loading
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Vector database
from pymilvus import connections, utility
from pymilvus.orm.collection import Collection
from pymilvus.exceptions import MilvusException
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Milvus

# Constants
MILVUS_COLLECTION_NAME = "medical_knowledge_base_v2"
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200'''

In [None]:
# Disabled draft cell (string literal, never executed). The quoted connect_to_milvus()
# drops any existing "default" connection, reconnects to host:port, lists collections
# as a connectivity check, and returns True/False instead of raising.
# NOTE(review): the inner bare `except:` would also swallow KeyboardInterrupt and
# SystemExit — narrow it to `except Exception:` if this cell is re-enabled.
'''def connect_to_milvus(host: str = "localhost", port: str = "19530") -> bool:
    """Establish connection to Milvus server with error handling."""
    try:
        # First disconnect to reset any existing connections
        try:
            connections.disconnect("default")
        except:
            pass  # Ignore if no connection exists
        
        # Connect to Milvus
        connections.connect(host=host, port=port)
        
        # Verify connection by listing collections
        utility.list_collections()
        logger.info(f"✅ Connected to Milvus server at {host}:{port}")
        return True
    except Exception as e:
        logger.error(f"❌ Failed to connect to Milvus: {str(e)}")
        return False'''

In [None]:
# Disabled draft cell (string literal, never executed). The quoted load_pdfs() logs the
# PDF files found under data_directory, loads them through DirectoryLoader with
# PyMuPDFLoader, and returns the loaded documents.
# NOTE(review): any exception is logged and turned into an empty list, so a caller
# cannot tell "no PDFs found" apart from "loading failed".
'''def load_pdfs(data_directory: str) -> List:
    """Load PDF documents with error handling."""
    try:
        # Find PDFs on disk
        pdf_files = list(Path(data_directory).rglob("*.pdf"))
        logger.info(f"Found {len(pdf_files)} PDF files:")
        for p in pdf_files:
            logger.info(f"  {p.name}")
        
        # Load every page as a Document
        loader = DirectoryLoader(
            data_directory, 
            glob="**/*.pdf", 
            loader_cls=PyMuPDFLoader)
        
        docs = loader.load()
        logger.info(f"✅ Loaded {len(docs)} pages from your PDFs")
        return docs
    except Exception as e:
        logger.error(f"❌ Error loading PDFs: {str(e)}")
        return []'''

In [None]:
# Disabled draft cell (string literal, never executed).
# NOTE(review): this is an exact duplicate of the earlier disabled connect_to_milvus()
# draft cell in this notebook; one of the two copies can be removed.
'''def connect_to_milvus(host: str = "localhost", port: str = "19530") -> bool:
    """Establish connection to Milvus server with error handling."""
    try:
        # First disconnect to reset any existing connections
        try:
            connections.disconnect("default")
        except:
            pass  # Ignore if no connection exists
        
        # Connect to Milvus
        connections.connect(host=host, port=port)
        
        # Verify connection by listing collections
        utility.list_collections()
        logger.info(f"✅ Connected to Milvus server at {host}:{port}")
        return True
    except Exception as e:
        logger.error(f"❌ Failed to connect to Milvus: {str(e)}")
        return False'''

In [None]:
# Disabled draft cell (string literal, never executed). The quoted get_existing_sources()
# loads the collection, pages through it with limit/offset in batches of 10000, and
# returns the set of distinct "source" values (an empty set on any error).
# NOTE(review): Milvus restricts offset + limit for query() (16384 according to the
# Milvus docs — confirm for the deployed version). If that applies, the second page
# (offset=10000, limit=10000) would fail, the broad except would return an empty set,
# and every document would be treated as new. Prefer Collection.query_iterator if this
# draft is revived.
'''def get_existing_sources(collection_name: str) -> Set[str]:
    """Retrieve existing document sources from Milvus collection with pagination."""
    try:
        col = Collection(collection_name)
        col.load()
        
        # Get total count of entities
        count = col.num_entities
        logger.info(f"Total entities in collection: {count}")
        
        # Use pagination to handle large collections
        batch_size = 10000
        offset = 0
        all_sources = set()
        
        while offset < count:
            results = col.query(
                expr="",
                output_fields=["source"],
                limit=batch_size,
                offset=offset
            )
            sources = {row["source"] for row in results if "source" in row}
            all_sources.update(sources)
            offset += batch_size
            
        logger.info(f"✅ Retrieved {len(all_sources)} unique source paths from Milvus")
        return all_sources
    except Exception as e:
        logger.error(f"❌ Error retrieving existing sources: {str(e)}")
        return set()'''

In [None]:
# Disabled draft cell (string literal, never executed). The quoted
# filter_and_chunk_documents() keeps only documents whose metadata["source"] is not in
# existing_sources, splits them with RecursiveCharacterTextSplitter, and returns the
# chunks (an empty list when nothing is new or when any error occurs).
'''def filter_and_chunk_documents(docs, existing_sources: Set[str], chunk_size: int, chunk_overlap: int):
    """Filter out already processed documents and chunk new ones."""
    try:
        # Filter out docs whose source PDF is already in Milvus
        new_docs = [doc for doc in docs if doc.metadata["source"] not in existing_sources]
        logger.info(f"🛡  Skipping {len(docs) - len(new_docs)} pages already uploaded")
        logger.info(f"🔄  Will process {len(new_docs)} new pages")
        
        if not new_docs:
            logger.info("No new documents to process.")
            return []
            
        # Chunk only new pages
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        chunks = splitter.split_documents(new_docs)
        logger.info(f"✅  Created {len(chunks)} chunks from new documents")
        return chunks
    except Exception as e:
        logger.error(f"❌ Error filtering and chunking documents: {str(e)}")
        return []'''

In [None]:
# Disabled draft cell (string literal, never executed). The quoted add_to_vector_store()
# embeds the chunks with a HuggingFace model and inserts them into Milvus, logging the
# elapsed time; it returns True on success and False when there are no chunks or an
# error occurs.
# NOTE(review): any exception raised while adding to the existing collection is treated
# as "collection does not exist" and followed by Milvus.from_documents with all chunks;
# if the first attempt failed part-way, that could insert some chunks twice.
'''def add_to_vector_store(chunks, collection_name: str, embedding_model: str, host: str, port: str):
    """Add document chunks to Milvus vector store with error handling."""
    if not chunks:
        logger.info("No chunks to add to vector store.")
        return False
        
    try:
        # Initialize embedding model
        start_time = time.time()
        embedder = HuggingFaceEmbeddings(model_name=embedding_model)
        logger.info(f"Initialized embedder: {embedding_model}")
        
        try:
            # Try to connect to an existing Milvus collection
            vector_store = Milvus(
                embedding_function=embedder,
                collection_name=collection_name,
                connection_args={"host": host, "port": port}
            )
            vector_store.add_documents(chunks)
            logger.info(f"✅ Added {len(chunks)} chunks to existing collection '{collection_name}'")
        except Exception as e:
            logger.info(f"Creating new collection: {str(e)}")
            # If collection doesn't exist, create it and insert
            vector_store = Milvus.from_documents(
                documents=chunks,
                embedding=embedder,
                collection_name=collection_name,
                connection_args={"host": host, "port": port}
            )
            logger.info(f"✅ Created new collection '{collection_name}' with {len(chunks)} chunks")
        
        # Log performance metrics
        elapsed_time = time.time() - start_time
        logger.info(f"Processing took {elapsed_time:.2f} seconds ({len(chunks)/elapsed_time:.2f} chunks/second)")
        
        return True
    except Exception as e:
        logger.error(f"❌ Error adding chunks to vector store: {str(e)}")
        return False'''

In [None]:
# Disabled draft cell (string literal, never executed). The quoted main() connects to
# Milvus, loads the PDFs, fetches already-stored sources, chunks the new documents,
# inserts them, and finally logs the collection's entity count.
# NOTE(review): data_directory is a machine-specific absolute path, and `success` is
# assigned but never read, so a failed insert is not reported by main().
'''def main():
    """Main function to orchestrate the PDF loading and vector storage process."""
    # Configuration - update these for cloud deployment
    data_directory = "/Users/brunamedeiros/Documents/University of Chicago/Spring 2025 - Capstone I/FINAL DATASET"
    host = "localhost"  # Change for cloud deployment
    port = "19530"      # Change for cloud deployment
    
    # Connect to Milvus
    if not connect_to_milvus(host, port):
        logger.error("Exiting due to Milvus connection failure")
        return
    
    # Load PDFs
    docs = load_pdfs(data_directory)
    if not docs:
        logger.error("Exiting due to PDF loading failure")
        return
    
    # Get existing sources from Milvus
    existing_sources = get_existing_sources(MILVUS_COLLECTION_NAME)
    
    # Filter and chunk documents
    chunks = filter_and_chunk_documents(docs, existing_sources, CHUNK_SIZE, CHUNK_OVERLAP)
    
    # Add to vector store
    success = add_to_vector_store(chunks, MILVUS_COLLECTION_NAME, EMBEDDING_MODEL, host, port)
    
    # Confirm total stored vectors
    try:
        collection = Collection(MILVUS_COLLECTION_NAME)
        logger.info(f"✅ Total vectors stored: {collection.num_entities}")
    except Exception as e:
        logger.error(f"❌ Error checking collection size: {str(e)}")
'''

In [None]:
# Disabled draft cell (string literal, never executed): script-style entry point for
# the draft main().
# NOTE(review): presumably `__name__` is "__main__" inside a notebook kernel, so
# un-quoting this cell would run main() immediately — confirm before re-enabling.
'''if __name__ == "__main__":
    main()'''

--- 
# Loading PDFs

In [1]:
from pathlib import Path
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Embed + Store in Milvus
from pymilvus import connections
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Milvus
from pymilvus import connections, utility
from pymilvus import Collection

# Visualization 
from pymilvus.orm.collection import Collection
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA

In [2]:
# Folder that is searched recursively for PDFs (machine-specific absolute path).
data_directory = "/Users/Kyler/Codestuff/Capstone/papers"
# Milvus collection this notebook reads from and writes to.
# NOTE(review): the notes at the top of the notebook name `medical_knowledge_base_v2`
# — confirm which collection is intended.
collection_name = "medical_knowledge_base_v1"

In [3]:
# Recursively collect every PDF under the data directory and list the file names
pdf_files = [*Path(data_directory).rglob("*.pdf")]
print(f"Found {len(pdf_files)} PDF files:")
for pdf_path in pdf_files:
    print("  ", pdf_path.name)

Found 30 PDF files:
   LI-RADS US Surveillance v2024 Core.pdf
   Diagnostic Imaging Genitourinary ( PDFDrive ).pdf
   Radiology Illustrated_ Hepatobiliary and Pancreatic Radiology ( PDFDrive ).pdf
   Diagnostic Imaging_ Abdomen_ Published by Amirsys® ( PDFDrive ).pdf
   fujita-et-al-2019-hyperintense-liver-masses-at-hepatobiliary-phase-gadoxetic-acid-enhanced-mri-imaging-appearances-and.pdf
   LI-RADS CTMR Radiation TRA v2024 Core.pdf
   Liver imaging _ MRI with CT correlation ( PDFDrive ).pdf
   Reporting _ RADS - Reporting and Data Systems Support.pdf
   LIRADS Lexicon Table.pdf
   Getting Started _ RADS - Reporting and Data Systems Support.pdf
   RoleofMRIinEvaluationofSpectrumofLiverLesionsinCirrhoticPatients-JAPI.pdf
   Imaging Features _ RADS - Reporting and Data Systems Support.pdf
   CT and MRI of the Whole Body, 2-Volume Set, 6e, Volume I ( PDFDrive ).pdf
   Mayo Clinic Gastrointestinal Imaging Review ( PDFDrive ).pdf
   Management _ RADS - Reporting and Data Systems Support.p

In [5]:
# Load every page as a Document
loader = DirectoryLoader(
    data_directory, 
    glob="**/*.pdf", 
    loader_cls=PyMuPDFLoader)
docs = loader.load()
print(f"\nLoaded {len(docs)} pages from your PDFs")


Loaded 7520 pages from your PDFs


In [6]:
# Convert full path into just filename
for doc in docs:
    path = Path(doc.metadata["source"])
    doc.metadata["source"] = path.name

# Embeddings

In [7]:
# Connect to Milvus
connections.disconnect("default")  # force reset
connections.connect(host="localhost", port="19530")
#print(utility.list_collections())  # confirm it

- pull all existing `source` field values from Milvus collection
- Drop any page whoe `doc.metadata["source"]` matches one of those, to never re-add the same PDF page twice
- split *only* new pages into `chunks`

In [8]:
#  Deduplication: only keep docs with new 'source'
if utility.has_collection(collection_name):
    col      = Collection(collection_name)
    existing = col.query(expr="", output_fields=["source"], limit=16384)
    seen     = {r["source"] for r in existing}
    new_docs = [doc for doc in docs if doc.metadata["source"] not in seen]
    print(f"🛡  Skipping {len(docs) - len(new_docs)} pages already uploaded")
else:
    new_docs = docs
    print("⚡ No existing collection — processing all pages")

⚡ No existing collection — processing all pages


In [9]:
# Disabled draft cell (string literal; evaluating it only echoes the text as the cell
# output). It is an older version of the deduplication step in the In [8] cell above.
# NOTE(review): limit=100000 is presumably above the maximum Milvus allows for a single
# query() — confirm before reviving this draft.
'''# query existing 'source' metadata from Milvus
col = Collection("medical_knowledge_base_v1")
existing = col.query(expr="", output_fields=["source"], limit=100000)
already_uploaded_sources = {row["source"] for row in existing}

# filter out docs whose source PDF is already in Milvus
new_docs = [doc for doc in docs if doc.metadata["source"] not in already_uploaded_sources]
print(f"🛡  Skipping {len(docs) - len(new_docs)} pages already uploaded")
print(f"🔄  Will re-chunk {len(new_docs)} new pages")'''

'# query existing \'source\' metadata from Milvus\ncol = Collection("medical_knowledge_base_v1")\nexisting = col.query(expr="", output_fields=["source"], limit=100000)\nalready_uploaded_sources = {row["source"] for row in existing}\n\n# filter out docs whose source PDF is already in Milvus\nnew_docs = [doc for doc in docs if doc.metadata["source"] not in already_uploaded_sources]\nprint(f"🛡  Skipping {len(docs) - len(new_docs)} pages already uploaded")\nprint(f"🔄  Will re-chunk {len(new_docs)} new pages")'

In [10]:
# re-chunk only new pages
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(new_docs)
print(f"🔄  {len(chunks)} chunks ready for ingestion")

🔄  20021 chunks ready for ingestion


In [11]:
# Initialize Hugging Face embedder
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

try:
    # Try to connect to an existing Milvus collection
    vector_store = Milvus(
        embedding_function=embedder,
        collection_name=collection_name,
        connection_args={"host": "localhost", "port": "19530"}
    )
    vector_store.add_documents(chunks)
    print(f"✅ Added {len(chunks)} chunks to existing collection '{collection_name}'.")
except Exception:
    # If collection doesn't exist, create it and insert
    vector_store = Milvus.from_documents(
        documents=chunks,
        embedding=embedder,
        collection_name=collection_name,
        connection_args={"host": "localhost", "port": "19530"}
    )
    print(f"✅ Created new collection '{collection_name}' with {len(chunks)} chunks.")

✅ Created new collection 'medical_knowledge_base_v1' with 20021 chunks.


In [12]:
# Confirm total stored vectors
collection = Collection("medical_knowledge_base_v1")
print("✅ Total vectors stored:", collection.num_entities)

✅ Total vectors stored: 13000


In [13]:
# Sanity check: name of PDFs processed INTO milvus
results = collection.query(expr="", output_fields=["source"], limit=16_384)
pdfs = sorted({r["source"] for r in results})
print("Processed PDFs:")
for pdf in pdfs:
    print(" -", pdf)

Processed PDFs:
 - CT and MRI of the Whole Body, 2-Volume Set, 6e, Volume I ( PDFDrive ).pdf
 - Diagnostic Categories _ RADS - Reporting and Data Systems Support.pdf
 - Diagnostic Imaging Genitourinary ( PDFDrive ).pdf
 - Diagnostic Imaging_ Abdomen_ Published by Amirsys® ( PDFDrive ).pdf
 - Diagnostic Imaging_ Gastrointestinal ( PDFDrive ).pdf
 - Gastrointestinal Imaging_ The Requisites (Requisites in Radiology) 3rd ed ( PDFDrive ).pdf
 - Genitourinary Radiology_ Radiology Requisites Series ( PDFDrive ) (1).pdf
 - Getting Started _ RADS - Reporting and Data Systems Support.pdf
 - Imaging Features _ RADS - Reporting and Data Systems Support.pdf
 - LI-RADS 2018 Core.pdf
 - LI-RADS CTMR Nonradiation TRA v2024 Core.pdf
 - LI-RADS CTMR Radiation TRA v2024 Core.pdf
 - LI-RADS US Surveillance v2024 Core.pdf
 - LIRADS Lexicon Table.pdf
 - Liver imaging _ MRI with CT correlation ( PDFDrive ).pdf
 - Management _ RADS - Reporting and Data Systems Support.pdf
 - Mayo Clinic Gastrointestinal Imagi