In [1]:
from dotenv import load_dotenv

load_dotenv()

True

### Import

In [2]:
from llama_index.core import VectorStoreIndex, StorageContext, SimpleDirectoryReader
# from llama_index.storage.storage_context import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core.node_parser import SentenceSplitter
import logging

logging.basicConfig(level=logging.INFO)

### Load Documents

Using `SimpleDirectoryReader` to load the PDF and DOCX files.

In [3]:
import os

# The source directory can be overridden with the DATA_DIR environment variable
# (load_dotenv() has already run, so an entry in .env works too). The original
# local path stays as the fallback so existing runs behave the same.
DATA_DIR = os.getenv("DATA_DIR", "/home/kris/dev/syenza-docs/pdfs")

# Read every supported file found under DATA_DIR into Document objects.
documents = SimpleDirectoryReader(DATA_DIR).load_data()

# nodes = SentenceSplitter().get_nodes_from_documents(documents)

Trying all the documents to see how long it takes (and if it's possible)

In [4]:
# NOTE(review): this counts loaded Document objects, not files. Presumably the
# reader can emit several Documents per file (e.g. one per PDF page) — confirm
# before using this number to check that every file was picked up.
len(documents)

885

I guess I've got all of them... Do I? How can I verify that?

### Defining Text Splitters

In [11]:
from typing import Sequence
from llama_index.core.schema import BaseNode

def documents_to_nodes(documents, chunk_size: int = 1024, chunk_overlap: int = 200) -> Sequence[BaseNode]:
    """Split ``documents`` into nodes with a ``SentenceSplitter``.

    Args:
        documents: Documents to split (as returned by the directory reader).
        chunk_size: Chunk size passed to the splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.

    Returns:
        The nodes produced by the splitter for all given documents.
    """
    return SentenceSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).get_nodes_from_documents(documents)

In [6]:
# Chunk with the function defaults (chunk_size=1024, chunk_overlap=200).
nodes1024 = documents_to_nodes(documents)
len(nodes1024)

1365

In [12]:
# Smaller chunks: size 512 with an overlap of 50.
test_nodes512 = documents_to_nodes(documents, chunk_size=512, chunk_overlap=50)
len(test_nodes512)

2401

In [13]:
# Smallest chunks tried here: size 256 with an overlap of 25.
test_nodes256 = documents_to_nodes(documents, chunk_size=256, chunk_overlap=25)
len(test_nodes256)

4859

## Ingestion with Qdrant

Important: collections are created only once, during ingestion.

In [7]:
from qdrant_client import QdrantClient

# Single client for the Qdrant server at localhost:6333. create_index and the
# "Load Existing index" cell both use this module-level object.
client = QdrantClient(url="http://localhost:6333")

In [8]:
from typing import Sequence
from llama_index.core.schema import BaseNode

def create_index(nodes: Sequence[BaseNode], colection_name: str) -> VectorStoreIndex:
    """Index ``nodes`` into a Qdrant collection and return the resulting index.

    Uses the module-level ``client`` to reach Qdrant.

    Args:
        nodes: Nodes to store in the collection.
        colection_name: Name of the Qdrant collection backing the vector store.
            The misspelling is kept so existing keyword callers keep working.

    Returns:
        The ``VectorStoreIndex`` built over ``nodes``. The index used to be
        assigned to a local and dropped, so callers had to reload it from the
        vector store before they could query it.
    """
    vector_store = QdrantVectorStore(colection_name, client=client)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    return VectorStoreIndex(nodes=nodes, storage_context=storage_context)

In [12]:
def inject_docs(documents, chunk_size, chunk_overlap):
    """Chunk ``documents`` and index the chunks into their own collection.

    The collection is named ``mrvs-chunks-<chunk_size>-<chunk_overlap>`` and
    the name is printed before indexing starts. Nothing is returned.
    """
    chunks = documents_to_nodes(
        documents,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    collection_name = "mrvs-chunks-{}-{}".format(chunk_size, chunk_overlap)
    print(f"Generating index for collection: {collection_name}")
    create_index(nodes=chunks, colection_name=collection_name)

In [13]:
# (chunk_size, chunk_overlap) pairs to ingest; commented-out pairs are skipped.
sizes_and_overlaps = [
    # (1024,200),
    (1024,100),
    # (512,100),
    (512,50),
    # (256,50),
    (256,25)
]

# One collection per pair; inject_docs derives the collection name from the pair.
# NOTE(review): the saved output shows "mrvs-test-*" names while inject_docs now
# formats "mrvs-chunks-*", so the output looks stale — re-run to refresh it.
for cs, co in sizes_and_overlaps:
    inject_docs(documents, cs, co)

INFO:httpx:HTTP Request: GET http://localhost:6333/collections/mrvs-test-1024-100/exists "HTTP/1.1 200 OK"


Generating index for collection: mrvs-test-1024-100


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embedding

Generating index for collection: mrvs-test-512-50


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embedding

Generating index for collection: mrvs-test-256-25


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embedding

## Create Collections

### Chunks 1024

In [11]:
# nodes1024 was chunked with the defaults (size 1024, overlap 200).
create_index(nodes1024, "chunks1024")

INFO:httpx:HTTP Request: GET http://localhost:6333/collections/chunks1024/exists "HTTP/1.1 200 OK"


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embedding

### Chunks 512

In [14]:
# test_nodes512 was chunked with size 512, overlap 50.
create_index(test_nodes512, "chunks512")

INFO:httpx:HTTP Request: GET http://localhost:6333/collections/chunks512/exists "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com

### Chunks 256

In [15]:
# test_nodes256 was chunked with size 256, overlap 25.
create_index(test_nodes256, "chunks256")

INFO:httpx:HTTP Request: GET http://localhost:6333/collections/chunks256/exists "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com

### Load Existing index

I really struggled to find out how to create an `index` from an existing `vector_store`.

Found the solutions here in [docs](https://docs.llamaindex.ai/en/stable/understanding/storing/storing/#:%7E:text=Important%3A%20if%20you%20had%20initialized%20your%20index%20with%20a%20custom%20ServiceContext%20object%2C%20you%20will%20need%20to%20pass%20in%20the%20same%20ServiceContext%20during%20load_index_from_storage%2C%20or%20have%20it%20set%20as%20the%20global%20service%20context.).

To be more specific, this was the code block:
```python
import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

# initialize client
db = chromadb.PersistentClient(path="./chroma_db")

# get collection
chroma_collection = db.get_or_create_collection("quickstart")

# assign chroma as the vector_store to the context
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# load your index from stored vectors
index = VectorStoreIndex.from_vector_store(
    vector_store, storage_context=storage_context
)

# create a query engine
query_engine = index.as_query_engine()
response = query_engine.query("What is llama2?")
print(response)
```

In [5]:
# Point a vector store at the "chunks1024" collection created earlier and wrap
# it in an index. The log below shows only a collection-exists check, with no
# embedding requests.
vector_store = QdrantVectorStore("chunks1024", client=client)
index = VectorStoreIndex.from_vector_store(vector_store)


INFO:httpx:HTTP Request: GET http://localhost:6333/collections/chunks1024/exists "HTTP/1.1 200 OK"


### Index to Query Engine

In [24]:
# Query engine with default settings on top of the loaded index.
query_engine = index.as_query_engine()

In [25]:
# Ask one question. The log below shows an embedding request, a Qdrant search
# and a chat completion for this call.
response = query_engine.query("What is Mitris Resilia?")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/chunks1024/points/search "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [28]:
# Show only the generated answer text.
response.response

'Mitris Resilia is a mitral valve designed for use in replacement of native or prosthetic mitral heart valves. It is built on the Carpentier-Edwards PERIMOUNT valve platform and incorporates features to enhance ease of implantation. The valve is designed to offer enhanced tissue anticalcification technology that potentially allows it to last longer.'

### Index to Retriever

In [34]:
# Retrieval only: fetch the matching nodes without generating an answer
# (the log below shows an embedding request and a Qdrant search, no chat completion).
retriever = index.as_retriever()
returned_nodes = retriever.retrieve("What is Mitris Resilia?")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/chunks1024/points/search "HTTP/1.1 200 OK"


In [33]:
# Print each retrieved chunk, followed by a 50-character dashed rule.
for node in returned_nodes:
    print(node.text, "--" * 25, sep="\n")

MITRIS RESILIA Mitral Valve—
Features and Benefits
The MITRIS RESILIA mitral valve is the third product in a family of 
RESILIA tissue valves brought to you by Edwards Lifesciences.
The MITRIS RESILIA mitral valve is specifically designed for the 
mitral position.  
Indications:  For use in replacement of native or prosthetic mitral 
heart valves.
Enhanced  delivery 
experience
Posteromedial commissure 
mark (single black line), 
anterolateral commissure 
mark (double black line), 
and an anterior segment 
mark ("A" mark). The black 
commissure markers 
facilitate the orientation 
of the valve and help avoid 
obstruction of the left 
ventricular outflow tract by 
stent postsBuilt on the Carpentier-
Edwards PERIMOUNT valve 
platform—a platform with 
over 20 years of published 
clinical durability1
Nitinol stents fold down 
to 55°, allowing for ease 
of implantation; stents 
return to their original 
position when the valve is 
implantedHighlights
The MITRIS valve is built on the Carpent

#### Using `VectorIndexRetriever`

In [36]:
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

# Build the retriever explicitly so the number of results can be set
# (similarity_top_k=3), then wrap it in a query engine.
index_retriever = VectorIndexRetriever(index, similarity_top_k=3)
query_eng_retriever = RetrieverQueryEngine(index_retriever)

In [38]:
# Same question through the retriever-backed engine; print only the answer text.
response = query_eng_retriever.query("What is mitris resilia?")
print(response.response)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/chunks1024/points/search "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


MITRIS RESILIA is a mitral valve designed for the replacement of native or prosthetic mitral heart valves. It is indicated for single use only and should not be resterilized or reused. The valve should not be exposed to extreme temperatures, damaged during insertion, or exposed to solutions other than sterile physiological saline solution. Additionally, precautions should be taken to avoid leaflet tissue damage, oversizing, and passing surgical instruments across the valve. The MITRIS RESILIA valve is built on the Carpentier-Edwards PERIMOUNT valve platform and incorporates features to enhance ease of implantation and offer enhanced tissue anticalcification technology.


In [39]:
# Fetch the nodes the retriever-backed engine would use for this question and
# print each one, followed by a 50-character dashed rule.
retrieved_nodes = query_eng_retriever.retrieve("What is Mitris Resilia?")
for node in retrieved_nodes:
    print(node.text, "--" * 25, sep="\n")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/chunks1024/points/search "HTTP/1.1 200 OK"


MITRIS RESILIA Mitral Valve—
Features and Benefits
The MITRIS RESILIA mitral valve is the third product in a family of 
RESILIA tissue valves brought to you by Edwards Lifesciences.
The MITRIS RESILIA mitral valve is specifically designed for the 
mitral position.  
Indications:  For use in replacement of native or prosthetic mitral 
heart valves.
Enhanced  delivery 
experience
Posteromedial commissure 
mark (single black line), 
anterolateral commissure 
mark (double black line), 
and an anterior segment 
mark ("A" mark). The black 
commissure markers 
facilitate the orientation 
of the valve and help avoid 
obstruction of the left 
ventricular outflow tract by 
stent postsBuilt on the Carpentier-
Edwards PERIMOUNT valve 
platform—a platform with 
over 20 years of published 
clinical durability1
Nitinol stents fold down 
to 55°, allowing for ease 
of implantation; stents 
return to their original 
position when the valve is 
implantedHighlights
The MITRIS valve is built on the Carpent