## Milvus

In [None]:
import pymilvus
from pymilvus import MilvusClient
from pymilvus import connections
import ollama

In [8]:
client = MilvusClient("milvus_demo.db")

#### Create Collection

In [9]:
if client.has_collection(collection_name="demo_collection"):
    client.drop_collection(collection_name="demo_collection")

client.create_collection(
    collection_name="demo_collection",
    dimension=768
)

In [None]:
connections.connect(alias="default", host="127.0.0.1", port="19530")

In [2]:
collection_name = "demo"
dim = 800
collection = create_milvus_collection(collection_name, dim)

In [3]:
collection.describe

<bound method Collection.describe of <Collection>:
-------------
<name>: demo
<description>: Document embeddings
<schema>: {'auto_id': True, 'description': 'Document embeddings', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 800}}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 500}}], 'enable_dynamic_field': False}
>

#### List DB

In [8]:
from pymilvus import db, connections, Collection, utility
connections.connect(alias="default", host="127.0.0.1", port="19530")
db.list_database()

['default']

In [10]:
utility.list_collections

<function pymilvus.orm.utility.list_collections(timeout: Optional[float] = None, using: str = 'default') -> list>

### Test functions

In [116]:
import numpy as np
from pymilvus import connections, FieldSchema, CollectionSchema, Collection, DataType
from sentence_transformers import SentenceTransformer

def create_milvus_collection(collection_name, dim):
    connections.connect(alias="default", host="127.0.0.1", port="19530")
    utility.drop_collection(collection_name)

    fields = [
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, enable_dynamic_field=True, auto_id=True),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim),
        FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535)
    ]

    schema = CollectionSchema(fields, description="Document embeddings")
    collection = Collection(name=collection_name, schema=schema)

    index_params = {
        "metric_type":"L2",
        "index_type":"IVF_FLAT",
        "params":{"nlist":1024}
        }
    collection.create_index(
        field_name="embedding", 
        index_params=index_params
        )
    return collection



def load_documents_to_milvus(collection, documents, embedder_model="all-MiniLM-L6-v2"):
    embedder = SentenceTransformer(embedder_model)
    texts = [doc.page_content for doc in documents]
    embeddings = embedder.encode(texts)

    # print(type(embeddings))
    # embeddings = np.array(embeddings, dtype=np.float32).tolist()
   
    # Prepare data for insertion
    data = [
        # [i for i in range(len(texts))],  # Auto-generated IDs
        embeddings,
        texts
    ]

    # data = [
    #     {"name": "id",  "type":DataType.INT64, "values": [i for i in range(len(texts))]},
    #     {"name":"embedding", "type": DataType.FLOAT_VECTOR, "values":embeddings}
    # ]
    # data = [
    #     {"id": i, "embedding": embeddings[i]} for i in range(len(texts)) # Auto-generated IDs
    # ]
    # data = [
    #     [i for i in range(len(texts))],  # Auto-generated IDs
    #     embeddings
    # ]
    collection.insert(data)
    collection.load()

In [117]:
import os
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter


local_folder = "docs"
embedder_model = "all-MiniLM-L6-v2"
collection_name = "demo"

all_docs = []

for filename in os.listdir(local_folder):
    file_path = os.path.join(local_folder, filename)

    if os.path.isfile(file_path):
        try:
            loader = TextLoader(file_path)
            documents = loader.load()

            if documents:
                splitter = CharacterTextSplitter(chunk_size=650, chunk_overlap=50)
                docs = splitter.split_documents(documents)
                all_docs.extend(docs)
        except Exception as e:
            print(f"Could not process given document: {filename}")
            # st.error(f"Error loading file {filename}: {str(e)}")


# print(all_docs[0].page_content)
# # Connect to Milvus and create the collection
collection = create_milvus_collection(collection_name, dim=384)

# # Insert documents into Milvus
load_documents_to_milvus(collection, all_docs, embedder_model)

Could not process given document: CV_Karan_Yadav.pdf


In [126]:
query_results = collection.query(
    expr="",
    output_fields=["id", "embedding", "text"],
    limit=5  # Specify the number of results to retrieve
)

# Display the query results
for result in query_results:
    print(result["embedding"])
    print(result["text"])
    print()

[np.float32(0.023330953), np.float32(-0.08322821), np.float32(0.05691806), np.float32(-0.014981589), np.float32(0.023899054), np.float32(-0.010970449), np.float32(-0.047621105), np.float32(0.0068928795), np.float32(0.027451416), np.float32(-0.02604323), np.float32(-0.04605433), np.float32(0.006422947), np.float32(0.036969822), np.float32(0.0040245354), np.float32(0.0058321357), np.float32(0.044590637), np.float32(0.10430169), np.float32(0.04001954), np.float32(-0.07911644), np.float32(-0.09331284), np.float32(0.056112804), np.float32(0.08929409), np.float32(-0.007907887), np.float32(-0.0032908088), np.float32(0.0028501258), np.float32(0.052074444), np.float32(-0.00043003337), np.float32(0.009256244), np.float32(0.05408881), np.float32(-0.043794878), np.float32(0.055130575), np.float32(0.051674724), np.float32(0.01822354), np.float32(0.116131715), np.float32(-0.06061761), np.float32(0.06370466), np.float32(-0.10248386), np.float32(-0.027710002), np.float32(0.014741023), np.float32(-0.00

### Milvus retriever

In [20]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Milvus
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA

In [None]:
def create_milvus_retriever(
    collection_name,
    milvus_host="127.0.0.1",
    milvus_port="19530",
    embedder_model="all-MiniLM-L6-v2",
):
    try:
        embeddings = HuggingFaceEmbeddings(model_name=embedder_model)

        retriever = Milvus(
            embedding_function=embeddings,
            collection_name=collection_name,
            connection_args={"host": milvus_host, "port": milvus_port},
            vector_field="embedding"
        )

        return retriever
    except Exception as e:
        print(f"Error creating Milvus retriever: {e}")
        return None

In [18]:
retriever = create_milvus_retriever("multi_file_collection")

In [22]:
retriever.as_retriever()

VectorStoreRetriever(tags=['Milvus', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.milvus.Milvus object at 0x17b39bdd0>, search_kwargs={})

In [None]:
prompt = ChatPromptTemplate.from_template(
            """Answer the following query based on given context.
                Think step by step before querying a detailed answer.
                <context>
                {context}
                </context>
                Query:{question}
                  """
        )

llm = Ollama(model="llama2")

# document_chain = create_stuff_documents_chain(llm, prompt)
# retrieval_chain = create_retrieval_chain(retriever, document_chain)
# # response = llm.generate({"prompt": prompt})
# response = retrieval_chain.invoke(f"question: {query}")

retrieval_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever.as_retriever(),  
    chain_type="stuff",  
    return_source_documents=True, 
)