# Run Milvus DB from Docker 

In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"] = os.getenv("LANGCHAIN_PROJECT")
os.environ["LANGCHAIN_TRACING_V2"] = "true"

In [3]:
pdf_folder = "/Users/kamal/Desktop/AgenticAI/Uploads"
text_output_dir = "output/text"
image_output_dir = "output/images"
table_output_dir = "output/tables"

In [4]:
os.makedirs(text_output_dir, exist_ok=True)
os.makedirs(image_output_dir, exist_ok=True)
os.makedirs(table_output_dir, exist_ok=True)

In [5]:
import fitz
import pdfplumber
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document



In [6]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

len(embedding_model.embed_query("hello AI"))

1536

In [7]:
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text

In [8]:
def extract_images_from_pdf(pdf_path, base_filename):
    doc = fitz.open(pdf_path)
    for page_num, page in enumerate(doc):
        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            pix = fitz.Pixmap(doc, xref)
            if pix.n < 5:
                pix.save(f"{image_output_dir}/{base_filename}_p{page_num+1}_img{img_index+1}.png")
            else:
                pix = fitz.Pixmap(fitz.csRGB, pix)
                pix.save(f"{image_output_dir}/{base_filename}_p{page_num+1}_img{img_index+1}.png")
            pix = None
    doc.close()

In [9]:
def extract_tables_from_pdf(pdf_path):
    table_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                df = pd.DataFrame(table)
                table_text += df.to_string(index=False) + "\n\n"
    return table_text

In [10]:
from langchain_milvus import Milvus
from pymilvus import connections, Collection
import time

URI = "tcp://localhost:19530"

connections.connect("default", host="localhost", port="19530")

vector_store = Milvus(
    embedding_function=embedding_model,
    connection_args={"uri": URI},
)

2025-06-04 12:47:34,060 [DEBUG][_create_connection]: Created new connection using: fccd45bb651b4da882c6f9b34be68d6c (async_milvus_client.py:599)


In [11]:
collections_info = {
    "my_collection_flat": {
        "index_type": "FLAT",
        "params": {},
        "search_param": {"metric_type": "COSINE", "params": {}}
    },
    "my_collection_hnsw": {
        "index_type": "HNSW",
        "params": {"M": 8, "efConstruction": 64},
        "search_param": {"metric_type": "COSINE", "params": {"ef": 64}}
    },
    "my_collection_ivf": {
        "index_type": "IVF_FLAT",
        "params": {"nlist": 128},
        "search_param": {"metric_type": "COSINE", "params": {"nprobe": 10}}
    }
}

In [12]:
from pymilvus import Collection, FieldSchema, CollectionSchema, DataType, utility

def create_collection(name):
    fields = [
        FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=64),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=1536),
        FieldSchema(name="metadata", dtype=DataType.JSON)
    ]
    schema = CollectionSchema(fields, description="PDF chunks embeddings")
    collection = Collection(name, schema)
    return collection

def create_index(collection, index_type: str, params: dict):
    collection.release()
    collection.drop_index()
    collection.create_index(
        field_name="embedding",
        index_params={
            "index_type": index_type,
            "metric_type": "COSINE",
            "params": params
        }
    )
    collection.load()
    print(f"{collection.name} index {index_type} created.")

collections = {}
existing_collections = utility.list_collections()

for name, info in collections_info.items():
    if name in existing_collections:
        collections[name] = Collection(name)
    else:
        collections[name] = create_collection(name)
    create_index(collections[name], info["index_type"], info["params"])

my_collection_flat index FLAT created.
my_collection_hnsw index HNSW created.
my_collection_ivf index IVF_FLAT created.


In [13]:
for filename in os.listdir(pdf_folder):
    if filename.lower().endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder, filename)
        base_name = os.path.splitext(filename)[0]
        print(f"📄 Processing: {filename}")

        text = extract_text_from_pdf(pdf_path)
        # tables = extract_tables_from_pdf(pdf_path)
        full_text = text

        with open(f"{text_output_dir}/{base_name}.txt", "w", encoding="utf-8") as f:
            f.write(full_text)

        extract_images_from_pdf(pdf_path, base_name)

        chunks = text_splitter.split_text(full_text)

        docs = [
            Document(
                page_content=chunk,
                metadata={"file_name": filename, "chunk_id": f"{base_name}_{i}"}
            )
            for i, chunk in enumerate(chunks)
        ]

        texts = [doc.page_content for doc in docs]
        embeddings = embedding_model.embed_documents(texts)
        ids = [doc.metadata["chunk_id"] for doc in docs]

        for collection in collections.values():
            collection.insert([
            ids,
            embeddings,
            [
                {
                    "file_name": filename,
                    "chunk_id": chunk_id,
                    "text": chunk_text  # add the actual chunk text here!
                }
                for chunk_id, chunk_text in zip(ids, texts)
            ]
        ])

            collection.flush()
            collection.load()

        print(f"✅ {filename} -> {len(chunks)} chunks embedded and stored in all collections.")
        print(f"🔹 First chunk: {chunks[0][:60]}...")

print("🚀 Extraction and embedding complete.")

📄 Processing: llama2.pdf
✅ llama2.pdf -> 289 chunks embedded and stored in all collections.
🔹 First chunk: Llama 2: Open Foundation and Fine-Tuned Chat Models
Hugo Tou...
📄 Processing: VVKR - EPC Contract.pdf


IndexError: list index out of range

In [None]:
# Load collections
flat_col = Collection("my_collection_flat")
hnsw_col = Collection("my_collection_hnsw")
ivf_col = Collection("my_collection_ivf")

# Load into memory
flat_col.load()
hnsw_col.load()
ivf_col.load()

In [None]:
def search_collection(collection, query_vector, top_k=5):
    search_params = {
        "my_collection_flat": {"metric_type": "COSINE", "params": {}},
        "my_collection_hnsw": {"metric_type": "COSINE", "params": {"ef": 64}},
        "my_collection_ivf": {"metric_type": "COSINE", "params": {"nprobe": 10}},
    }

    start = time.time()
    results = collection.search(
        data=[query_vector],
        anns_field="embedding",        
        param=search_params[collection.name],
        limit=top_k,
        output_fields=["metadata"]      
    )
    end = time.time()
    
    return results[0], end - start

In [None]:
import numpy as np

query_vector = np.random.rand(1536).tolist()

# Search all 3 collections
flat_results, flat_time = search_collection(flat_col, query_vector)
hnsw_results, hnsw_time = search_collection(hnsw_col, query_vector)
ivf_results, ivf_time = search_collection(ivf_col, query_vector)

# Show results
print(f"FLAT time: {flat_time:.4f}s")
print(f"HNSW time: {hnsw_time:.4f}s")
print(f"IVF time: {ivf_time:.4f}s")

FLAT time: 0.0108s
HNSW time: 0.0034s
IVF time: 0.0031s


In [None]:
import numpy as np
from pymilvus import Collection

def cosine_similarity(vec1, vec2):
    """Compute cosine similarity between two vectors."""
    vec1 = np.array(vec1)
    vec2 = np.array(vec2)
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

def search_with_cosine_similarity(collection: Collection, query_vector, top_k=5):
    """Search a Milvus collection and print cosine similarity scores."""
    
    # Set search parameters based on collection name
    search_params_map = {
        "my_collection_flat": {"metric_type": "COSINE", "params": {}},
        "my_collection_hnsw": {"metric_type": "COSINE", "params": {"ef": 64}},
        "my_collection_ivf":  {"metric_type": "COSINE", "params": {"nprobe": 10}},
    }
    
    search_params = search_params_map.get(collection.name, {"metric_type": "COSINE", "params": {}})
    
    # Run the vector search
    results = collection.search(
        data=[query_vector],
        anns_field="embedding",
        param=search_params,
        limit=top_k,
        output_fields=["metadata", "embedding"]
    )
    
    # Process results
    print(f"\n📦 Collection: {collection.name}")
    for i, hit in enumerate(results[0]):
        metadata = hit.entity.get("metadata", "")
        embedding = hit.entity.get("embedding", None)
        
        if embedding:
            cos_sim = cosine_similarity(query_vector, embedding)
            print(f"Result {i+1}:")
            print(f" - Metadata (preview): {str(metadata)[:80]}...")
            print(f" - Cosine Similarity: {cos_sim:.4f}")
        else:
            print(f" - ⚠️ No embedding returned. You may need to fetch it manually by ID.")


In [None]:
from langchain_core.prompts import PromptTemplate

prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful assistant. Use the following context from a PDF to answer the question.

Context:
{context}

Question:
{question}

Answer in clear, elaborated and concise language. If the answer is not in the context, say you don't know.
"""
)

In [None]:
def build_context_from_hits(hits, top_k=5):
    # Extract top-k metadata as context text
    return "\n\n".join([str(hit.entity.get("metadata", "")) for hit in hits[:top_k]])

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnableMap

llm = ChatOpenAI(model="gpt-4o-mini")

# Build the chain
qa_chain = prompt_template | llm

In [None]:
question = "How many parameters are present in Llama?"

embedder = OpenAIEmbeddings()
query_vector = embedding_model.embed_query(question)

hits, _ = search_collection(ivf_col, query_vector)
context = build_context_from_hits(hits, top_k=5)

# Run the chain
response = qa_chain.invoke({"context": context, "question": question})
print("\n🔮 GPT-4o-mini Answer:\n", response.content)


🔮 GPT-4o-mini Answer:
 Llama 2 comes in several variants with different parameter sizes: 7 billion (7B), 13 billion (13B), and 70 billion (70B) parameters. Additionally, there are 34 billion (34B) variants that were trained but are not being released. Thus, the models range from 7B to 70B parameters.


In [None]:
search_with_cosine_similarity(flat_col, query_vector)
search_with_cosine_similarity(hnsw_col, query_vector)
search_with_cosine_similarity(ivf_col, query_vector)


📦 Collection: my_collection_flat
Result 1:
 - Metadata (preview): {'file_name': 'llama2.pdf', 'chunk_id': 'llama2_10', 'text': 'adopted grouped-qu...
 - Cosine Similarity: 0.6361
Result 2:
 - Metadata (preview): {'file_name': 'llama2.pdf', 'chunk_id': 'llama2_288', 'text': 'applications of L...
 - Cosine Similarity: 0.6204
Result 3:
 - Metadata (preview): {'file_name': 'llama2.pdf', 'chunk_id': 'llama2_14', 'text': 'and grouped-query ...
 - Cosine Similarity: 0.6086
Result 4:
 - Metadata (preview): {'file_name': 'llama2.pdf', 'chunk_id': 'llama2_7', 'text': '(such as BLOOM (Sca...
 - Cosine Similarity: 0.5845
Result 5:
 - Metadata (preview): {'file_name': 'llama2.pdf', 'chunk_id': 'llama2_1', 'text': 'Angela Fan Melanie ...
 - Cosine Similarity: 0.5825

📦 Collection: my_collection_hnsw
Result 1:
 - Metadata (preview): {'file_name': 'llama2.pdf', 'chunk_id': 'llama2_10', 'text': 'adopted grouped-qu...
 - Cosine Similarity: 0.6361
Result 2:
 - Metadata (preview): {'file_name': 'llama2.p

In [None]:
print("Attention is a mechanism in neural networks that allows the model to focus on specific parts of the input sequence when generating an output. It maps a query and a set of key-value pairs to an output, where the output is computed as a weighted sum of the values, with weights determined by a compatibility function comparing the query with the corresponding keys. In the \"Scaled Dot-Product Attention\" approach, the dot products of the query with all keys are computed, scaled by the dimension of the keys, and passed through a softmax function to obtain the weights on the values. This mechanism enables the model to selectively prioritize relevant information, aiding in tasks like translation, summarization, and more.")

Attention is a mechanism in neural networks that allows the model to focus on specific parts of the input sequence when generating an output. It maps a query and a set of key-value pairs to an output, where the output is computed as a weighted sum of the values, with weights determined by a compatibility function comparing the query with the corresponding keys. In the "Scaled Dot-Product Attention" approach, the dot products of the query with all keys are computed, scaled by the dimension of the keys, and passed through a softmax function to obtain the weights on the values. This mechanism enables the model to selectively prioritize relevant information, aiding in tasks like translation, summarization, and more.


In [None]:
print("Multihead Attention is a mechanism that consists of several attention layers running in parallel. It allows the model to attend to different parts of the input sequence differently, capturing various relationships and dependencies within the data. In this approach, multiple sets of queries, keys, and values are created, enabling the model to gather information from multiple perspectives at once, enhancing its ability to understand complex patterns in the input. The outputs from these multiple attention heads are then combined and processed to produce the final result.")

Multihead Attention is a mechanism that consists of several attention layers running in parallel. It allows the model to attend to different parts of the input sequence differently, capturing various relationships and dependencies within the data. In this approach, multiple sets of queries, keys, and values are created, enabling the model to gather information from multiple perspectives at once, enhancing its ability to understand complex patterns in the input. The outputs from these multiple attention heads are then combined and processed to produce the final result.
