## Load the SentenceTransformer and test the embedding is working


In [11]:
from sentence_transformers import SentenceTransformer

embeddingMModel = SentenceTransformer('all-MiniLM-L6-v2')

# def embed_text(text):
#     return embeddingModel.encode(text)
# # Example usage
# if __name__ == "__main__":
#     sample_text = "This is a sample text for embedding."
#     embedding = embed_text(sample_text)
#     print(f"Embedding for the sample text: {embedding[:5]}")

### Load Document and spilt into chunks and then embedding the chunks

In [15]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# load the pdf document
loader = PyPDFLoader("dataset/AI Agents guidebook.pdf")
documents = loader.load()

# split the document into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)
print(f"Number of chunks created: {len(chunks)}")

Ignoring wrong pointing object 899 0 (offset 0)


Number of chunks created: 149


### Embed the chunks 

In [22]:
encoded_chunks = [embeddingMModel.encode(chunk.page_content) for chunk in chunks]
print(f"Embedding for first chunk: {encoded_chunks[0][:5]}")
print(f'Total number of encoded chunks: {len(encoded_chunks)}')
print(f'Length of each encoded chunk: {len(encoded_chunks[1])}')
print(f'Embedding Model Shape: {encoded_chunks[0].shape}')
# Example usage
# if __name__ == "__main__":
#     print(f"Number of chunks created: {len(chunks)}")
#     print(f"Embedding for first chunk: {encoded_chunks[0][:5]}")    

Embedding for first chunk: [-0.05577217 -0.04005972  0.02752373  0.03279826  0.05366068]
Total number of encoded chunks: 149
Length of each encoded chunk: 384
Embedding Model Shape: (384,)


### Save the embedding in Chromdb

In [26]:
import chromadb

# Create a ChromaDB client
client = chromadb.Client()
# Create a collection to store the embeddings
collection = client.get_or_create_collection(name="ai_agents_guidebook")
# Add the chunks , their embeddings and metadata to the collection
for i, chunk in enumerate(chunks):
    collection.add(
        documents=[chunk.page_content],
        embeddings=[encoded_chunks[i]],
        ids=[str(i)],
        metadatas=[{"source": "AI Agents guidebook", "chunk_index": i}]
    )
print(f"Embeddings saved to ChromaDB collection 'ai_agents_guidebook'. {collection.count()} items added.")
print(f"Number of chunks created: {collection}")

Add of existing embedding ID: 0
Insert of existing embedding ID: 0
Add of existing embedding ID: 1
Insert of existing embedding ID: 1
Add of existing embedding ID: 2
Insert of existing embedding ID: 2
Add of existing embedding ID: 3
Insert of existing embedding ID: 3
Add of existing embedding ID: 4
Insert of existing embedding ID: 4
Add of existing embedding ID: 5
Insert of existing embedding ID: 5
Add of existing embedding ID: 6
Insert of existing embedding ID: 6
Add of existing embedding ID: 7
Insert of existing embedding ID: 7
Add of existing embedding ID: 8
Insert of existing embedding ID: 8
Add of existing embedding ID: 9
Insert of existing embedding ID: 9
Add of existing embedding ID: 10
Insert of existing embedding ID: 10
Add of existing embedding ID: 11
Insert of existing embedding ID: 11
Add of existing embedding ID: 12
Insert of existing embedding ID: 12
Add of existing embedding ID: 13
Insert of existing embedding ID: 13
Add of existing embedding ID: 14
Insert of existing em

Embeddings saved to ChromaDB collection 'ai_agents_guidebook'. 149 items added.
Number of chunks created: Collection(name=ai_agents_guidebook)


### Query and retrieve the vector db here. it is chromadb... Testing done before using LLM


In [31]:
# create a method which will take query and return the relevant chunks
def query_chromadb(query, top_k=3):
    # Embed the query using the same embedding model
    query_embedding = embeddingMModel.encode(query)
    # Query the collection for similar embeddings
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k
    )
    return results

       
# Test the method using Query list before using LLM
query_list = [
    "What is MCP Server",
    "What is Kayak Tool?",
    "Can i work from home today?",
    "How to use AutoGPT?",
]



# Example usage with the query list
if __name__ == "__main__":
    print("=" * 60)
    print("üîç TESTING SEMANTIC SEARCH")
    print("=" * 60)
    for query in query_list:
        print(f"Results for query: {query}")
        results = query_chromadb(query, top_k=3)
        for i, (doc, score, metadata) in enumerate(zip(results['documents'][0], results['distances'][0], results['metadatas'][0])):
            print(f"\n  Result {i+1}:")
            print(f"  üìÑ Topic: {metadata['source']}")
            print(f"  üìè Distance: {score:.4f} (lower = more similar)")
            print(f"  üìñ Text: {doc[:50]}...")

üîç TESTING SEMANTIC SEARCH
Results for query: What is MCP Server

  Result 1:
  üìÑ Topic: AI Agents guidebook
  üìè Distance: 0.8619 (lower = more similar)
  üìñ Text: DailyDoseofDS.com 
Finally, once we have all the a...

  Result 2:
  üìÑ Topic: AI Agents guidebook
  üìè Distance: 0.9035 (lower = more similar)
  üìñ Text: DailyDoseofDS.com 
 
#8) Integrate MCP server with...

  Result 3:
  üìÑ Topic: AI Agents guidebook
  üìè Distance: 0.9076 (lower = more similar)
  üìñ Text: DailyDoseofDS.com 
#7) Integrate MCP server with C...
Results for query: What is Kayak Tool?

  Result 1:
  üìÑ Topic: AI Agents guidebook
  üìè Distance: 0.5971 (lower = more similar)
  üìñ Text: DailyDoseofDS.com 
 
#4) Kayak tool 
A custom Kaya...

  Result 2:
  üìÑ Topic: AI Agents guidebook
  üìè Distance: 0.6021 (lower = more similar)
  üìñ Text: DailyDoseofDS.com 
 
#4) Kayak tool 
A custom Kaya...

  Result 3:
  üìÑ Topic: AI Agents guidebook
  üìè Distance: 1.0708 (lower = more simi