# Product documentation → vector embeddings → MongoDB Atlas

This notebook loads the product documentation PDF, chunks it, generates embeddings with a **free local model** (sentence-transformers), and stores documents + embeddings in MongoDB Atlas for vector search. No OpenAI or other API key required for embeddings.

In [6]:
import os
import re
import hashlib
import time
from datetime import datetime, timezone
from pathlib import Path

from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from pymongo.operations import ReplaceOne

load_dotenv()

MONGO_URI = os.getenv("MONGO_URI")
PDF_PATH = Path("PRODUCT_DOCUMENTATION.pdf")
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # 384 dimensions
DOC_DB, DOC_COLL = "dhsync", "product_docs"

# Vector index: must match embedding dimensions
VECTOR_INDEX_NAME = "vector_index"
EMBEDDING_DIMENSIONS = 384

assert MONGO_URI, "Set MONGO_URI in .env"
assert PDF_PATH.exists(), f"PDF not found: {PDF_PATH}"


def normalize_text(text: str) -> str:
    """Clean PDF text: broken lines, hyphenation, hidden chars, extra whitespace."""
    if not text or not text.strip():
        return ""
    # Replace hyphen-newline with nothing (join hyphenated words)
    t = re.sub(r"-\s*\n\s*", "", text)
    # Normalize whitespace (including unicode) to single space
    t = re.sub(r"[\s\u00a0]+", " ", t)
    # Strip control characters
    t = "".join(c for c in t if c.isprintable() or c in "\n\t")
    return t.strip()


def content_hash(*parts: str) -> str:
    """Stable id for dedup/upsert."""
    return hashlib.sha256("|".join(parts).encode("utf-8")).hexdigest()[:24]

In [7]:
# Load PDF and split into chunks (raw text; we normalize when embedding/storing)
loader = PyPDFLoader(str(PDF_PATH))
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=120,
    length_function=len,
    separators=["\n## ", "\n\n", "\n", ". ", " ", ""],
)
chunks = text_splitter.split_documents(documents)
print(f"Loaded {len(documents)} page(s), split into {len(chunks)} chunks.")

Loaded 22 page(s), split into 93 chunks.


In [8]:
# Generate embeddings and upsert into MongoDB (no duplicates; safe to re-run)
def embed_with_retry(embeddings, texts, batch_size=32, max_retries=3):
    """Embed in batches with retries for transient failures."""
    all_vectors = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]
        for attempt in range(max_retries):
            try:
                all_vectors.extend(embeddings.embed_documents(batch))
                break
            except Exception as e:
                if attempt == max_retries - 1:
                    raise
                time.sleep(2 ** attempt)
    return all_vectors

# Define doc_id once per document (required for this cell; does not depend on cell 2)
doc_id = content_hash(str(PDF_PATH.resolve()))

print(f"Loading embedding model: {EMBEDDING_MODEL} (first run may download ~80MB)...")
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# Embed normalized text for better quality; store cleaned + optional raw
texts = [normalize_text(chunk.page_content) for chunk in chunks]
try:
    all_vectors = embed_with_retry(embeddings, texts)
except Exception as e:
    print(f"Embedding failed: {e}")
    raise

client = MongoClient(MONGO_URI, server_api=ServerApi("1"), serverSelectionTimeoutMS=10000)
db = client[DOC_DB]
collection = db[DOC_COLL]

now = datetime.now(timezone.utc)
operations = []
for position, (chunk, vec) in enumerate(zip(chunks, all_vectors)):
    source = chunk.metadata.get("source", "")
    page = chunk.metadata.get("page")
    text_clean = texts[position]
    chunk_id = content_hash(text_clean, source, str(page))
    doc = {
        "_id": chunk_id,
        "chunk_id": chunk_id,
        "doc_id": doc_id,
        "position": position,
        "text": text_clean,
        "text_raw": chunk.page_content,
        "text_length": len(text_clean),
        "embedding": vec,
        "embedding_model": EMBEDDING_MODEL,
        "created_at": now,
        "metadata": {"source": source, "page": page},
    }
    operations.append(ReplaceOne({"_id": chunk_id}, doc, upsert=True))

# Upsert with retry for network/Atlas errors
for attempt in range(3):
    try:
        if operations:
            result = collection.bulk_write(operations, ordered=False)
            print(f"Upserted {result.upserted_count + result.modified_count} chunks (inserted: {result.upserted_count}, updated: {result.modified_count}).")
        else:
            print("No chunks to write.")
        break
    except Exception as e:
        if attempt == 2:
            print(f"Write failed after retries: {e}")
            raise
        time.sleep(2 ** attempt)

Loading embedding model: sentence-transformers/all-MiniLM-L6-v2 (first run may download ~80MB)...


  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
  from .autonotebook import tqdm as notebook_tqdm
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 753.71it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Upserted 93 chunks (inserted: 93, updated: 0).


In [14]:
# Create the vector search index in Atlas (required for $vectorSearch)
# Option A: Automatically via Atlas Admin API (set in .env: ATLAS_GROUP_ID, ATLAS_CLUSTER_NAME, ATLAS_PUBLIC_KEY, ATLAS_PRIVATE_KEY)
# Option B: Manually in Atlas UI — see the markdown cell below

import json
import os

ATLAS_GROUP_ID = os.getenv("ATLAS_GROUP_ID")
ATLAS_CLUSTER_NAME = os.getenv("ATLAS_CLUSTER_NAME", "Cluster0")
ATLAS_PUBLIC_KEY = os.getenv("ATLAS_PUBLIC_KEY")
ATLAS_PRIVATE_KEY = os.getenv("ATLAS_PRIVATE_KEY")

# Vector Search index (for $vectorSearch stage): definition uses top-level "fields" array
index_definition = {
    "fields": [
        {
            "type": "vector",
            "path": "embedding",
            "numDimensions": EMBEDDING_DIMENSIONS,
            "similarity": "cosine",
        }
    ]
}
payload = {
    "name": VECTOR_INDEX_NAME,
    "type": "vectorSearch",
    "database": DOC_DB,
    "collectionName": DOC_COLL,
    "definition": index_definition,
}

if ATLAS_GROUP_ID and ATLAS_PUBLIC_KEY and ATLAS_PRIVATE_KEY:
    import requests
    from requests.auth import HTTPDigestAuth

    base_url = f"https://cloud.mongodb.com/api/atlas/v2/groups/{ATLAS_GROUP_ID}/clusters/{ATLAS_CLUSTER_NAME}/search/indexes"
    auth = HTTPDigestAuth(ATLAS_PUBLIC_KEY, ATLAS_PRIVATE_KEY)
    headers = {"Content-Type": "application/json", "Accept": "application/vnd.atlas.2024-05-30+json"}
    try:
        # Idempotent: list existing indexes and skip if already present
        list_r = requests.get(base_url, auth=auth, headers=headers, timeout=30)
        data = list_r.json() if list_r.ok else None
        index_list = data if isinstance(data, list) else (data.get("indexes", []) if isinstance(data, dict) else [])
        existing = {idx.get("name") for idx in index_list if isinstance(idx, dict)}
        if VECTOR_INDEX_NAME in existing:
            print("An index named 'vector_index' already exists. If it is type 'Search' (not Vector Search), delete it in Atlas and re-run this cell to create a Vector Search index.")
        else:
            r = requests.post(base_url, auth=auth, headers=headers, json=payload, timeout=30)
            if r.status_code in (200, 201):
                print("Vector Search index created. Status:", r.json().get("status", "OK"))
            else:
                print("Index creation returned:", r.status_code, r.text[:500])
                if r.status_code == 400:
                    print("Create the Vector Search index manually in Atlas UI (see markdown cell below).")
    except Exception as e:
        print("Atlas API error:", e)
else:
    print("Set ATLAS_GROUP_ID, ATLAS_PUBLIC_KEY, ATLAS_PRIVATE_KEY (and optionally ATLAS_CLUSTER_NAME) in .env to create the index automatically.")
    print("Otherwise create it in the Atlas UI using the definition below (next cell).")

Vector search index already exists; skipping create.


## Vector Search index (manual fallback)

**You must create a Vector Search index** (not a plain Atlas Search index) so `$vectorSearch` works.

1. **Atlas** → **Database** → your cluster → **Search** tab.
2. If an index named `vector_index` already exists and its **Type** is "Search", delete it (it is the wrong type).
3. **Create Search Index** → **JSON Editor**.
4. Database: `dhsync`, collection: `product_docs`, index name: `vector_index`.
5. **Index type:** ensure you are creating a **Vector Search** index. Use this definition (top-level `fields` array, path `embedding`):

```json
{
  "fields": [
    {
      "type": "vector",
      "path": "embedding",
      "numDimensions": 384,
      "similarity": "cosine"
    }
  ]
}
```

6. Wait until status is **READY**. Then run the test query cell.

**If your index failed with "numDimensions is required":** You used `dimensions` but Atlas expects **`numDimensions`**. Edit the index in Atlas and set the embedding field to `"numDimensions": 384` (not `"dimensions"`). If using mappings format, use:
```json
"embedding": { "type": "vector", "numDimensions": 384, "similarity": "cosine" }
```

## Short diagnosis (run before test query)

Run the cell below to verify: documents have `embedding` (384 dims), connection target, and that you need a **Vector Search** index (not Atlas Search) for `$vectorSearch`.

In [31]:
# Short diagnosis: verify data and connection (run Cell 1 first)
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

client = MongoClient(MONGO_URI, server_api=ServerApi("1"))
db = client[DOC_DB]
collection = db[DOC_COLL]

print("1. Connection target:", DOC_DB, ".", DOC_COLL)
count = collection.count_documents({})
print("2. Document count:", count)

doc = collection.find_one({"embedding": {"$exists": True}})
if doc:
    dims = len(doc["embedding"])
    print("3. Sample doc has 'embedding' with", dims, "dimensions (expected 384)")
    if dims != EMBEDDING_DIMENSIONS:
        print("   WARNING: dimension mismatch — index and query must use", dims)
else:
    print("3. No document with 'embedding' found — run ingest cells first.")

print("4. For $vectorSearch you need a Vector Search index (type: Vector Search in Atlas),")
print("   with definition: fields array, path 'embedding', numDimensions", EMBEDDING_DIMENSIONS)

1. Connection target: dhsync . product_docs
2. Document count: 93
3. Sample doc has 'embedding' with 384 dimensions (expected 384)
4. For $vectorSearch you need a Vector Search index (type: Vector Search in Atlas),
   with definition: fields array, path 'embedding', numDimensions 384


## Test: find relevant chunks for a question

Run the cell below to query the vector index with a test question and see the top matching chunks from your product docs.

In [32]:
# Test query: embed a question and find the most relevant chunks (run Cell 1 first)
from langchain_community.embeddings import HuggingFaceEmbeddings
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

test_question = "how to set office status"  # change this to any question

client = MongoClient(MONGO_URI, server_api=ServerApi("1"))
db = client[DOC_DB]
collection = db[DOC_COLL]

# Sanity check: ensure we have docs and the index can be used
count = collection.count_documents({})
print(f"Collection {DOC_DB}.{DOC_COLL} has {count} documents.")
if count == 0:
    print("Run the ingest cells (load PDF + embed + upsert) first, then ensure the vector index is READY in Atlas.")
else:
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    query_vector = embeddings.embed_query(test_question)

    # Use $search with vectorSearch operator (Atlas Search index with mappings), not $vectorSearch stage
    pipeline = [
    {
        "$vectorSearch": {
            "index": VECTOR_INDEX_NAME,
            "path": "embedding",
            "queryVector": query_vector,
            "numCandidates": 50,
            "limit": 5,
        }
    },
    {
        "$project": {
            "text": 1,
            "metadata": 1,
            "score": {"$meta": "vectorSearchScore"},
        }
    },
    ]

    try:
        results = list(collection.aggregate(pipeline))
    except Exception as e:
        print(f"Vector search failed: {e}")
        print("Check in Atlas: index exists, name 'vector_index', status READY, path 'embedding'.")
        results = []

    print(f"\nQuestion: {test_question}\n")
    print(f"Top {len(results)} relevant chunks:\n")
    if not results:
        print("No results. If the collection has documents, check: index name, index status READY, and path 'embedding'.")
    for i, doc in enumerate(results, 1):
        score = doc.get("score")
        meta = doc.get("metadata") or {}
        text = doc.get("text") or ""
        print(f"--- Result {i} (score: {score:.4f}, page: {meta.get('page', '?')}) ---")
        print(text[:500] + ("..." if len(text) > 500 else ""))
        print()

Collection dhsync.product_docs has 93 documents.


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1154.49it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m



Question: how to set office status

Top 5 relevant chunks:

--- Result 1 (score: 0.6988, page: 1) ---
windows to their entries. • Use bulk operations to set status across multiple dates at once. • Create, manage, and apply personal templates for quick status entry. • Use advanced features such as Repeat Pattern, Copy From Date, and Copy Week/Month. • View the Today’s Status widget showing who is in the oﬀice, on leave, or working from home today. • Search and filter the team calendar by name, email, or status. • Change their own display name and password through the Profile page. Members are res...

--- Result 2 (score: 0.6864, page: 19) ---
a duplicate, due to the upsert behavior used by the system. Status V alues The only valid status values that can be stored in the database are “oﬀice” and “leave. ” Attempting to set any other value through the API results in a validation error. The “clear” status used in bulk operations is a client-side concept that translates to deleting the ent

In [33]:
# Test query: embed a question and find the most relevant chunks (run Cell 1 first)
from langchain_community.embeddings import HuggingFaceEmbeddings
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

test_question = "how to set office status"  # change this to any question

client = MongoClient(MONGO_URI, server_api=ServerApi("1"))
db = client[DOC_DB]
collection = db[DOC_COLL]

# Sanity check: ensure we have docs and the index can be used
count = collection.count_documents({})
print(f"Collection {DOC_DB}.{DOC_COLL} has {count} documents.")
if count == 0:
    print("Run the ingest cells (load PDF + embed + upsert) first, then ensure the vector index is READY in Atlas.")
else:
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    query_vector = embeddings.embed_query(test_question)

    # Use $search with vectorSearch operator (Atlas Search index with mappings), not $vectorSearch stage
    pipeline = [
    {
        "$vectorSearch": {
            "index": VECTOR_INDEX_NAME,
            "path": "embedding",
            "queryVector": query_vector,
            "numCandidates": 50,
            "limit": 5,
        }
    },
    {
        "$project": {
            "text": 1,
            "metadata": 1,
            "score": {"$meta": "vectorSearchScore"},
        }
    },
    ]

    try:
        results = list(collection.aggregate(pipeline))
    except Exception as e:
        print(f"Vector search failed: {e}")
        print("Check in Atlas: index exists, name 'vector_index', status READY, path 'embedding'.")
        results = []

    print(f"\nQuestion: {test_question}\n")
    print(f"Top {len(results)} relevant chunks:\n")
    if not results:
        print("No results. If the collection has documents, check: index name, index status READY, and path 'embedding'.")
    for i, doc in enumerate(results, 1):
        score = doc.get("score")
        meta = doc.get("metadata") or {}
        text = doc.get("text") or ""
        print(f"--- Result {i} (score: {score:.4f}, page: {meta.get('page', '?')}) ---")
        print(text[:500] + ("..." if len(text) > 500 else ""))
        print()

Collection dhsync.product_docs has 93 documents.


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1659.87it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m



Question: how to set office status

Top 5 relevant chunks:

--- Result 1 (score: 0.6988, page: 1) ---
windows to their entries. • Use bulk operations to set status across multiple dates at once. • Create, manage, and apply personal templates for quick status entry. • Use advanced features such as Repeat Pattern, Copy From Date, and Copy Week/Month. • View the Today’s Status widget showing who is in the oﬀice, on leave, or working from home today. • Search and filter the team calendar by name, email, or status. • Change their own display name and password through the Profile page. Members are res...

--- Result 2 (score: 0.6864, page: 19) ---
a duplicate, due to the upsert behavior used by the system. Status V alues The only valid status values that can be stored in the database are “oﬀice” and “leave. ” Attempting to set any other value through the API results in a validation error. The “clear” status used in bulk operations is a client-side concept that translates to deleting the ent