# Embeddings with Sentence Transformers

Install any packages you don't already have.

In [None]:
!pip install sentence_transformers
!pip install chromadb

In [None]:
from sentence_transformers import SentenceTransformer
from chromadb import Client, Settings
from sentence_transformers.util import cos_sim
import pandas as pd
import chromadb
from pymongo import MongoClient

Import a semantic sentence model, against which you will calculate embeddings. There are MANY to choose from. We'll start with a lightweight, all-purpose model suggested by our reading, `all-MiniLM-L6-v2`. I've also preloaded code here for a much more robust multilingual model that I like to use for narrative discovery and translation tasks, `LaBSE`. But there are many more out there...

In [None]:
# a good, lightweight model
# (`model` is what the rest of the notebook uses to embed both the documents
# and the queries)
model = SentenceTransformer('all-MiniLM-L6-v2')

# a better but much larger model, LaBSE
# (to try it, uncomment the line below and comment out the one above)
# model = SentenceTransformer('sentence-transformers/LaBSE')

Let's read in some documents and generate embeddings for them that align with the chosen model.

In [None]:
# read in some documents from facts.txt, one document per line
# - the encoding is explicit so the file is decoded the same way on every
#   platform (assumes facts.txt is UTF-8 -- adjust if it was saved differently)
# - blank lines are skipped so empty strings are not embedded and stored
with open('facts.txt', 'r', encoding='utf-8') as f:
    stripped_lines = (line.strip() for line in f)
    documents = [line for line in stripped_lines if line]

In [None]:
# preview the first ten documents (the notebook displays the value of the
# cell's last expression)
documents[:10]

In [None]:
# embed every document with the chosen model; embeddings[i] belongs to documents[i]
embeddings = model.encode(documents)

In [None]:
# the dimensionality is simply the length of one embedding vector
first_embedding = embeddings[0]
print(len(first_embedding))

# display the embedding of the first sentence
first_embedding

In [None]:
# show all of the embeddings
# (one vector per document, in the same order as `documents`)
embeddings

We'll integrate with MongoDB later, but for now, let's use a simple, in-memory vector database, ChromaDB.

In [None]:
# initialize an in-memory (non-persistent) ChromaDB client
chroma_client = Client(Settings(is_persistent = False))

# get_or_create_collection lets this cell be re-run in the same kernel without
# failing because 'docs' already exists.
# The distance metric is set to cosine explicitly: Chroma's default metric is
# squared L2, so cosine has to be requested when the collection is created.
collection = chroma_client.get_or_create_collection(
    name = 'docs',
    metadata = {'hnsw:space': 'cosine'},
)

In [None]:
# Store documents and embeddings in ChromaDB
# Every document gets a positional id ("doc_0", "doc_1", ...) and its embedding
# is converted to a plain Python list before being handed to the collection.
doc_ids = [f"doc_{idx}" for idx, _ in enumerate(documents)]
embedding_lists = [vector.tolist() for vector in embeddings]

collection.add(ids = doc_ids,
               documents = documents,
               embeddings = embedding_lists)

Now let's ask a question of this database.

In [None]:
# the question to look up, embedded with the same model used for the documents
query = 'What is the capital of France?'
query_embedding = model.encode(query)

# inspect the raw query vector
print(query_embedding)

In [None]:
# search for similar documents
# (ChromaDB ranks hits by the collection's distance metric, `hnsw:space`;
#  unless cosine is requested when the collection is created, Chroma's
#  default metric is squared L2, not cosine similarity)
results = collection.query(query_embeddings = [query_embedding.tolist()],
                           n_results = 1)

# results['documents'] is indexed per query embedding; only one was sent
for doc in results['documents'][0]:
    print(doc)

Try again, but ask it to return more results.

In [None]:
# same search as before, but ask for the five closest documents
query_vector = query_embedding.tolist()
results = collection.query(n_results = 5,
                           query_embeddings = [query_vector])

# only one query embedding was sent, so its hits are in the first entry
top_hits = results['documents'][0]
for hit in top_hits:
    print(hit)

Let's peer under the hood a bit and see what's happening...

In [None]:
# re-embed the best-ranked document and measure its cosine similarity
# to the question's embedding
top_answer = results['documents'][0][0]
top_answer_embedding = model.encode(top_answer)
cos_sim(query_embedding, top_answer_embedding)

In [None]:
# make a dataframe of the returned results: one row per document, holding the
# query, the document text, and the cosine similarity between their embeddings
results_list = [
    [query, doc, float(cos_sim(query_embedding, model.encode(doc)))]
    for doc in results['documents'][0]
]

# name the columns explicitly; without `columns=` pandas labels them 0, 1, 2
results_df = pd.DataFrame(results_list,
                          columns = ['query', 'document', 'cosine_similarity'])
results_df

In [None]:
# try another question
query = 'Who is the president of the United States?'
query_embedding = model.encode(query)

# ask for only the single closest document this time
results = collection.query(n_results = 1,
                           query_embeddings = [query_embedding.tolist()])

for hit in results['documents'][0]:
    print(hit)

# MongoDB Vector Functionality

Now let's use MongoDB's native vector search capabilities. As of September 2025, MongoDB Community Edition supports vector search locally!

First, we need to [install a couple of things](https://www.mongodb.com/docs/atlas/cli/current/atlas-cli-deploy-local/).

- AtlasCLI (This may make you set up an account on their Atlas platform and/or prompt you to set up a database in their cloud service, but you don't have to! Once you have this installed, you can create the database locally.)
- Docker (This needs to be on and running in the background, but otherwise you don't have to touch it.)

Once both are installed, run `atlas setup` followed by `atlas deployments setup` to get going.

NOTE: This will be a separate MongoDB instance on your machine from what we've already been working with, and it will give you a different port number to connect to.

In [None]:
# Connect to MongoDB (local ATLAS installation)
# The port in this URI is specific to one machine -- use the connection info Atlas gives you.
client = MongoClient('mongodb://localhost:55784/?directConnection=true&serverSelectionTimeoutMS=2000&appName=mongosh+2.5.9')

# MongoClient connects lazily, so creating it proves nothing about the server.
# A ping forces a round-trip and raises (after the 2000 ms server-selection
# timeout set in the URI) if the deployment is not reachable, so the message
# below is only printed for a working connection.
client.admin.command('ping')

db = client['vector_search_demo']
# NOTE: this rebinds `collection`, which the earlier cells use for the ChromaDB
# collection; re-run the ChromaDB setup cell before going back to those cells.
collection = db['documents']

# Clear any existing documents
# (uncomment when re-running the notebook; otherwise the insert cell adds the
#  same documents again)
# collection.delete_many({})
print("Connected to MongoDB")

In [None]:
# Store documents and embeddings in MongoDB
# Each record pairs a document's text with its embedding, converted to a plain
# list so it can be stored as a regular array field.
docs_to_insert = [
    {'text': doc, 'embedding': embedding.tolist()}
    for doc, embedding in zip(documents, embeddings)
]

result = collection.insert_many(docs_to_insert)
print(f"Inserted {len(result.inserted_ids)} documents into MongoDB")

In [None]:
# look at a single stored document to check what was written
collection.find_one()

In [None]:
# Create a vector search index
# This enables efficient vector similarity search in MongoDB
#
# The $vectorSearch aggregation stage used in the cells below needs an index of
# type "vectorSearch", whose definition is a `fields` array naming the path of
# the vector field (not the `mappings` layout used by regular search indexes).
# numDimensions is taken from the embeddings themselves so the index still
# matches if a different sentence model is chosen.
#
# If an index called "vector_index" already exists with another definition,
# remove it first with collection.drop_search_index("vector_index").
import time

try:
    collection.create_search_index(
        model={
            "definition": {
                "fields": [
                    {
                        "type": "vector",
                        "path": "embedding",
                        "numDimensions": len(embeddings[0]),
                        "similarity": "cosine"
                    }
                ]
            },
            "name": "vector_index",
            "type": "vectorSearch"
        }
    )
    print("Vector search index created successfully")

    # The index is built asynchronously; poll (for at most a minute) until it
    # reports itself as queryable so the searches below do not come back empty.
    wait_until = time.monotonic() + 60
    while time.monotonic() < wait_until:
        index_info = list(collection.list_search_indexes("vector_index"))
        if index_info and index_info[0].get("queryable"):
            print("Vector search index is ready to query")
            break
        time.sleep(1)
    else:
        print("Vector search index is still building; wait a moment before querying")
except Exception as e:
    # Best-effort: report the problem and let the notebook continue.
    print(f"Index creation note: {e}")
    print("Index may already exist or require MongoDB 7.0+ with vector search enabled")

Now let's ask a question of this database.

In [None]:
# the question to look up, embedded with the same model used for the documents
query = 'What is the capital of France?'
query_embedding = model.encode(query)

# inspect the raw query vector
print(query_embedding)

In [None]:
# Perform vector search using MongoDB's $vectorSearch aggregation stage

# Stage 1: search the "embedding" field through the "vector_index" index and
# keep the single best match
search_stage = {
    "$vectorSearch": {
        "index": "vector_index",
        "path": "embedding",
        "queryVector": query_embedding.tolist(),
        "numCandidates": 100,
        "limit": 1
    }
}

# Stage 2: return only the text plus the search score, dropping _id
projection_stage = {
    "$project": {
        "_id": 0,
        "text": 1,
        "score": {"$meta": "vectorSearchScore"}
    }
}

pipeline = [search_stage, projection_stage]
results = list(collection.aggregate(pipeline))

for hit in results:
    print(hit['text'])

Try again, but ask it to return more results.

In [None]:
# Search for top 5 similar documents

# Stage 1: same vector search as for a single result, with the limit raised to five
search_stage = {
    "$vectorSearch": {
        "index": "vector_index",
        "path": "embedding",
        "queryVector": query_embedding.tolist(),
        "numCandidates": 100,
        "limit": 5
    }
}

# Stage 2: return only the text plus the search score, dropping _id
projection_stage = {
    "$project": {
        "_id": 0,
        "text": 1,
        "score": {"$meta": "vectorSearchScore"}
    }
}

pipeline = [search_stage, projection_stage]
results = list(collection.aggregate(pipeline))

for hit in results:
    print(hit['text'])

Let's peer under the hood a bit and see what's happening...

In [None]:
# Try another question
# (swap in any of the commented-out alternatives to experiment)
query = 'Who is the president of the United States?'
# query = 'Does the earth orbit around Mars?'
# query = 'How many sides does a pentagon have?'
# query = 'How many sides does a nonagon have?'
# query = 'What is hockey?'
query_embedding = model.encode(query)

# Stage 1: vector search over the "embedding" field, keeping the best match
search_stage = {
    "$vectorSearch": {
        "index": "vector_index",
        "path": "embedding",
        "queryVector": query_embedding.tolist(),
        "numCandidates": 100,
        "limit": 1
    }
}

# Stage 2: return only the text plus the search score, dropping _id
projection_stage = {
    "$project": {
        "_id": 0,
        "text": 1,
        "score": {"$meta": "vectorSearchScore"}
    }
}

pipeline = [search_stage, projection_stage]
results = list(collection.aggregate(pipeline))

for hit in results:
    print(hit['text'])