# Embeddings with Sentence Transformers

Install any packages you don't already have.

In [None]:
!pip install sentence_transformers
!pip install pymongo

In [None]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import pandas as pd
from pymongo import MongoClient
from nltk.tokenize import sent_tokenize


Load a sentence-embedding model, which you will use to calculate embeddings. There are MANY to choose from. We'll start with a lightweight, all-purpose model suggested by our reading, `all-MiniLM-L6-v2`. I've also included (commented out) code here for a much more robust multilingual model that I like to use for narrative discovery and translation tasks, `LaBSE`. But there are many more out there...

In [None]:
# Name of the model to load: a good, lightweight, all-purpose choice.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

# A better but much larger model, LaBSE. Uncomment to use it instead.
# model = SentenceTransformer('sentence-transformers/LaBSE')

Let's read in some documents and generate embeddings for them that align with the chosen model.

In [None]:
# Split the raw text of turing.txt into sentences and save them one per line
# in turing_sentences.txt.

# Read explicitly as UTF-8: the clean-up below looks for curly quotes, which a
# platform-default codec (e.g. cp1252 on Windows) would fail to decode or
# would decode as different characters, so the replacements would never match.
with open('turing.txt', encoding='utf-8') as f:
    text = f.read()

# Clean-up rules applied, in this order, before tokenizing. Order matters for
# the ellipsis rules: the spaced four-dot form has to be collapsed before the
# three-dot forms, which would otherwise match part of it.
replacements = (
    ('\t', ' '),
    ('\n', ' '),
    ('“', '"'),
    ('”', '"'),
    ('. . . .', '....'),
    (' . . .', '...'),
    ('. . .', '...'),
)
cleaned = text
for old, new in replacements:
    cleaned = cleaned.replace(old, new)

try:
    en_sents = sent_tokenize(cleaned)
except LookupError:
    # NLTK raises LookupError when the punkt tokenizer data is not installed.
    # Fetch it once and retry; which of the two resources is needed depends on
    # the installed NLTK version.
    import nltk

    nltk.download('punkt')
    nltk.download('punkt_tab')
    en_sents = sent_tokenize(cleaned)

# Every newline was already turned into a space above, so each sentence is
# guaranteed to fit on one line of the output file.
# The file is written with the platform-default encoding on purpose: the cell
# that reads it back also relies on the default, so the two stay consistent.
with open('turing_sentences.txt', 'w') as f:
    f.writelines(sent + '\n' for sent in en_sents)

In [None]:
# Load the saved sentences (one per line) from turing_sentences.txt,
# trimming the trailing newline and any surrounding whitespace from each.
with open('turing_sentences.txt', 'r') as f:
    documents = [line.strip() for line in f]

In [None]:
# number of sentences loaded into `documents`
len(documents)

In [None]:
# Encode every sentence in `documents` with the loaded model.
# The result is indexed per sentence (embeddings[0] belongs to documents[0]).
embeddings = model.encode(documents)

In [None]:
# Take the embedding of the first sentence
first_embedding = embeddings[0]

# Its length is the number of dimensions in these embeddings
print(len(first_embedding))

# Display the vector itself (last expression in the cell)
first_embedding

In [None]:
# Display the full set of embeddings computed for `documents`
embeddings

Now let's use MongoDB's native vector search capabilities. As of September 2025, MongoDB Community Edition supports vector search locally!

In [None]:
# Connection string for the local MongoDB installation
mongo_uri = 'mongodb://localhost:50085/?directConnection=true'

# Open the client, then pick the database and collection used in later cells
client = MongoClient(mongo_uri)
db = client['turing']
collection = db['sentences']

# Empty the collection so documents from earlier runs are not kept
collection.delete_many({})
print("Connected to MongoDB")

In [None]:
# Store documents and embeddings in MongoDB.
# Each sentence is paired with its embedding; the vector is converted to a
# plain Python list with .tolist() before it is inserted.
docs_to_insert = [
    {'text': doc, 'embedding': embedding.tolist()}
    for doc, embedding in zip(documents, embeddings)
]

result = collection.insert_many(docs_to_insert)
print(f"Inserted {len(result.inserted_ids)} documents into MongoDB")

In [None]:
# Fetch a single stored document to inspect what was written
collection.find_one()

In [None]:
# Create a vector search index on the `embedding` field.
# This enables efficient vector similarity search in MongoDB through the
# $vectorSearch aggregation stage.
import time

from pymongo.errors import PyMongoError
from pymongo.operations import SearchIndexModel

index_name = "vector_index"

# $vectorSearch needs an index of type "vectorSearch", whose definition is a
# flat list of fields (each with its own "path") rather than the "mappings"
# document used by full-text search indexes.
vector_index_model = SearchIndexModel(
    name=index_name,
    type="vectorSearch",
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                # Take the size from the data (384 for all-MiniLM-L6-v2) so the
                # index still matches if a different model is loaded.
                "numDimensions": len(embeddings[0]),
                "similarity": "cosine",
            }
        ]
    },
)

try:
    collection.create_search_index(model=vector_index_model)
except PyMongoError as e:
    # Only driver/server errors are treated as a soft failure; programming
    # errors (e.g. an undefined name) still surface normally.
    print(f"Index creation note: {e}")
    print("Index may already exist or require MongoDB 7.0+ with vector search enabled")
else:
    print("Vector search index created successfully")

    # The index is built in the background. Poll until the server reports it
    # as queryable so that a search run right after this cell does not come
    # back empty. Give up after two minutes rather than blocking forever.
    deadline = time.monotonic() + 120
    while time.monotonic() < deadline:
        index_info = list(collection.list_search_indexes(index_name))
        if index_info and index_info[0].get("queryable"):
            print("Vector search index is ready to query")
            break
        time.sleep(2)
    else:
        print("Vector search index is still building; wait a moment before querying")

Now let's ask a question of this database.

In [None]:
# Embed the question with the same model used for the stored sentences
query = 'Who was Alan Turing?'
query_embedding = model.encode(query)

# Stage 1: vector search on the "embedding" field using "vector_index",
# considering 100 candidates and keeping the 5 best matches.
vector_search_stage = {
    "$vectorSearch": {
        "index": "vector_index",
        "path": "embedding",
        "queryVector": query_embedding.tolist(),
        "numCandidates": 100,
        "limit": 5
    }
}

# Stage 2: keep only the sentence text and its search score, dropping _id.
projection_stage = {
    "$project": {
        "_id": 0,
        "text": 1,
        "score": {"$meta": "vectorSearchScore"}
    }
}

pipeline = [vector_search_stage, projection_stage]

# Run the pipeline and materialise the matches
results = list(collection.aggregate(pipeline))

# Print the text of each match, best first
for result in results:
    print(result['text'])