In [1]:
# !pip install pymongo

In [2]:
from pymongo import MongoClient
import os

from sentence_transformers import SentenceTransformer
from langchain.vectorstores import MongoDBAtlasVectorSearch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Atlas credentials are read from the environment so no secret is stored in the notebook.
atlas_connection_string = os.getenv('ATLAS_CONNECTION_STRING')
atlas_cluster_password = os.getenv('ATLAS_CLUSTER_PASSWORD')

# Fail fast with a readable message: without this check a missing variable
# surfaces as an opaque AttributeError/TypeError from str.replace below.
missing_env_vars = [
    name
    for name, value in (
        ('ATLAS_CONNECTION_STRING', atlas_connection_string),
        ('ATLAS_CLUSTER_PASSWORD', atlas_cluster_password),
    )
    if not value
]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_env_vars)
    )

# Substitute the real password for the "<password>" placeholder in the URI template.
# NOTE(review): the password is inserted verbatim; if it can contain URI-reserved
# characters (e.g. '@', ':', '/') it presumably needs percent-escaping first — confirm.
MONGODB_ATLAS_CLUSTER_URI = atlas_connection_string.replace("<password>", atlas_cluster_password)

# initialize MongoDB python client
client = MongoClient(MONGODB_ATLAS_CLUSTER_URI)

In [4]:
# Names of the database, the collection inside it, and the vector search index.
DB_NAME = "the-communist-bot"
COLLECTION_NAME = "manifesto"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "communist-manifesto"

# Collection handle resolved from the client created in the previous cell.
MONGODB_COLLECTION = client.get_database(DB_NAME).get_collection(COLLECTION_NAME)

In [5]:
# Loaded SentenceTransformer instances keyed by model name, so each model is
# constructed once per kernel session instead of once per call.
_EMBEDDING_MODEL_CACHE = {}


def generate_embeddings(
    text: str,
    model_name: str = 'sentence-transformers/all-MiniLM-L6-v2',
) -> list[float]:
    """
    Encode a piece of text into an embedding vector.

    Args:
    text (str): The text to embed.
    model_name (str): Name of the SentenceTransformer model to use.
        Defaults to 'sentence-transformers/all-MiniLM-L6-v2'.

    Returns:
    list[float]: The result of ``model.encode(text)`` converted with ``.tolist()``.
    """
    # Constructing the model is the expensive step; reuse a previously built
    # instance when this model name has already been requested.
    model = _EMBEDDING_MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODEL_CACHE[model_name] = model

    embeddings = model.encode(text)

    return embeddings.tolist()

In [20]:
def vector_search(user_query, collection, index_name=None, num_candidates=150, limit=4):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.
    index_name (str, optional): Name of the vector search index to query.
        Defaults to the notebook-level ATLAS_VECTOR_SEARCH_INDEX_NAME constant.
    num_candidates (int): Number of candidate matches to consider before the
        final results are selected. Defaults to 150.
    limit (int): Maximum number of documents to return. Defaults to 4.

    Returns:
    list: A list of matching documents, each projected to documentID, content,
        embedding and score. Empty if no query embedding could be produced.

    Raises:
    ValueError: If limit is less than 1 or greater than num_candidates.
    """
    if limit < 1 or limit > num_candidates:
        raise ValueError(
            f"limit must be between 1 and num_candidates ({num_candidates}), got {limit}"
        )

    # Resolve the default at call time so the index name is defined in one place.
    if index_name is None:
        index_name = ATLAS_VECTOR_SEARCH_INDEX_NAME

    # Generate embedding for the user query
    query_embedding = generate_embeddings(user_query)

    # Keep the return type a list (as documented) when there is nothing to search with.
    if not query_embedding:
        return []

    # Define the vector search pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": index_name,              # Search index name
                "queryVector": query_embedding,   # Embedding representation of the user query
                "path": "embedding",              # Document field containing the embeddings
                "numCandidates": num_candidates,  # Number of candidate matches to consider
                "limit": limit,                   # Return at most this many matches
            }
        },
        {
            "$project": {
                "_id": 0,                                 # Exclude the _id field
                "documentID": 1,                          # Include the documentID field
                "content": 1,                             # Include the content field
                "embedding": 1,                           # Include the stored embedding
                "score": {"$meta": "vectorSearchScore"},  # Include the search score
            }
        },
    ]

    # Execute the search and materialize the cursor into a list
    results = collection.aggregate(pipeline)
    return list(results)

In [21]:
# Run a sample query against the manifesto collection.
result = vector_search(user_query="Proletarians", collection=MONGODB_COLLECTION)

In [23]:
# Display the first match returned for the sample query.
result[0]

{'documentID': 'f93d0913-d16b-42c4-9373-8a6094e9f8f8',
 'content': 'masters of the productive forces of society, except by abolishing their own previous mode of \nappropriation, and thereby also every other previous mode of appro priation. They have nothing of \ntheir own to secure and to fortify; their mission is to destroy all previous securities for, and insurances of, individual property.  \nAll previous historical movements were movements of minorities, or in the interest of minor ities. \nThe proletarian movement is the self -conscious, independent movement of the immense majority, \nin the interest of the immense majority. The proletariat, the lowest stratum of our present society, \ncannot stir, cannot raise itself up, without the whole  superincumbent strata of official society \nbeing sprung into the air.  \nThough not in substance, yet in form, the struggle of the proletariat with the bourgeoisie is at first \na national struggle. The proletariat of each country must, of cou