In [58]:
import os
import time
import pprint
import pymongo
from dotenv import load_dotenv
from pymongo.operations import SearchIndexModel
from openai import OpenAI
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_transformers.openai_functions import create_metadata_tagger
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import PromptTemplate

In [2]:
load_dotenv(".env")

True

### Text embeddings

In [3]:
openai = OpenAI()

In [4]:
EMBEDDING_MODEL = "text-embedding-ada-002"

In [5]:
vector = openai.embeddings.create(
    model=EMBEDDING_MODEL,
    input="The food was delicious and the waiter..."
)

In [6]:
len(vector.data[0].embedding)

1536

In [7]:
def get_embedding(query_str):
    vector = openai.embeddings.create(model=EMBEDDING_MODEL, input=query_str)
    return vector.data[0].embedding

In [8]:
MONGODB_URI = os.getenv("MONGODB_URI")

### Creating a vector search index

In [9]:
client = pymongo.MongoClient(MONGODB_URI)

In [10]:
db = client.sample_mflix

In [11]:
embedded_movies_collection = db.embedded_movies

In [12]:
vector_search_index_model = SearchIndexModel(
    name="plot_embedding_idx",
    type="vectorSearch",
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "plot_embedding",
                "numDimensions": 1536,
                "similarity": "dotProduct",
                "quantization": "scalar"
            }
        ]
    }
)

In [13]:
result = embedded_movies_collection.create_search_index(model=vector_search_index_model)

OperationFailure: An index named "plot_embedding_idx" is already defined for collection embedded_movies. Index names must be unique for a source collection and all its views., full error: {'ok': 0.0, 'errmsg': 'An index named "plot_embedding_idx" is already defined for collection embedded_movies. Index names must be unique for a source collection and all its views.', 'code': 68, 'codeName': 'IndexAlreadyExists', '$clusterTime': {'clusterTime': Timestamp(1740934367, 9), 'signature': {'hash': b'\x96I\x87~\xe8\xaf\x1av\x17"\xd7\x88\xfb\xc3\xac\x8c\xe75\x10\xfc', 'keyId': 7431213296700620803}}, 'operationTime': Timestamp(1740934367, 9)}

In [None]:
print("New search index named " + result + " is building.")

In [None]:
print("Polling to check if the index is ready. This may take up to a minute.")
predicate = None
if predicate is None:
    predicate = lambda index: index.get("queryable") is True

while True:
    indices = list(embedded_movies_collection.list_search_indexes(result))
    if len(indices) and predicate(indices[0]):
        break
    time.sleep(5)
print(result + " is ready for querying.")

### Editing a vector search index

In [None]:
new_definition = {
    "fields": [
        {
            "type": "vector",
            "path": "plot_embedding",
            "numDimensions": 1536,
            "similarity": "dotProduct",
            "quantization": "scalar"
        },
        {
            "type": "filter",
            "path": "genres"
        },
        {
            "type": "filter",
            "path": "year"
        }
    ]
}

In [None]:
embedded_movies_collection.update_search_index("plot_embedding_idx", new_definition)

### Creating a vector query for semantic search

In [14]:
query = "A movie about people who are trying to escape from a maximum security facility."

embedding = get_embedding(query)

In [15]:
vector_search = {
    "$vectorSearch": {
        "index": "plot_embedding_idx",
        "path": "plot_embedding",
        "queryVector": embedding,
        "numCandidates": 100, # good practice: 10*limit
        "limit": 10
    }
}

select_fields = {
    "$project": {
        "title": 1,
        "plot": 1,
        "score": {"$meta": "vectorSearchScore"}
    }
}

pipeline = [vector_search, select_fields]

In [16]:
cursor = embedded_movies_collection.aggregate(pipeline)

In [17]:
for doc in cursor:
    pprint.pprint(doc)

{'_id': ObjectId('573a13c1f29313caabd649a8'),
 'plot': 'When a structural-security authority finds himself set up and '
         "incarcerated in the world's most secret and secure prison, he has to "
         'use his skills to escape with help from the inside.',
 'score': 0.9361114501953125,
 'title': 'Escape Plan'}
{'_id': ObjectId('573a1399f29313caabcedb10'),
 'plot': 'A futuristic prison movie. Protagonist and wife are nabbed at a '
         'future US emigration point with an illegal baby during population '
         'control. The resulting prison experience is the subject of ...',
 'score': 0.9358062744140625,
 'title': 'Fortress'}
{'_id': ObjectId('573a13c2f29313caabd66751'),
 'plot': 'The story of two men on different sides of a prison riot -- the '
         'inmate leading the rebellion and the young guard trapped in the '
         'revolt, who poses as a prisoner in a desperate attempt to survive '
         'the ordeal.',
 'score': 0.9333038330078125,
 'title': 'Cell 211'}
{

In [18]:
vector_search_with_filter = {
    "$vectorSearch": {
        "index": "plot_embedding_idx",
        "path": "plot_embedding",
        "queryVector": embedding,
        "numCandidates": 100, # good practice: 10*limit
        "limit": 10,
        "filter": {"year": {"$gte": 2000}}
    }
}

select_fields = {
    "$project": {
        "title": 1,
        "plot": 1,
        "year": 1,
        "score": {"$meta": "vectorSearchScore"}
    }
}

pipeline = [vector_search, select_fields]

In [19]:
cursor = embedded_movies_collection.aggregate(pipeline)

In [20]:
for doc in cursor:
    pprint.pprint(doc)

{'_id': ObjectId('573a13c1f29313caabd649a8'),
 'plot': 'When a structural-security authority finds himself set up and '
         "incarcerated in the world's most secret and secure prison, he has to "
         'use his skills to escape with help from the inside.',
 'score': 0.9361114501953125,
 'title': 'Escape Plan',
 'year': 2013}
{'_id': ObjectId('573a1399f29313caabcedb10'),
 'plot': 'A futuristic prison movie. Protagonist and wife are nabbed at a '
         'future US emigration point with an illegal baby during population '
         'control. The resulting prison experience is the subject of ...',
 'score': 0.9358062744140625,
 'title': 'Fortress',
 'year': 1992}
{'_id': ObjectId('573a13c2f29313caabd66751'),
 'plot': 'The story of two men on different sides of a prison riot -- the '
         'inmate leading the rebellion and the young guard trapped in the '
         'revolt, who poses as a prisoner in a desperate attempt to survive '
         'the ordeal.',
 'score': 0.93330383300

### Vector search for RAG applications

In [45]:
loader = PyPDFLoader("MongoDB-manual.pdf")
pages = loader.load()

Multiple definitions in dictionary at byte 0x7a2c4e for key /Author
Multiple definitions in dictionary at byte 0x7a2c56 for key /Title


In [46]:
print(pages[0])

page_content='MongoDB Documentation
Release 3.0.4
MongoDB Documentation Project
June 29, 2015' metadata={'producer': 'pdfTeX-1.40.15', 'creator': 'LaTeX with hyperref package', 'creationdate': '2015-06-29T10:51:25-04:00', 'author': 'MongoDB Documentation Project', 'title': 'MongoDB Documentation', 'subject': '', 'keywords': '', 'moddate': '2015-06-29T10:51:25-04:00', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.15 (TeX Live 2014) kpathsea version 6.2.0', 'source': 'MongoDB-manual.pdf', 'total_pages': 946, 'page': 0, 'page_label': '1'}


In [47]:
cleaned_pages = []
for page in pages:
    if len(page.page_content.split(" ")) > 20:
        cleaned_pages.append(page)

In [48]:
# Splitting the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=150)

In [49]:
schema = {
    "properties": {
        "hasCode": {"type": "boolean"}
    },
    "required": ["hasCode"]
}

In [50]:
llm_model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

In [51]:
# Creating metadata for the documents
document_transformer = create_metadata_tagger(schema, llm_model)
docs = document_transformer.transform_documents(cleaned_pages)
split_docs = text_splitter.split_documents(docs)

In [52]:
# Generating vector embeddings for the documents
embeddings_model = OpenAIEmbeddings()

In [53]:
chunked_data_collection = client.langchain_demo.chunked_data

In [54]:
# Deleting the collection before adding new data
chunked_data_collection.delete_many({})

DeleteResult({'n': 0, 'electionId': ObjectId('7fffffff000000000000001c'), 'opTime': {'ts': Timestamp(1740937366, 2), 't': 28}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1740937366, 2), 'signature': {'hash': b'J\x80\x1b\xbe\xb4\x9c)\xdb\x17Ls\xa50\xf9\xcb\x90.\xd6\x93\x07', 'keyId': 7431213296700620803}}, 'operationTime': Timestamp(1740937366, 2)}, acknowledged=True)

In [55]:
# Storing the vectors in MongoDB Atlas
vector_store = MongoDBAtlasVectorSearch.from_documents(split_docs,
                                                       embeddings_model,
                                                       collection=chunked_data_collection)

In [56]:
document_count = chunked_data_collection.count_documents({})
print(f"Successfully stored {document_count} documents in MongoDB Atlas")

Successfully stored 6059 documents in MongoDB Atlas


In [57]:
chunked_data_collection.find_one()

{'_id': ObjectId('67c4993b9be56e13b5cdc7d6'),
 'text': 'Contents\n1 Introduction to MongoDB 3\n1.1 What is MongoDB . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 3\n2 Install MongoDB 5\n2.1 Recommended Operating Systems for Production Deployments . . . . . . . . . . . . . . . . . . . . 5\n2.2 Other Supported Operating Systems . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 5\n2.3 Installation Guides . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 5',
 'embedding': [0.006281503941863775,
  0.0026300738099962473,
  0.02037491463124752,
  -0.01728147827088833,
  -0.009469570592045784,
  0.029054809361696243,
  -0.002000291831791401,
  -0.03469347953796387,
  -0.006983074359595776,
  0.00963272713124752,
  0.011003236286342144,
  0.008673369884490967,
  -0.0346151627600193,
  0.0027002308052033186,
  -0.031169310212135315,
  0.00817084964364767,
  0.019147980958223343,
  0.004610786680132

In [59]:
vectorStore = MongoDBAtlasVectorSearch.from_connection_string(
    MONGODB_URI,
    "langchain_demo.chunked_data",
    OpenAIEmbeddings(disallowed_special=()),
    index_name="vector_index",
)

In [60]:
def query_data(query):
    retriever = vectorStore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 3},
    )

    template = """
    Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Do not answer the question if there is no given context.
    Do not answer the question if it is not related to the context.
    Do not give recommendations to anything other than MongoDB.
    Context:
    {context}
    Question: {question}
    """

    custom_rag_prompt = PromptTemplate.from_template(template)

    retrieve = {
        "context": retriever
        | (lambda docs: "\n\n".join([d.page_content for d in docs])),
        "question": RunnablePassthrough(),
    }

    llm_model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    response_parser = StrOutputParser()

    rag_chain = retrieve | custom_rag_prompt | llm_model | response_parser
    answer = rag_chain.invoke(query)
    print(answer)

In [61]:
# Test with a relevant query
question = "When did MongoDB begin supporting multi-document transactions?"
print(f"Running query: {question}")
query_data(question)

print("=========================================================")

# Test with an irrelevant query
question = "Why is the sky blue?"
print(f"Running query: {question}")
query_data(question)

Running query: When did MongoDB begin supporting multi-document transactions?
MongoDB began supporting multi-document transactions in version 4.0, which was released in June 2018.
Running query: Why is the sky blue?
I don't know.
