## Building the Knowledge Base

In [None]:
# pip install pypdf

In [None]:
# pip install openai

In [None]:
import os

# Replace the placeholder with your own OpenAI API key before running the
# cells below; the OpenAI-backed LangChain classes read it from this variable.
# Do not commit a real key with the notebook.
os.environ["OPENAI_API_KEY"] = "<openai api key>"


from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import MongoDBAtlasVectorSearch
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT, QA_PROMPT
from langchain.chains.llm import LLMChain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.memory import ConversationBufferMemory
from langchain.llms import OpenAI

In [None]:
# !curl -o paper.pdf https://arxiv.org/pdf/1706.03762.pdf

In [None]:
# Load the PDF at `pdf_path` and split it into `docs`, the list of document
# chunks that is embedded and inserted into MongoDB Atlas further down.
pdf_path = "./paper.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load_and_split()

In [None]:
# Sanity check: number of characters in the text of the second chunk.
len(docs[1].page_content)

In [1]:
!pip freeze | grep pymongo

pymongo==4.5.0


In [None]:
from pymongo import MongoClient

# Embedding model used to vectorise every chunk before it is stored.
embeddings = OpenAIEmbeddings()

# Atlas connection settings -- replace each placeholder with real values.
connection_str = "<mongodb cluster URI>"
db_name = "<mongodb db name>"
collection_name = "<mongodb collection name>"
index_name = "<mongodb index name>"

# Resolve client -> database -> collection handles.
client = MongoClient(connection_str)
db = client.get_database(db_name)
collection = db.get_collection(collection_name)

# Embed the chunks and insert them, together with their vectors, into the
# Atlas collection; the returned store is used as the retriever later on.
vectordb = MongoDBAtlasVectorSearch.from_documents(
    documents=docs,
    embedding=embeddings,
    collection=collection,
    index_name=index_name,
)

# reference Atlas collection without recreating documents
# vectordb = MongoDBAtlasVectorSearch.from_connection_string(
#     connection_str,
#     db_name + "." + collection_name,
#     embeddings,
#     index_name=index_name
# )

# Atlas Search index definition for the vector field, registered under
# `index_name` so the vector store can query it.
# NOTE(review): "embedding" is assumed to be the field the vector store writes
# its vectors to, and 1536 must equal the length of the vectors produced by
# `embeddings` -- confirm both if the embedding model or store config changes.
model = {
    "name": index_name,
    "definition": {
        "mappings": {
            # Must be the Python literal `True`; the JSON-style `true` is an
            # undefined name in Python and raises NameError.
            "dynamic": True,
            "fields": {
                "embedding": {
                    "dimensions": 1536,
                    "similarity": "cosine",
                    "type": "knnVector",
                },
            },
        },
    },
}

# Create the search index on the collection that holds the embedded chunks.
collection.create_search_index(model=model)

# Conversation memory exposed to the chain under the "chat_history" key.
# output_key="answer" is required because the chain below returns two outputs
# ("answer" and "source_documents"); without it the memory cannot tell which
# one to record and raises when saving the turn.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer",
)

# Conversational question-answering chain over the Atlas vector store.
# `search_kwargs` is a retriever option (k = number of chunks fetched per
# question), so it is passed to `as_retriever`; `from_llm` does not accept it
# and would forward it to the chain constructor as an unknown field.
pdf_qa = ConversationalRetrievalChain.from_llm(
    OpenAI(temperature=0.0),
    vectordb.as_retriever(search_kwargs={"k": 5}),
    memory=memory,
    return_source_documents=True,
)

In [None]:
# Ask one question about the paper and print the generated answer.
query = "What are limits of the attention mechanism?"
# NOTE(review): pdf_qa is built with a ConversationBufferMemory that uses the
# same "chat_history" key; presumably the memory's history takes precedence
# over this explicit empty list, making it redundant -- confirm.
chat_history = []
result = pdf_qa({"question": query, "chat_history": chat_history})
print(result["answer"])