## Exploration of MongoDB Atlas for RAG

### Import statements and dependencies

In [2]:
import dotenv
import os
from bson import ObjectId
from langchain import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_transformers.openai_functions import create_metadata_tagger
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from pymongo import MongoClient
from rich import print as rprint

dotenv.load_dotenv()

True

### Database Ping

In [34]:
MONGODB_URI = os.getenv("MONGODB_URI")

# MongoClient connects lazily: constructing it does not contact the server,
# so an explicit "ping" command is needed to really verify the connection.
try:
    client = MongoClient(MONGODB_URI)
    client.admin.command("ping")
    # Same database name as the setup cell ("atlas_exploration").
    db = client["atlas_exploration"]
    print("Connection successful!")
except Exception as e:
    # Notebook-level boundary: report the failure instead of raising.
    print("Connection failed:", e)

Connection successful!


### Database Setup

In [7]:
# Create a client from the connection string in the MONGODB_URI environment
# variable.
client = MongoClient(os.getenv("MONGODB_URI"))

# Names reused by the cells below.
dbName = "atlas_exploration"
collectionName = "chunked_data"
index = "vector_index"  # passed as index_name when the retriever store is built
# Handle to the collection that holds the chunks.
collection = client[dbName][collectionName]

### Loading the document

In [19]:
# Load the PDF; `pages` holds whatever documents the loader returns
# (presumably one per PDF page).
loader = PyPDFLoader("../data/M9A_3rd_Edition.pdf")
pages = loader.load()
# Print the element at index 3 as a sample.
rprint(pages[3])

### Sanitization of data

In [10]:
# Remove pages with too little content.
#
# Words are counted with str.split() (no argument), which splits on any run
# of whitespace. split(" ") would treat newline-separated words as a single
# word and count the empty strings between repeated spaces, so the count
# would not reflect the number of words on the page.
MIN_WORDS_PER_PAGE = 20

cleaned_pages = [
    page for page in pages if len(page.page_content.split()) > MIN_WORDS_PER_PAGE
]

# Show how many pages survived, plus the second one as a sample.
rprint(len(cleaned_pages))
rprint(cleaned_pages[1])

### Document enhancements

In [11]:
# Tag each cleaned page with LLM-extracted metadata, then split the tagged
# pages into overlapping chunks.

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=150)

# Metadata schema handed to the tagger. Every name listed under "required"
# must also be declared under "properties"; the previous list also required
# "keywords", which has no property definition.
schema = {
    "properties": {
        "title": {"type": "array", "items": {"type": "string"}},
        "hasCode": {"type": "boolean"},
    },
    "required": ["title", "hasCode"],
}

# temperature=0 keeps the tagging output as repeatable as the model allows.
llm = ChatOpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"), temperature=0, model="gpt-3.5-turbo")

# Tagging runs on whole pages first; splitting happens afterwards on the
# tagged documents.
document_transformer = create_metadata_tagger(metadata_schema=schema, llm=llm)
docs = document_transformer.transform_documents(cleaned_pages)
split_docs = text_splitter.split_documents(docs)

# Chunk count plus two consecutive chunks as a sample.
rprint(len(split_docs))
rprint(split_docs[20])
rprint(split_docs[21])

### Embedding Generation & Vector Store setup

In [14]:
# Embedding model used to vectorise every chunk.
embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))

# Embed the chunks and store them in the Atlas collection.
# index_name is passed explicitly so that searches run through this store use
# the same Atlas index as the store built from the connection string, instead
# of depending on the library's default index name.
# NOTE(review): re-running this cell presumably inserts the chunks again —
# confirm before executing it twice against the same collection.
vectorStore = MongoDBAtlasVectorSearch.from_documents(
    documents=split_docs,
    embedding=embeddings,
    collection=collection,
    index_name=index,
)

### Vector Store Retriever and Vector Store must use the same embedding model

In [9]:
# Build a vector store directly from the connection string; the collection is
# addressed by its "<database>.<collection>" namespace.
vectorStoreRetriever = MongoDBAtlasVectorSearch.from_connection_string(
    os.getenv("MONGODB_URI"),
    f"{dbName}.{collectionName}",
    OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"), disallowed_special=()),
    index_name=index,
)

### Functions for querying data

In [39]:
# Similarity retriever returning 3 results; the pre-filter keeps only
# documents whose "hasCode" field equals False.
retriever = vectorStoreRetriever.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": 3,
        "pre_filter": {"hasCode": {"$eq": False}},
    },
)

def query_data_similarity(query, retriever):
    """
    query: Text to search for.
    retriever: Already-configured object exposing `invoke(query)`.
    ----------------------------------------------------------------------------
    Runs the query through the given retriever, prints whatever it returns and
    hands the same results back to the caller.
    """

    matches = retriever.invoke(query)
    rprint(matches)
    return matches


def query_data_mmr(query, vectorStore, k=3, lambda_mult=0.5):
    """
    query: Text to search for.
    vectorStore: Vector store exposing `as_retriever(...)`.
    k: Number of results.
    lambda_mult: Balance between relevance and diversity (between 0 and 1).
    ----------------------------------------------------------------------------
    Retrieve more documents with higher diversity useful if your dataset has many
    similar documents.

    Prints and returns a list of {"text": ..., "metadata": ...} dicts, one per
    retrieved document.
    """

    retriever = vectorStore.as_retriever(
        search_type="mmr",
        # The MMR option is named `lambda_mult`; a "lambda" key is not a
        # recognised search option.
        search_kwargs={"k": k, "lambda_mult": lambda_mult},
    )

    results = retriever.invoke(query)
    # Retrieved documents carry their text in `page_content`; they have no
    # `text` attribute.
    filtered_results = [
        {"text": doc.page_content, "metadata": doc.metadata} for doc in results
    ]

    rprint(filtered_results)

    return filtered_results


def query_data_threshold(query, vectorStore, k=3, score_threshold=0.8):
    """
    query: Text to search for.
    vectorStore: Vector store exposing `as_retriever(...)`.
    k: Number of results.
    score_threshold: Minimum relevance score (between 0 and 1) a document needs
        in order to be returned.
    ------------------------------------------------------------------------------------------------------------
    Only retrieve documents that have a relevance score above a certain threshold

    Prints the retrieved results and returns them.
    """

    retriever = vectorStore.as_retriever(
        search_type="similarity_score_threshold",
        # This search type requires `score_threshold` in search_kwargs; the
        # retriever cannot be built without it.
        search_kwargs={"k": k, "score_threshold": score_threshold},
    )

    results = retriever.invoke(query)
    rprint(results)

    return results


### Vector Search based on Similarity

In [40]:
# Sample question; the same `query` is passed to the RAG chain further down.
query = "What is Collective Investment Scheme (CIS)"

# Run the question through the pre-configured similarity retriever. The
# function prints the hits and also returns them.
query_data_similarity(query, retriever)

[Document(metadata={'_id': '671c7fd2d979057b1f309d21', 'title': ['Participation Products', 'Tracker certificate', 'Bonus certificate', 'Airbag certificate'], 'hasCode': False, 'source': '../data/M9A_3rd_Edition.pdf', 'page': 35}, page_content='• Collective Investment Scheme (CIS) – Code on CIS (the “Code”) issued\nby the MAS.\n• Investment-linked Policy - Insurance Act 1966.\nSuitability of structured products •knowing your client  - his investment objectives, risk appetite, time\nhorizon, financial position, investment knowledge and experience.\n•know the products under consideration, so that the product features\nand risk factors can be explained to the client in a way that he canunderstand.\nInvestment objectives •\nSafety.\n•Income.'),
 Document(metadata={'_id': '671c7fd2d979057b1f309cdf', 'title': ['Module 9A: Life Insurance and Investment -Linked Policies II'], 'hasCode': False, 'source': '../data/M9A_3rd_Edition.pdf', 'page': 27}, page_content='Module 9A: Life Insurance and Inve

### Function for getting documents by id

In [35]:
def get_documents_by_ids(ids, collection):
    """
    ids: Iterable of ObjectId hex strings.
    collection: Collection to query; must expose `find(filter, projection=...)`.
    ----------------------------------------------------------------------------
    Looks up the documents whose `_id` is among `ids` and returns a list of
    dicts with the keys "_id", "text", "title", "source" and "page".
    Raises KeyError if a matched document lacks one of those fields; ObjectId
    raises if an id string is not valid.
    """
    # `doc_id` rather than `id`, so the builtin is not shadowed.
    object_ids = [ObjectId(doc_id) for doc_id in ids]

    # Ask the server only for the fields that are returned ("_id" is included
    # by default), so the rest of each stored document — presumably including
    # its embedding vector — is not transferred.
    wanted_fields = ("text", "title", "source", "page")
    cursor = collection.find(
        {"_id": {"$in": object_ids}},
        projection={field: 1 for field in wanted_fields},
    )

    return [
        {"_id": document["_id"], **{field: document[field] for field in wanted_fields}}
        for document in cursor
    ]

# Example lookup: fetch two stored documents by their hex id strings and
# print the selected fields.
ids = ["671c7fd2d979057b1f309c45", "671c7fd2d979057b1f309c46"]
documents = get_documents_by_ids(ids, collection)
rprint(documents)

### Prompt Engineering

In [44]:
# Prompt sent to the chat model: {context} receives the retrieved text and
# {question} the user's question.
template = """
Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Do not answer the question if there is no given context.
Do not answer the question if it is not related to the context.
Context:
{context}
Question: {question}
"""
custom_rag_prompt = PromptTemplate.from_template(template)

# First chain step: the input question is sent through the retriever, whose
# documents are joined into one context string separated by blank lines,
# while the question itself is passed through unchanged.
retrieve = {
    "context": retriever | (lambda docs: "\n\n".join(d.page_content for d in docs)),
    "question": RunnablePassthrough(),
}

llm = ChatOpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"), temperature=0)

# Reduces the model's chat message to its plain string content.
response_parser = StrOutputParser()

# retrieve -> fill the prompt -> call the model -> parse to str
rag_chain = (
    retrieve
    | custom_rag_prompt
    | llm
    | response_parser
)

rprint(rag_chain.invoke(query))