In [1]:
import sys
sys.path.append("..")

from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

import warnings
warnings.filterwarnings("ignore")

In [2]:
import os

# Directory that holds the source PDFs. Set the DOCS_DIR environment variable to point the
# notebook at another location; the fallback below is the original author's local path and
# is unlikely to exist on any other machine.
DATA_PATH = os.environ.get("DOCS_DIR") or "/Users/anuragkotiyal/Desktop/engaige/docs_en"

In [3]:
def load_docs(data_dir):

    """ A simple function that loads data from all pdfs inside a directory.

    Args:
        data_dir (str or path-like): Path to the directory where pdf documents are stored.

    Returns:
        list(Document): whatever PyPDFDirectoryLoader.load() returns, i.e. a list of langchain
        Document objects holding the "page content" plus some meta data like "source" and
        "page number", e.g. [Document(page_content = "Tax information ...",
        metadata = {"source": "doc_1.pdf", "page": 10})].

    Raises:
        NotADirectoryError: if data_dir does not exist or is not a directory.
    """

    # function-scope import keeps this cell runnable on its own; pathlib is standard library
    from pathlib import Path

    # fail fast with a clear message; NOTE(review): without this check a wrong path appears to
    # yield an empty list that only causes trouble later when the vector store is built
    if not Path(data_dir).is_dir():
        raise NotADirectoryError(f"PDF directory not found: {data_dir}")

    # create an instance of document loader using the pypdfdirectoryloader from langchain
    document_loader = PyPDFDirectoryLoader(data_dir)

    return document_loader.load()

def split_docs_into_chunks(documents, chunk_size = 500, chunk_overlap = 50):

    """ Break loaded documents into smaller pieces using langchain's recursive character splitter.

    Args:
        documents (list(Document)): documents to split, each carrying "page content" and
        meta data like "source" and "page number".
        chunk_size (int): value handed to the splitter as chunk_size; length is measured with len(),
        i.e. in characters. Defaults to 500.
        chunk_overlap (int): value handed to the splitter as chunk_overlap. Defaults to 50.

    Returns:
        list(Document): the result of split_documents, in the same format as the input but with
        documents larger than chunk_size broken into multiple chunks.
    """

    # collect the splitter settings in one place, then pass them on together
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)

    return splitter.split_documents(documents)

def get_chunk_ids(chunks):

    """ A function that creates a unique identifier for each chunk like "docs/doc_name:page_num:chunk_index".
    The chunk index is zero-based and counted per page, so "docs/doc_1.pdf:10:5" is the sixth
    chunk seen for page 10 of doc_1.pdf.

    Args:
        chunks (list(Document)): chunks whose `metadata` mapping provides "source" and "page"
        (a missing key is rendered as "None" in the id).

    Returns:
        list(Document): the same list that was passed in; each chunk's metadata has been
        updated in place with an "id" entry.
    """

    # number of chunks already seen for each "source:page" key; counting per page (rather than
    # comparing with the previous chunk only) keeps ids unique even when chunks of one page
    # are not adjacent in the input
    chunks_seen_per_page = {}

    for chunk in chunks:
        # get source and page number from metadata to create a current page id
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"

        # next free index for this page, starting at 0
        current_chunk_index = chunks_seen_per_page.get(current_page_id, 0)
        chunks_seen_per_page[current_page_id] = current_chunk_index + 1

        # add the chunk ID to the chunk's meta-data.
        chunk.metadata["id"] = f"{current_page_id}:{current_chunk_index}"

    return chunks

In [4]:
# read every pdf found under DATA_PATH
documents = load_docs(data_dir = DATA_PATH)

In [5]:
documents[0:10]

[Document(page_content='Translated from German to English - www.onlinedoctranslator.com', metadata={'source': '/Users/anuragkotiyal/Desktop/engaige/docs_en/Basispaket+WeitBlick_en.pdf', 'page': 0}),
 Document(page_content='Table of contents\nInformation for your insurance contract\nTax information\nThe small print – in a nutshell: General insurance conditions\nBASIC_PACK_WBWB/ D/1006/ XIII/ 03/22', metadata={'source': '/Users/anuragkotiyal/Desktop/engaige/docs_en/Basispaket+WeitBlick_en.pdf', 'page': 1}),
 Document(page_content='Information for your insurance contract\n1 Who is your contractual partner\nThe insurer is Standard Life International DAC (90 St Stephens Green, Dublin 2, Ireland, \nregistration number 408507). The address of the branch responsible for you is:\nStandard Life Insurance German \nbranch of Standard Life International \nDAC Lyoner Straße 9\n60528 Frankfurt am Main\nAddress and registered office of the Standard Life \nInsurance branch\nGerman branch of Standard Li

In [6]:
# split with the default chunk settings, then stamp each chunk with its id; get_chunk_ids
# writes into the chunks' metadata and returns the same list, so both names below refer
# to the same objects
chunks = split_docs_into_chunks(documents = documents)
chunks_with_ids = get_chunk_ids(chunks = chunks)

In [7]:
chunks_with_ids[0:10]

[Document(page_content='Translated from German to English - www.onlinedoctranslator.com', metadata={'source': '/Users/anuragkotiyal/Desktop/engaige/docs_en/Basispaket+WeitBlick_en.pdf', 'page': 0, 'id': '/Users/anuragkotiyal/Desktop/engaige/docs_en/Basispaket+WeitBlick_en.pdf:0:0'}),
 Document(page_content='Table of contents\nInformation for your insurance contract\nTax information\nThe small print – in a nutshell: General insurance conditions\nBASIC_PACK_WBWB/ D/1006/ XIII/ 03/22', metadata={'source': '/Users/anuragkotiyal/Desktop/engaige/docs_en/Basispaket+WeitBlick_en.pdf', 'page': 1, 'id': '/Users/anuragkotiyal/Desktop/engaige/docs_en/Basispaket+WeitBlick_en.pdf:1:0'}),
 Document(page_content='Information for your insurance contract\n1 Who is your contractual partner\nThe insurer is Standard Life International DAC (90 St Stephens Green, Dublin 2, Ireland, \nregistration number 408507). The address of the branch responsible for you is:\nStandard Life Insurance German \nbranch of Sta

In [8]:
print(f"The number of loaded docs were {len(documents)} and they were split into {len(chunks)} chunks.")

The number of loaded docs were 83 and they were split into 494 chunks.


In [9]:
# pre-trained hugging face model that embeds both the user query and the data loaded from pdfs
model_name = "sentence-transformers/all-MiniLM-l6-v2"

# model configuration: run computations on the CPU
model_kwargs = dict(device = "cpu")

# encoding options: 'normalize_embeddings' is set to False
encode_kwargs = dict(normalize_embeddings = False)

# build the HuggingFaceEmbeddings instance from the settings above
embeddings = HuggingFaceEmbeddings(model_name = model_name, model_kwargs = model_kwargs, encode_kwargs = encode_kwargs)

In [10]:
text = "This is a test document."
query_result = embeddings.embed_query(text)
query_result[:3]

[-0.038338553160429, 0.12346471846103668, -0.028642943128943443]

In [11]:
print(f"This embedding model creates vector embeddings for chunks in {len(query_result)} dimensions.")

This embedding model creates vector embeddings for chunks in 384 dimensions.


In [12]:
# build the faiss vector database from the id-tagged chunks; user queries are matched against it
db = FAISS.from_documents(documents = chunks_with_ids, embedding = embeddings)

In [13]:
question = "What types of funds?"
searchDocs = db.similarity_search_with_score(question, k = 3)

# the score is unused here; it is bound to `_score` rather than `_` because a top-level
# assignment to `_` in a notebook overrides IPython's "last output" variable for the session
for doc, _score in searchDocs:
    print(doc.page_content)
    print(f"Chunk ID - {doc.metadata['id']}")
    print("-----")

5.5 What types of funds do we offer? ................................................................................................................5
5.6 Where can you get detailed information about the funds we offer? ..................................................................5
5.7 What happens if we replace a fund and what does that mean for you? ..................................................5
Chunk ID - /Users/anuragkotiyal/Desktop/engaige/docs_en/Basispaket+WeitBlick_en.pdf:10:7
-----
• The day on which your lump sum payment is due or
• the second → trading day for your funds after we have received your lump sum payment. 
Your minimum share in each fund you select is 1 percent.
5.5 What types of funds do we offer?
We allocate the funds you choose to the fund assets in your contract. You can choose from three types of 
funds: Standard Life funds, managed portfolios and mutual funds from various fund companies.
Page 6 of 19PA/D/1006/ III/11/21
Chunk ID - /Users/anuragkoti

In [14]:
# prompt skeleton with two placeholders: {context} receives the retrieved chunk text and
# {question} receives the user's question when the template is formatted
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [15]:
query_text = "What type of funds are offered?"

# retrieve context for the same text that is placed in the prompt; searching with the
# `question` global left over from an earlier cell would let the context and the prompt
# question drift apart
searchDocs = db.similarity_search_with_score(query_text, k = 3)
context = " ".join([doc.page_content for doc, _ in searchDocs])

prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context = context, question = query_text)

In [16]:
print(prompt)

Human: 
Answer the question based only on the following context:

5.5 What types of funds do we offer? ................................................................................................................5
5.6 Where can you get detailed information about the funds we offer? ..................................................................5
5.7 What happens if we replace a fund and what does that mean for you? ..................................................5 • The day on which your lump sum payment is due or
• the second → trading day for your funds after we have received your lump sum payment. 
Your minimum share in each fund you select is 1 percent.
5.5 What types of funds do we offer?
We allocate the funds you choose to the fund assets in your contract. You can choose from three types of 
funds: Standard Life funds, managed portfolios and mutual funds from various fund companies.
Page 6 of 19PA/D/1006/ III/11/21 General Insurance Conditions
b) Your minimum holding in e

In [17]:
# initialise the Ollama llama3 model -> need to run `ollama serve` from terminal before using Ollama
model = Ollama(model = "llama3")

In [18]:
def get_answer(question: str):

    """ Answer a user question from the indexed pdf chunks.

    The top 3 matching chunks are retrieved from the faiss database `db`, their text is placed
    into PROMPT_TEMPLATE together with the question, and the resulting prompt is sent to the
    Ollama `model`.

    Args:
        question (str): a question asked by the user.

    Returns:
        str: the model's answer prefixed with "Response: ", followed by a blank line and
        "Sources: " with the list of "id" metadata values of the retrieved chunks
        (None for a chunk without an id).
    """

    # top 3 chunks for the question; each entry is a (document, score) pair
    matches = db.similarity_search_with_score(question, k = 3)
    matched_docs = [doc for doc, _score in matches]

    # merge the chunk texts into one context string and fill in the prompt template
    context = " ".join(doc.page_content for doc in matched_docs)
    prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE).format(context = context, question = question)

    # the chunk ids created earlier serve as the citation list
    sources = []
    for doc in matched_docs:
        sources.append(doc.metadata.get("id", None))

    # ask the model, then return its answer together with the sources it was given
    response_text = model.invoke(prompt)

    return f"Response: {response_text}\n\nSources: {sources}"

In [19]:
# query.txt, question 1 (English)
query = "What types of funds are offered?"
answer = get_answer(query)

In [20]:
print(answer)

Response: According to the provided context, the types of funds that are offered are:

1. Standard Life funds
2. Managed portfolios
3. Mutual funds from various fund companies

Sources: ['/Users/anuragkotiyal/Desktop/engaige/docs_en/ParkAllee_en.pdf:19:6', '/Users/anuragkotiyal/Desktop/engaige/docs_en/Basispaket+WeitBlick_en.pdf:10:7', '/Users/anuragkotiyal/Desktop/engaige/docs_en/Basispaket+WeitBlick_en.pdf:16:0']


In [21]:
# query.txt, question 2 (English)
query = "What about Church tax deductions?"
answer = get_answer(query)

In [22]:
print(answer)

Response: According to the provided context, from January 1, 2015, church tax on capital gains taxed at a flat rate will be automatically withheld and paid to the religious communities that collect the tax. This means that there is an automatic church tax deduction procedure for withholding tax.

Sources: ['/Users/anuragkotiyal/Desktop/engaige/docs_en/ParkAllee_en.pdf:7:5', '/Users/anuragkotiyal/Desktop/engaige/docs_en/ParkAllee_en.pdf:7:7', '/Users/anuragkotiyal/Desktop/engaige/docs_en/Basispaket+WeitBlick_en.pdf:6:3']


In [23]:
# query.txt, question 3 (English)
query = "Who should I contact if I have questions?"
answer = get_answer(query)

In [24]:
print(answer)

Response: According to the provided information, you can contact your broker first if you have any questions about your pension plan or contract. Additionally, you can also contact Standard Life's service staff from Monday to Friday between 9 a.m. and 5 p.m. using one of the following methods:

* Tel.: 0800 2214747 (free of charge)
* Fax: 0800 5892821
* Email: kundenservice@standardlife.de

Sources: ['/Users/anuragkotiyal/Desktop/engaige/docs_en/ParkAllee_en.pdf:4:0', '/Users/anuragkotiyal/Desktop/engaige/docs_en/Basispaket+WeitBlick_en.pdf:3:0', '/Users/anuragkotiyal/Desktop/engaige/docs_en/ParkAllee_en.pdf:31:5']


In [25]:
# query.txt, question 4 (English)
query = "How do you determine amount of payout?"
answer = get_answer(query)

In [26]:
print(answer)

Response: According to the provided context, in writing, you determine the amount of the payout.

Sources: ['/Users/anuragkotiyal/Desktop/engaige/docs_en/ParkAllee_en.pdf:12:9', '/Users/anuragkotiyal/Desktop/engaige/docs_en/Basispaket+WeitBlick_en.pdf:10:10', '/Users/anuragkotiyal/Desktop/engaige/docs_en/Basispaket+WeitBlick_en.pdf:22:6']


In [27]:
# query.txt, question 5 (English)
query = "What are the general insurance conditions?"
answer = get_answer(query)

In [28]:
print(answer)

Response: Based on the provided context, the General Insurance Conditions state that:

1. For members of the German Armed Forces, Police or Federal Police who participate in humanitarian aid or peacekeeping operations outside NATO states and die as a result, there is full coverage.
2. If an insured person dies directly or indirectly due to war events or unrest (except for cases mentioned in 1), the insurer will pay out the surrender value instead of the death benefit.

Additionally, the conditions mention that the ParkAllee unit-linked pension insurance provides:

* Insurance cover by paying a lifelong monthly pension if the policyholder lives to see the pension start date.
* Insurance cover in certain circumstances where the underlying funds or managed portfolio is no longer available for purchase or sale, or the strategy changes and no longer aligns with the investment objectives.

Sources: ['/Users/anuragkotiyal/Desktop/engaige/docs_en/Basispaket+WeitBlick_en.pdf:14:0', '/Users/anur