## <font color = red> Install and Import the Required Libraries

In [1]:
# Install all the required libraries
!pip install -U -q pdfplumber tiktoken openai chromadb sentence-transformers

In [2]:
# Import all the required Libraries

import pdfplumber
from pathlib import Path
import pandas as pd
from operator import itemgetter
import json
import tiktoken
import openai
import chromadb
import logging
logging.getLogger("pdfminer").setLevel(logging.ERROR)
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


##  <font color = red> Read, Process, and Chunk the PDF File

In [4]:
# Define the path of the PDF
insurance_policy_pdf='/content/drive/MyDrive/Upgrad/GenAI/Principal-Sample-Life-Insurance-Policy.pdf'

#### <font color = red> Reading a page from PDF file and exploring it through pdfplumber

In [5]:
# Open the PDF file
with pdfplumber.open(insurance_policy_pdf) as pdf:

    # Get one of the pages from the PDF and examine it
    single_page = pdf.pages[2]

    # Extract text from the first page
    text = single_page.extract_text()

    # Extract tables from the first page
    tables = single_page.extract_tables()

    # Print the extracted text
    print(text)

POLICY RIDER
GROUP INSURANCE
POLICY NO: S655
COVERAGE: Life
EMPLOYER: RHODE ISLAND JOHN DOE
Effective on the later of the Date of Issue of this Group Policy or March 1, 2005, the following
will apply to your Policy:
From time to time The Principal may offer or provide certain employer groups who apply
for coverage with The Principal a Financial Services Hotline and Grief Support Services or
any other value added service for the employees of that employer group. In addition, The
Principal may arrange for third party service providers (i.e., optometrists, health clubs), to
provide discounted goods and services to those employer groups who apply for coverage
with The Principal or who become insureds/enrollees of The Principal. While The
Principal has arranged these goods, services and/or third party provider discounts, the third
party service providers are liable to the applicants/insureds/enrollees for the provision of
such goods and/or services. The Principal is not responsible for the 

In [6]:
# View the table in the page, if any

tables

[]



```
# This is formatted as code
```

#### <font color = red> Extracting text from PDF

In [7]:
# Function to check whether a word is present in a table or not for segregation of regular text and tables

def check_bboxes(word, table_bbox):
    # Check whether word is inside a table bbox.
    l = word['x0'], word['top'], word['x1'], word['bottom']
    r = table_bbox
    return l[0] > r[0] and l[1] > r[1] and l[2] < r[2] and l[3] < r[3]

In [8]:
# Function to extract text from a PDF file.
# 1. Declare a variable p to store the iteration of the loop that will help us store page numbers alongside the text
# 2. Declare an empty list 'full_text' to store all the text files
# 3. Use pdfplumber to open the pdf pages one by one
# 4. Find the tables and their locations in the page
# 5. Extract the text from the tables in the variable 'tables'
# 6. Extract the regular words by calling the function check_bboxes() and checking whether words are present in the table or not
# 7. Use the cluster_objects utility to cluster non-table and table words together so that they retain the same chronology as in the original PDF
# 8. Declare an empty list 'lines' to store the page text
# 9. If a text element in present in the cluster, append it to 'lines', else if a table element is present, append the table
# 10. Append the page number and all lines to full_text, and increment 'p'
# 11. When the function has iterated over all pages, return the 'full_text' list

def extract_text_from_pdf(pdf_path):
    p = 0
    full_text = []


    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_no = f"Page {p+1}"
            text = page.extract_text()

            tables = page.find_tables()
            table_bboxes = [i.bbox for i in tables]
            tables = [{'table': i.extract(), 'top': i.bbox[1]} for i in tables]
            non_table_words = [word for word in page.extract_words() if not any(
                [check_bboxes(word, table_bbox) for table_bbox in table_bboxes])]
            lines = []

            for cluster in pdfplumber.utils.cluster_objects(non_table_words + tables, itemgetter('top'), tolerance=5):

                if 'text' in cluster[0]:
                    try:
                        lines.append(' '.join([i['text'] for i in cluster]))
                    except KeyError:
                        pass

                elif 'table' in cluster[0]:
                    lines.append(json.dumps(cluster[0]['table']))


            full_text.append([page_no, " ".join(lines)])
            p +=1

    return full_text

In [9]:
#Initialize an empty list to store the extracted texts and document names
data = []

# Process the PDF file
print(f"...Processing PDF")

# Call the function to extract the text from the PDF
extracted_text = extract_text_from_pdf(insurance_policy_pdf)

# Convert the extracted list to a PDF, and add a column to store document names
extracted_text_df = pd.DataFrame(extracted_text, columns=['Page No.', 'Page_Text'])

# Append the extracted text and document name to the list
data.append(extracted_text_df)

# Print a message to indicate progress
print(f"Finished processing PDF")

...Processing PDF
Finished processing PDF


In [10]:
# Concatenate all the DFs in the list 'data' together

insurance_pdf_data = pd.concat(data, ignore_index=True)

In [11]:
insurance_pdf_data

Unnamed: 0,Page No.,Page_Text
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...
1,Page 2,This page left blank intentionally
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...
3,Page 4,This page left blank intentionally
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...
...,...,...
59,Page 60,I f a Dependent who was insured dies during th...
60,Page 61,Section D - Claim Procedures Article 1 - Notic...
61,Page 62,A claimant may request an appeal of a claim de...
62,Page 63,This page left blank intentionally


In [12]:
# Check one of the extracted page texts to ensure that the text has been correctly read

insurance_pdf_data.Page_Text[2]

'POLICY RIDER GROUP INSURANCE POLICY NO: S655 COVERAGE: Life EMPLOYER: RHODE ISLAND JOHN DOE Effective on the later of the Date of Issue of this Group Policy or March 1, 2005, the following will apply to your Policy: From time to time The Principal may offer or provide certain employer groups who apply for coverage with The Principal a Financial Services Hotline and Grief Support Services or any other value added service for the employees of that employer group. In addition, The Principal may arrange for third party service providers (i.e., optometrists, health clubs), to provide discounted goods and services to those employer groups who apply for coverage with The Principal or who become insureds/enrollees of The Principal. While The Principal has arranged these goods, services and/or third party provider discounts, the third party service providers are liable to the applicants/insureds/enrollees for the provision of such goods and/or services. The Principal is not responsible for the

In [13]:
# Let's also check the length of all the texts as there might be some empty pages or pages with very few words that we can drop

insurance_pdf_data['Text_Length'] = insurance_pdf_data['Page_Text'].apply(lambda x: len(x.split(' ')))

In [14]:
insurance_pdf_data['Text_Length']

Unnamed: 0,Text_Length
0,30
1,5
2,230
3,5
4,110
...,...
59,285
60,418
61,322
62,5


In [15]:
# Retain only the rows with a text length of at least 10

insurance_pdf_data = insurance_pdf_data.loc[insurance_pdf_data['Text_Length'] >= 10]
insurance_pdf_data

Unnamed: 0,Page No.,Page_Text,Text_Length
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,30
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,230
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,110
5,Page 6,TABLE OF CONTENTS PART I - DEFINITIONS PART II...,153
6,Page 7,Section A – Eligibility Member Life Insurance ...,176
7,Page 8,Section A - Member Life Insurance Schedule of ...,171
8,Page 9,P ART I - DEFINITIONS When used in this Group ...,387
9,Page 10,T he legally recognized union of two eligible ...,251
10,Page 11,(2) has been placed with the Member or spouse ...,299
11,Page 12,An institution that is licensed as a Hospital ...,352


In [16]:
# Store the metadata for each page in a separate column

insurance_pdf_data['Metadata'] = insurance_pdf_data.apply(lambda x: {
    'Page_No.': x['Page No.']}, axis=1)

In [17]:
insurance_pdf_data['Metadata']


Unnamed: 0,Metadata
0,{'Page_No.': 'Page 1'}
2,{'Page_No.': 'Page 3'}
4,{'Page_No.': 'Page 5'}
5,{'Page_No.': 'Page 6'}
6,{'Page_No.': 'Page 7'}
7,{'Page_No.': 'Page 8'}
8,{'Page_No.': 'Page 9'}
9,{'Page_No.': 'Page 10'}
10,{'Page_No.': 'Page 11'}
11,{'Page_No.': 'Page 12'}



## <font color = red> Evaluating Chunking Strategy and Embedding Layer and Store Embeddings in ChromaDB

In [18]:
import pandas as pd
import spacy
import openai
import chromadb
import numpy as np
import uuid
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

# Load NLP models
nlp = spacy.load("en_core_web_sm")
embedding_model_st = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [19]:
# Set the API key
filepath = "/content/drive/MyDrive/Upgrad/GenAI/"

with open(filepath + "OpenAI_API_Key.txt", "r") as f:
  openai.api_key = ' '.join(f.readlines())

In [20]:
# Initialize ChromaDB client
client = chromadb.PersistentClient(path="chroma_db")

### <font color = red> Approach 1:
**Chunking:** Fixed Length

**`Embedding Model:`** text-embedding-ada-002

#### <font color = red> Chunking and Embedding

In [21]:
# Delete previous collections
client.delete_collection(name="RAG_on_Insurance")
client.delete_collection(name="Insurance_Cache")

# Optionally, clear all collections in ChromaDB
for collection_name in client.list_collections():
    client.delete_collection(name=collection_name)

# Reset variables
documents_list = []
metadata_list = []

In [22]:
# Function to split text into fixed-size chunks
def split_text_into_chunks(text, chunk_size=500, overlap=50):
    """Splits text into chunks of approximately chunk_size, ensuring word boundaries and overlap."""
    words = text.split()

    # If page has fewer words than chunk_size, return as a single chunk
    if len(words) <= chunk_size:
        return [" ".join(words)]

    chunks = []
    start = 0
    while start < len(words):
        end = start + chunk_size
        chunk = " ".join(words[start:end])
        chunks.append(chunk)
        start += chunk_size - overlap  # Move start index with overlap

    return chunks

In [23]:
def process_pages(insurance_pdf_data, chunk_size=500, overlap=50):
    """Processes all pages, chunking them intelligently, and handling short pages."""
    all_chunks = []

    # Iterate through all pages
    for idx, row in insurance_pdf_data.iterrows():
        page_no = row["Page No."]
        page_text = row["Page_Text"]
        metadata = row["Metadata"]

        # Get chunks for the page
        text_chunks = split_text_into_chunks(page_text, chunk_size, overlap)

        # Merge small pages with the next one if possible
        if len(text_chunks) == 1 and idx < len(insurance_pdf_data) - 1:
            next_page_text = insurance_pdf_data.iloc[idx + 1]["Page_Text"]
            merged_text = page_text + " " + next_page_text  # Combine pages
            text_chunks = split_text_into_chunks(merged_text, chunk_size, overlap)

        # Create structured data for each chunk
        for i, chunk in enumerate(text_chunks):
            all_chunks.append({
                "Title": page_no,
                "Chunk Text": chunk,
                "Metadata": {**metadata, "Chunk_No.": i}
            })

    return pd.DataFrame(all_chunks)

In [24]:
# Process pages
fixed_chunk_df = process_pages(insurance_pdf_data)
fixed_chunk_df.head()

Unnamed: 0,Title,Chunk Text,Metadata
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,"{'Page_No.': 'Page 1', 'Chunk_No.': 0}"
1,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,"{'Page_No.': 'Page 3', 'Chunk_No.': 0}"
2,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,"{'Page_No.': 'Page 5', 'Chunk_No.': 0}"
3,Page 6,TABLE OF CONTENTS PART I - DEFINITIONS PART II...,"{'Page_No.': 'Page 6', 'Chunk_No.': 0}"
4,Page 6,"this provision if, as a result of sickness or ...","{'Page_No.': 'Page 6', 'Chunk_No.': 1}"


In [25]:
# Set up the embedding function using the OpenAI embedding model

model = "text-embedding-ada-002"
embedding_function = OpenAIEmbeddingFunction(api_key=openai.api_key, model_name=model)

In [26]:
# Initialise a collection in chroma and pass the embedding_function to it so that it used OpenAI embeddings to embed the documents

insurance_collection = client.get_or_create_collection(name='RAG_on_Insurance', embedding_function=embedding_function)

In [27]:
# Convert the page text and metadata from your dataframe to lists to be able to pass it to chroma

documents_list = fixed_chunk_df["Chunk Text"].tolist()
metadata_list = fixed_chunk_df['Metadata'].tolist()

In [28]:
# Add the documents and metadata to the collection alongwith generic integer IDs. You can also feed the metadata information as IDs by combining the policy name and page no.
insurance_collection.add(
    documents= documents_list,
    ids = [str(i) for i in range(0, len(documents_list))],
    metadatas = metadata_list
)

In [29]:
# Let's take a look at the first few entries in the collection
insurance_collection.get(
    ids = ['0','1','2'],
    include = ['embeddings', 'documents', 'metadatas']
)

{'ids': ['0', '1', '2'],
 'embeddings': array([[-1.33636752e-02,  1.74220726e-02, -1.51126357e-02, ...,
         -2.16627959e-02, -1.17582118e-03,  3.78153520e-03],
        [-8.57189484e-03,  1.22174909e-02,  1.93651125e-04, ...,
         -1.04421256e-02,  7.31829740e-03, -8.86200069e-05],
        [-1.70365907e-02,  2.05252226e-02,  1.82956457e-03, ...,
         -2.14039385e-02, -1.26692429e-02,  2.98205926e-03]]),
 'documents': ['DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/01/2014 711 HIGH STREET GEORGE RI 02903 GROUP POLICY FOR: RHODE ISLAND JOHN DOE ALL MEMBERS Group Member Life Insurance Print Date: 07/16/2014 POLICY RIDER GROUP INSURANCE POLICY NO: S655 COVERAGE: Life EMPLOYER: RHODE ISLAND JOHN DOE Effective on the later of the Date of Issue of this Group Policy or March 1, 2005, the following will apply to your Policy: From time to time The Principal may offer or provide certain employer groups who apply for coverage with The Principal a Financial Services Hotline and Grief Su

In [30]:
cache_collection = client.get_or_create_collection(name='Insurance_Cache', embedding_function=embedding_function)

In [31]:
cache_collection.peek()

{'ids': [],
 'embeddings': array([], dtype=float64),
 'documents': [],
 'uris': None,
 'data': None,
 'metadatas': [],
 'included': [<IncludeEnum.embeddings: 'embeddings'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

#### <font color = red> Semantic Search with Cache
In this section, we will perform a semantic search of a query in the collections embeddings to get several top semantically similar results.

In [32]:
# Read the user query

query = 'What is the life insurance coverage for disability?'

In [33]:
# Searh the Cache collection first
# Query the collection against the user query and best result

cache_results = cache_collection.query(
    query_texts=query,
    n_results=1
)

In [34]:
cache_results

{'ids': [[]],
 'embeddings': None,
 'documents': [[]],
 'uris': None,
 'data': None,
 'metadatas': [[]],
 'distances': [[]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [35]:
results = insurance_collection.query(
query_texts=query,
n_results=10
)
print("Result size is : " + str(len(results.items())))
results.items()

Result size is : 8


dict_items([('ids', [['78', '56', '82', '73', '75', '80', '6', '72', '61', '59']]), ('embeddings', None), ('documents', [["Payment of benefits will be subject to the Beneficiary and Facility of Payment provisions of this PART IV, Section A. Article 6 - Member Life Insurance - Coverage During Disability A Member may be eligible to continue his or her Member Life and Member Accidental Death and Dismemberment Insurance and Dependent Life Insurance coverage during the Member's ADL Disability or Total Disability. a. Coverage Qualification To be qualified for Coverage During Disability, a Member must: (1) become ADL Disabled or Totally Disabled while insured for Member Life Insurance; and (2) become ADL Disabled or Totally Disabled prior to the attainment of age 60; and (3) remain ADL Disabled or Totally Disabled continuously; and (4) be under the regular care and attendance of a Physician; and (5) send proof of ADL Disability or Total Disability to The Principal when required; and (6) submi

In [36]:

def search(query):

  # Set a threshold for cache search
  threshold = 0.2

  ids = []
  documents = []
  distances = []
  metadatas = []
  results_df = pd.DataFrame()

  # try to find from cache
  cache_results = cache_collection.query(
      query_texts=query,
      n_results=1
  )

  # If the distance is greater than the threshold, then return the results from the main collection.
  if cache_results['distances'][0] == [] or cache_results['distances'][0][0] > threshold:
        # Query the collection against the user query and return the top 10 results
        results = insurance_collection.query(
        query_texts=query,
        n_results=10
        )

        # Store the query in cache_collection as document w.r.t to ChromaDB so that it can be embedded and searched against later
        # Store retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches to a query in cache
        Keys = []
        Values = []

        size = len(results.items())

        for key, val in results.items():
          if val is None:
            continue
          for i in range(size):
            Keys.append(str(key)+str(i))
            Values.append(str(val[0][i]))


        cache_collection.add(
            documents= [query],
            ids = [query],  # Or if you want to assign integers as IDs 0,1,2,.., then you can use "len(cache_results['documents'])" as will return the no. of queries currently in the cache and assign the next digit to the new query."
            metadatas = dict(zip(Keys, Values))
        )

        #print("Not found in cache. Found in main collection.")

        result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs":results["ids"][0]}
        results_df = pd.DataFrame.from_dict(result_dict)
        return results_df


  # If the distance is, however, less than the threshold, you can return the results from cache

  elif cache_results['distances'][0][0] <= threshold:
        cache_result_dict = cache_results['metadatas'][0][0]

        # Loop through each inner list and then through the dictionary
        for key, value in cache_result_dict.items():
            if 'ids' in key:
                ids.append(value)
            elif 'documents' in key:
                documents.append(value)
            elif 'distances' in key:
                distances.append(value)
            elif 'metadatas' in key:
                metadatas.append(value)

        #print("Found in cache!")

        # Create a DataFrame
        return pd.DataFrame({
          'IDs': ids,
          'Documents': documents,
          'Distances': distances,
          'Metadatas': metadatas
        })


In [37]:
cache_results_df = search(query)
cache_results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs
0,"{'Chunk_No.': 0, 'Page_No.': 'Page 49'}",Payment of benefits will be subject to the Ben...,0.271217,78
1,"{'Chunk_No.': 0, 'Page_No.': 'Page 38'}",Section D - Continuation Article 1 - Member Li...,0.294016,56
2,"{'Chunk_No.': 0, 'Page_No.': 'Page 51'}",Coverage During Disability will cease on the e...,0.296265,82
3,"{'Chunk_No.': 1, 'Page_No.': 'Page 46'}",under his or her Individual Purchase Rights as...,0.298757,73
4,"{'Chunk_No.': 1, 'Page_No.': 'Page 47'}",required more than once each year. The Princip...,0.30099,75
5,"{'Chunk_No.': 0, 'Page_No.': 'Page 50'}",The Principal may require that a ADL Disabled ...,0.302744,80
6,"{'Chunk_No.': 0, 'Page_No.': 'Page 8'}",Section A - Member Life Insurance Schedule of ...,0.307557,6
7,"{'Chunk_No.': 0, 'Page_No.': 'Page 46'}",PART IV - BENEFITS Section A - Member Life Ins...,0.308006,72
8,"{'Chunk_No.': 1, 'Page_No.': 'Page 40'}",less any Accelerated Benefit payment as descri...,0.309722,61
9,"{'Chunk_No.': 1, 'Page_No.': 'Page 39'}",to Active Work within 31 days; or (4) the Memb...,0.328321,59


#### <font color = red>  Re-Ranking with a Cross Encoder

In [38]:
# Import the CrossEncoder library from sentence_transformers

from sentence_transformers import CrossEncoder, util

In [39]:
# Initialise the cross encoder model

cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [40]:
# Test the cross encoder model

scores = cross_encoder.predict([['Does the insurance cover diabetic patients?', 'The insurance policy covers some pre-existing conditions including diabetes, heart diseases, etc. The policy does not howev'],
                                ['Does the insurance cover diabetic patients?', 'The premium rates for various age groups are given as follows. Age group (<18 years): Premium rate']])

In [41]:
scores

array([  3.8467607, -11.252879 ], dtype=float32)

In [42]:
def apply_cross_encoder(query, df):
  cross_inputs = [[query, response] for response in df['Documents']]
  cross_rerank_scores = cross_encoder.predict(cross_inputs)
  df['Reranked_scores'] = cross_rerank_scores
  return df

In [43]:
# Apply Cross Encoder
cache_results_df = apply_cross_encoder(query, cache_results_df)
cache_results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Chunk_No.': 0, 'Page_No.': 'Page 49'}",Payment of benefits will be subject to the Ben...,0.271217,78,1.616178
1,"{'Chunk_No.': 0, 'Page_No.': 'Page 38'}",Section D - Continuation Article 1 - Member Li...,0.294016,56,0.167269
2,"{'Chunk_No.': 0, 'Page_No.': 'Page 51'}",Coverage During Disability will cease on the e...,0.296265,82,-1.497242
3,"{'Chunk_No.': 1, 'Page_No.': 'Page 46'}",under his or her Individual Purchase Rights as...,0.298757,73,-6.586439
4,"{'Chunk_No.': 1, 'Page_No.': 'Page 47'}",required more than once each year. The Princip...,0.30099,75,-2.618375
5,"{'Chunk_No.': 0, 'Page_No.': 'Page 50'}",The Principal may require that a ADL Disabled ...,0.302744,80,-1.73845
6,"{'Chunk_No.': 0, 'Page_No.': 'Page 8'}",Section A - Member Life Insurance Schedule of ...,0.307557,6,-1.504811
7,"{'Chunk_No.': 0, 'Page_No.': 'Page 46'}",PART IV - BENEFITS Section A - Member Life Ins...,0.308006,72,-0.025323
8,"{'Chunk_No.': 1, 'Page_No.': 'Page 40'}",less any Accelerated Benefit payment as descri...,0.309722,61,-1.231255
9,"{'Chunk_No.': 1, 'Page_No.': 'Page 39'}",to Active Work within 31 days; or (4) the Memb...,0.328321,59,-0.863105


In [44]:
# Function to return top 3 results after reranking

def get_topn(n, df):
  top_3_rerank = df.sort_values(by='Reranked_scores', ascending=False)
  return top_3_rerank[["Documents", "Metadatas"]][:n]

In [45]:
top_3_RAG = get_topn(3, cache_results_df)


In [46]:
top_3_RAG

Unnamed: 0,Documents,Metadatas
0,Payment of benefits will be subject to the Ben...,"{'Chunk_No.': 0, 'Page_No.': 'Page 49'}"
1,Section D - Continuation Article 1 - Member Li...,"{'Chunk_No.': 0, 'Page_No.': 'Page 38'}"
7,PART IV - BENEFITS Section A - Member Life Ins...,"{'Chunk_No.': 0, 'Page_No.': 'Page 46'}"


### <font color = red> Approach 2:
**Chunking:** Sentence Based

**`Embedding Model:`** text-embedding-ada-002

#### <font color = red> Chunking and Embedding

In [47]:
# Delete previous collections
client.delete_collection(name="RAG_on_Insurance")
client.delete_collection(name="Insurance_Cache")

# Optionally, clear all collections in ChromaDB
for collection_name in client.list_collections():
    client.delete_collection(name=collection_name)

# Reset variables
documents_list = []
metadata_list = []

In [48]:
# Function to split text into sentence chunks
def split_by_sentence(text, max_sentences=3):
    """Splits text into chunks based on sentences."""
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    return [" ".join(sentences[i:i+max_sentences]) for i in range(0, len(sentences), max_sentences)]

In [49]:
def process_pages(insurance_pdf_data):
    """Processes all pages, chunking them intelligently, and handling short pages."""
    all_chunks = []

    # Iterate through all pages
    for idx, row in insurance_pdf_data.iterrows():
        page_no = row["Page No."]
        page_text = row["Page_Text"]
        metadata = row["Metadata"]

        # Get chunks for the page
        text_chunks = split_by_sentence(page_text)

        # Merge small pages with the next one if possible
        if len(text_chunks) == 1 and idx < len(insurance_pdf_data) - 1:
            next_page_text = insurance_pdf_data.iloc[idx + 1]["Page_Text"]
            merged_text = page_text + " " + next_page_text  # Combine pages
            text_chunks = split_by_sentence(merged_text)

        # Create structured data for each chunk
        for i, chunk in enumerate(text_chunks):
            all_chunks.append({
                "Title": page_no,
                "Chunk Text": chunk,
                "Metadata": {**metadata, "Chunk_No.": i}
            })

    return pd.DataFrame(all_chunks)

In [50]:
# Process pages
sentence_chunk_df = process_pages(insurance_pdf_data)
sentence_chunk_df.head()

Unnamed: 0,Title,Chunk Text,Metadata
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,"{'Page_No.': 'Page 1', 'Chunk_No.': 0}"
1,Page 1,Effective on the later of the Date of Issue of...,"{'Page_No.': 'Page 1', 'Chunk_No.': 1}"
2,Page 1,The Principal is not responsible for the provi...,"{'Page_No.': 'Page 1', 'Chunk_No.': 2}"
3,Page 1,"PRINCIPAL LIFE INSURANCE COMPANY DES MOINES, I...","{'Page_No.': 'Page 1', 'Chunk_No.': 3}"
4,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,"{'Page_No.': 'Page 3', 'Chunk_No.': 0}"


In [51]:
# Set up the embedding function using the OpenAI embedding model

model = "text-embedding-ada-002"
embedding_function = OpenAIEmbeddingFunction(api_key=openai.api_key, model_name=model)

In [52]:
# Initialise a collection in chroma and pass the embedding_function to it so that it used OpenAI embeddings to embed the documents

insurance_collection = client.get_or_create_collection(name='RAG_on_Insurance', embedding_function=embedding_function)

In [53]:
# Convert the page text and metadata from your dataframe to lists to be able to pass it to chroma

documents_list = sentence_chunk_df["Chunk Text"].tolist()
metadata_list = sentence_chunk_df['Metadata'].tolist()

In [54]:
# Add the documents and metadata to the collection alongwith generic integer IDs. You can also feed the metadata information as IDs by combining the policy name and page no.
insurance_collection.add(
    documents= documents_list,
    ids = [str(i) for i in range(0, len(documents_list))],
    metadatas = metadata_list
)

In [55]:
# Let's take a look at the first few entries in the collection
insurance_collection.get(
    ids = ['0','1','2'],
    include = ['embeddings', 'documents', 'metadatas']
)

{'ids': ['0', '1', '2'],
 'embeddings': array([[-0.01971101,  0.01658912, -0.02656871, ..., -0.03769779,
          0.00519226, -0.00018359],
        [-0.00864939,  0.00719046,  0.00579405, ...,  0.00545363,
         -0.00615878,  0.00226135],
        [-0.00649273,  0.00599733, -0.02178461, ..., -0.012244  ,
          0.00627292,  0.00649929]]),
 'documents': ['DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/01/2014 711 HIGH STREET GEORGE RI 02903 GROUP POLICY FOR: RHODE ISLAND JOHN DOE ALL MEMBERS Group Member Life Insurance Print Date: 07/16/2014 POLICY RIDER GROUP INSURANCE POLICY NO: S655 COVERAGE: Life EMPLOYER: RHODE ISLAND JOHN DOE',
  'Effective on the later of the Date of Issue of this Group Policy or March 1, 2005, the following will apply to your Policy: From time to time The Principal may offer or provide certain employer groups who apply for coverage with The Principal a Financial Services Hotline and Grief Support Services or any other value added service for the employees o

In [56]:
cache_collection = client.get_or_create_collection(name='Insurance_Cache', embedding_function=embedding_function)

In [57]:
cache_collection.peek()

{'ids': [],
 'embeddings': array([], dtype=float64),
 'documents': [],
 'uris': None,
 'data': None,
 'metadatas': [],
 'included': [<IncludeEnum.embeddings: 'embeddings'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

#### <font color = red> Semantic Search with Cache
In this section, we will perform a semantic search of a query in the collections embeddings to get several top semantically similar results.

In [58]:
# Read the user query

query = 'What is the life insurance coverage for disability?'

In [59]:
# Searh the Cache collection first
# Query the collection against the user query and best result

cache_results = cache_collection.query(
    query_texts=query,
    n_results=1
)

In [60]:
cache_results

{'ids': [[]],
 'embeddings': None,
 'documents': [[]],
 'uris': None,
 'data': None,
 'metadatas': [[]],
 'distances': [[]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [61]:
results = insurance_collection.query(
query_texts=query,
n_results=10
)
print("Result size is : " + str(len(results.items())))
results.items()

Result size is : 8


dict_items([('ids', [['181', '188', '137', '155', '154', '20', '139', '192', '15', '187']]), ('embeddings', None), ('documents', [["Payment of benefits will be subject to the Beneficiary and Facility of Payment provisions of this PART IV, Section A. Article 6 - Member Life Insurance - Coverage During Disability A Member may be eligible to continue his or her Member Life and Member Accidental Death and Dismemberment Insurance and Dependent Life Insurance coverage during the Member's ADL Disability or Total Disability. a. Coverage Qualification To be qualified for Coverage During Disability, a Member must: (1) become ADL Disabled or Totally Disabled while insured for Member Life Insurance; and (2) become ADL Disabled or Totally Disabled prior to the attainment of age 60; and (3) remain ADL Disabled or Totally Disabled continuously; and (4) be under the regular care and attendance of a Physician; and (5) send proof of ADL Disability or Total Disability to The Principal when required; and 

In [62]:

def search(query):

  # Set a threshold for cache search
  threshold = 0.2

  ids = []
  documents = []
  distances = []
  metadatas = []
  results_df = pd.DataFrame()

  # try to find from cache
  cache_results = cache_collection.query(
      query_texts=query,
      n_results=1
  )

  # If the distance is greater than the threshold, then return the results from the main collection.
  if cache_results['distances'][0] == [] or cache_results['distances'][0][0] > threshold:
        # Query the collection against the user query and return the top 10 results
        results = insurance_collection.query(
        query_texts=query,
        n_results=10
        )

        # Store the query in cache_collection as document w.r.t to ChromaDB so that it can be embedded and searched against later
        # Store retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches to a query in cache
        Keys = []
        Values = []

        size = len(results.items())

        for key, val in results.items():
          if val is None:
            continue
          for i in range(size):
            Keys.append(str(key)+str(i))
            Values.append(str(val[0][i]))


        cache_collection.add(
            documents= [query],
            ids = [query],  # Or if you want to assign integers as IDs 0,1,2,.., then you can use "len(cache_results['documents'])" as will return the no. of queries currently in the cache and assign the next digit to the new query."
            metadatas = dict(zip(Keys, Values))
        )

        #print("Not found in cache. Found in main collection.")

        result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs":results["ids"][0]}
        results_df = pd.DataFrame.from_dict(result_dict)
        return results_df


  # If the distance is, however, less than the threshold, you can return the results from cache

  elif cache_results['distances'][0][0] <= threshold:
        cache_result_dict = cache_results['metadatas'][0][0]

        # Loop through each inner list and then through the dictionary
        for key, value in cache_result_dict.items():
            if 'ids' in key:
                ids.append(value)
            elif 'documents' in key:
                documents.append(value)
            elif 'distances' in key:
                distances.append(value)
            elif 'metadatas' in key:
                metadatas.append(value)

        #print("Found in cache!")

        # Create a DataFrame
        return pd.DataFrame({
          'IDs': ids,
          'Documents': documents,
          'Distances': distances,
          'Metadatas': metadatas
        })


In [63]:
cache_results_df = search(query)
cache_results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs
0,"{'Chunk_No.': 0, 'Page_No.': 'Page 49'}",Payment of benefits will be subject to the Ben...,0.251205,181
1,"{'Chunk_No.': 3, 'Page_No.': 'Page 50'}",Premium will not be charged for Member Life an...,0.253982,188
2,"{'Chunk_No.': 0, 'Page_No.': 'Page 38'}",Section D - Continuation Article 1 - Member Li...,0.290602,137
3,"{'Chunk_No.': 2, 'Page_No.': 'Page 43'}",The Dependent will not be required to submit P...,0.291126,155
4,"{'Chunk_No.': 1, 'Page_No.': 'Page 43'}",(2) If termination is as described in b. (2) a...,0.297987,154
5,"{'Chunk_No.': 0, 'Page_No.': 'Page 8'}",Section A - Member Life Insurance Schedule of ...,0.300341,20
6,"{'Chunk_No.': 2, 'Page_No.': 'Page 38'}",If a Member qualifies for continuation under m...,0.302779,139
7,"{'Chunk_No.': 0, 'Page_No.': 'Page 51'}",Coverage During Disability will cease on the e...,0.308696,192
8,"{'Chunk_No.': 0, 'Page_No.': 'Page 7'}",Section A – Eligibility Member Life Insurance ...,0.310993,15
9,"{'Chunk_No.': 2, 'Page_No.': 'Page 50'}",d. Effective Dates and Premium Waiver Coverage...,0.311734,187


#### <font color = red>  Re-Ranking with a Cross Encoder

In [64]:
# Import the CrossEncoder library from sentence_transformers

from sentence_transformers import CrossEncoder, util

In [65]:
# Initialise the cross encoder model

cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [66]:
# Test the cross encoder model

scores = cross_encoder.predict([['Does the insurance cover diabetic patients?', 'The insurance policy covers some pre-existing conditions including diabetes, heart diseases, etc. The policy does not howev'],
                                ['Does the insurance cover diabetic patients?', 'The premium rates for various age groups are given as follows. Age group (<18 years): Premium rate']])

In [67]:
scores

array([  3.8467607, -11.252879 ], dtype=float32)

In [68]:
def apply_cross_encoder(query, df):
  cross_inputs = [[query, response] for response in df['Documents']]
  cross_rerank_scores = cross_encoder.predict(cross_inputs)
  df['Reranked_scores'] = cross_rerank_scores
  return df

In [69]:
# Apply Cross Encoder
cache_results_df = apply_cross_encoder(query, cache_results_df)
cache_results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Chunk_No.': 0, 'Page_No.': 'Page 49'}",Payment of benefits will be subject to the Ben...,0.251205,181,3.35009
1,"{'Chunk_No.': 3, 'Page_No.': 'Page 50'}",Premium will not be charged for Member Life an...,0.253982,188,2.619668
2,"{'Chunk_No.': 0, 'Page_No.': 'Page 38'}",Section D - Continuation Article 1 - Member Li...,0.290602,137,0.716006
3,"{'Chunk_No.': 2, 'Page_No.': 'Page 43'}",The Dependent will not be required to submit P...,0.291126,155,0.366664
4,"{'Chunk_No.': 1, 'Page_No.': 'Page 43'}",(2) If termination is as described in b. (2) a...,0.297987,154,-1.409757
5,"{'Chunk_No.': 0, 'Page_No.': 'Page 8'}",Section A - Member Life Insurance Schedule of ...,0.300341,20,-0.500916
6,"{'Chunk_No.': 2, 'Page_No.': 'Page 38'}",If a Member qualifies for continuation under m...,0.302779,139,-0.572141
7,"{'Chunk_No.': 0, 'Page_No.': 'Page 51'}",Coverage During Disability will cease on the e...,0.308696,192,-2.410477
8,"{'Chunk_No.': 0, 'Page_No.': 'Page 7'}",Section A – Eligibility Member Life Insurance ...,0.310993,15,-1.904589
9,"{'Chunk_No.': 2, 'Page_No.': 'Page 50'}",d. Effective Dates and Premium Waiver Coverage...,0.311734,187,-1.656612


In [70]:
# Function to return top 3 results after reranking

def get_topn(n, df):
  top_3_rerank = df.sort_values(by='Reranked_scores', ascending=False)
  return top_3_rerank[["Documents", "Metadatas"]][:n]

In [71]:
top_3_RAG = get_topn(3, cache_results_df)


In [72]:
top_3_RAG

Unnamed: 0,Documents,Metadatas
0,Payment of benefits will be subject to the Ben...,"{'Chunk_No.': 0, 'Page_No.': 'Page 49'}"
1,Premium will not be charged for Member Life an...,"{'Chunk_No.': 3, 'Page_No.': 'Page 50'}"
2,Section D - Continuation Article 1 - Member Li...,"{'Chunk_No.': 0, 'Page_No.': 'Page 38'}"


### <font color = red> Approach 3:
**Chunking:** Paragraph Based

**`Embedding Model:`** text-embedding-ada-002

#### <font color = red> Chunking and Embedding

In [73]:
# Delete previous collections
client.delete_collection(name="RAG_on_Insurance")
client.delete_collection(name="Insurance_Cache")

# Optionally, clear all collections in ChromaDB
for collection_name in client.list_collections():
    client.delete_collection(name=collection_name)

# Reset variables
documents_list = []
metadata_list = []

In [74]:
# Function to split text into paragraphs chunks
def split_by_paragraph(text):
    """Splits text into chunks based on paragraphs."""
    return [para.strip() for para in text.split("\n\n") if para.strip()]

In [75]:
def process_pages(insurance_pdf_data):
    """Processes all pages, chunking them intelligently, and handling short pages."""
    all_chunks = []

    # Iterate through all pages
    for idx, row in insurance_pdf_data.iterrows():
        page_no = row["Page No."]
        page_text = row["Page_Text"]
        metadata = row["Metadata"]

        # Get chunks for the page
        text_chunks = split_by_paragraph(page_text)

        # Merge small pages with the next one if possible
        if len(text_chunks) == 1 and idx < len(insurance_pdf_data) - 1:
            next_page_text = insurance_pdf_data.iloc[idx + 1]["Page_Text"]
            merged_text = page_text + " " + next_page_text  # Combine pages
            text_chunks = split_by_paragraph(merged_text)

        # Create structured data for each chunk
        for i, chunk in enumerate(text_chunks):
            all_chunks.append({
                "Title": page_no,
                "Chunk Text": chunk,
                "Metadata": {**metadata, "Chunk_No.": i}
            })

    return pd.DataFrame(all_chunks)

In [76]:
# Process pages
paragraph_chunk_df = process_pages(insurance_pdf_data)
paragraph_chunk_df.head()

Unnamed: 0,Title,Chunk Text,Metadata
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,"{'Page_No.': 'Page 1', 'Chunk_No.': 0}"
1,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,"{'Page_No.': 'Page 3', 'Chunk_No.': 0}"
2,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,"{'Page_No.': 'Page 5', 'Chunk_No.': 0}"
3,Page 6,TABLE OF CONTENTS PART I - DEFINITIONS PART II...,"{'Page_No.': 'Page 6', 'Chunk_No.': 0}"
4,Page 7,Section A – Eligibility Member Life Insurance ...,"{'Page_No.': 'Page 7', 'Chunk_No.': 0}"


In [77]:
# Set up the embedding function using the OpenAI embedding model

model = "text-embedding-ada-002"
embedding_function = OpenAIEmbeddingFunction(api_key=openai.api_key, model_name=model)

In [78]:
# Initialise a collection in chroma and pass the embedding_function to it so that it used OpenAI embeddings to embed the documents

insurance_collection = client.get_or_create_collection(name='RAG_on_Insurance', embedding_function=embedding_function)

In [79]:
# Convert the page text and metadata from your dataframe to lists to be able to pass it to chroma

documents_list = paragraph_chunk_df["Chunk Text"].tolist()
metadata_list = paragraph_chunk_df['Metadata'].tolist()

In [80]:
# Add the documents and metadata to the collection alongwith generic integer IDs. You can also feed the metadata information as IDs by combining the policy name and page no.
insurance_collection.add(
    documents= documents_list,
    ids = [str(i) for i in range(0, len(documents_list))],
    metadatas = metadata_list
)

In [81]:
# Let's take a look at the first few entries in the collection
insurance_collection.get(
    ids = ['0','1','2'],
    include = ['embeddings', 'documents', 'metadatas']
)

{'ids': ['0', '1', '2'],
 'embeddings': array([[-1.33525385e-02,  1.73981488e-02, -1.51693486e-02, ...,
         -2.17071623e-02, -1.15830067e-03,  3.78558319e-03],
        [-8.50887224e-03,  1.22145675e-02,  2.68231030e-04, ...,
         -1.04057547e-02,  7.34364474e-03, -9.99250551e-05],
        [-1.73468161e-02,  1.96003318e-02,  2.24532536e-03, ...,
         -2.23779175e-02, -1.33180059e-02,  2.67441012e-03]]),
 'documents': ['DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/01/2014 711 HIGH STREET GEORGE RI 02903 GROUP POLICY FOR: RHODE ISLAND JOHN DOE ALL MEMBERS Group Member Life Insurance Print Date: 07/16/2014 POLICY RIDER GROUP INSURANCE POLICY NO: S655 COVERAGE: Life EMPLOYER: RHODE ISLAND JOHN DOE Effective on the later of the Date of Issue of this Group Policy or March 1, 2005, the following will apply to your Policy: From time to time The Principal may offer or provide certain employer groups who apply for coverage with The Principal a Financial Services Hotline and Grief Su

In [82]:
cache_collection = client.get_or_create_collection(name='Insurance_Cache', embedding_function=embedding_function)

In [83]:
cache_collection.peek()

{'ids': [],
 'embeddings': array([], dtype=float64),
 'documents': [],
 'uris': None,
 'data': None,
 'metadatas': [],
 'included': [<IncludeEnum.embeddings: 'embeddings'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

#### <font color = red> Semantic Search with Cache
In this section, we will perform a semantic search of a query in the collections embeddings to get several top semantically similar results.

In [84]:
# Read the user query

query = 'What is the life insurance coverage for disability?'

In [85]:
# Searh the Cache collection first
# Query the collection against the user query and best result

cache_results = cache_collection.query(
    query_texts=query,
    n_results=1
)

In [86]:
cache_results

{'ids': [[]],
 'embeddings': None,
 'documents': [[]],
 'uris': None,
 'data': None,
 'metadatas': [[]],
 'distances': [[]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [87]:
results = insurance_collection.query(
query_texts=query,
n_results=10
)
print("Result size is : " + str(len(results.items())))
results.items()

Result size is : 8


dict_items([('ids', [['46', '35', '48', '5', '47', '43', '4', '44', '53', '51']]), ('embeddings', None), ('documents', [["Payment of benefits will be subject to the Beneficiary and Facility of Payment provisions of this PART IV, Section A. Article 6 - Member Life Insurance - Coverage During Disability A Member may be eligible to continue his or her Member Life and Member Accidental Death and Dismemberment Insurance and Dependent Life Insurance coverage during the Member's ADL Disability or Total Disability. a. Coverage Qualification To be qualified for Coverage During Disability, a Member must: (1) become ADL Disabled or Totally Disabled while insured for Member Life Insurance; and (2) become ADL Disabled or Totally Disabled prior to the attainment of age 60; and (3) remain ADL Disabled or Totally Disabled continuously; and (4) be under the regular care and attendance of a Physician; and (5) send proof of ADL Disability or Total Disability to The Principal when required; and (6) submit

In [88]:

def search(query):

  # Set a threshold for cache search
  threshold = 0.2

  ids = []
  documents = []
  distances = []
  metadatas = []
  results_df = pd.DataFrame()

  # try to find from cache
  cache_results = cache_collection.query(
      query_texts=query,
      n_results=1
  )

  # If the distance is greater than the threshold, then return the results from the main collection.
  if cache_results['distances'][0] == [] or cache_results['distances'][0][0] > threshold:
        # Query the collection against the user query and return the top 10 results
        results = insurance_collection.query(
        query_texts=query,
        n_results=10
        )

        # Store the query in cache_collection as document w.r.t to ChromaDB so that it can be embedded and searched against later
        # Store retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches to a query in cache
        Keys = []
        Values = []

        size = len(results.items())

        for key, val in results.items():
          if val is None:
            continue
          for i in range(size):
            Keys.append(str(key)+str(i))
            Values.append(str(val[0][i]))


        cache_collection.add(
            documents= [query],
            ids = [query],  # Or if you want to assign integers as IDs 0,1,2,.., then you can use "len(cache_results['documents'])" as will return the no. of queries currently in the cache and assign the next digit to the new query."
            metadatas = dict(zip(Keys, Values))
        )

        #print("Not found in cache. Found in main collection.")

        result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs":results["ids"][0]}
        results_df = pd.DataFrame.from_dict(result_dict)
        return results_df


  # If the distance is, however, less than the threshold, you can return the results from cache

  elif cache_results['distances'][0][0] <= threshold:
        cache_result_dict = cache_results['metadatas'][0][0]

        # Loop through each inner list and then through the dictionary
        for key, value in cache_result_dict.items():
            if 'ids' in key:
                ids.append(value)
            elif 'documents' in key:
                documents.append(value)
            elif 'distances' in key:
                distances.append(value)
            elif 'metadatas' in key:
                metadatas.append(value)

        #print("Found in cache!")

        # Create a DataFrame
        return pd.DataFrame({
          'IDs': ids,
          'Documents': documents,
          'Distances': distances,
          'Metadatas': metadatas
        })


In [89]:
cache_results_df = search(query)
cache_results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs
0,"{'Chunk_No.': 0, 'Page_No.': 'Page 49'}",Payment of benefits will be subject to the Ben...,0.268416,46
1,"{'Chunk_No.': 0, 'Page_No.': 'Page 38'}",Section D - Continuation Article 1 - Member Li...,0.288877,35
2,"{'Chunk_No.': 0, 'Page_No.': 'Page 51'}",Coverage During Disability will cease on the e...,0.293669,48
3,"{'Chunk_No.': 0, 'Page_No.': 'Page 8'}",Section A - Member Life Insurance Schedule of ...,0.307591,5
4,"{'Chunk_No.': 0, 'Page_No.': 'Page 50'}",The Principal may require that a ADL Disabled ...,0.308653,47
5,"{'Chunk_No.': 0, 'Page_No.': 'Page 46'}",PART IV - BENEFITS Section A - Member Life Ins...,0.309539,43
6,"{'Chunk_No.': 0, 'Page_No.': 'Page 7'}",Section A – Eligibility Member Life Insurance ...,0.331267,4
7,"{'Chunk_No.': 0, 'Page_No.': 'Page 47'}","M ember's death, the Death Benefits Payable ma...",0.331802,44
8,"{'Chunk_No.': 0, 'Page_No.': 'Page 56'}","If a Member sustains an injury, and as a resul...",0.34136,53
9,"{'Chunk_No.': 0, 'Page_No.': 'Page 54'}","f . claim requirements listed in PART IV, Sect...",0.341982,51


#### <font color = red>  Re-Ranking with a Cross Encoder

In [90]:
# Import the CrossEncoder library from sentence_transformers

from sentence_transformers import CrossEncoder, util

In [91]:
# Initialise the cross encoder model

cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [92]:
# Test the cross encoder model

scores = cross_encoder.predict([['Does the insurance cover diabetic patients?', 'The insurance policy covers some pre-existing conditions including diabetes, heart diseases, etc. The policy does not howev'],
                                ['Does the insurance cover diabetic patients?', 'The premium rates for various age groups are given as follows. Age group (<18 years): Premium rate']])

In [93]:
scores

array([  3.8467607, -11.252879 ], dtype=float32)

In [94]:
def apply_cross_encoder(query, df):
  cross_inputs = [[query, response] for response in df['Documents']]
  cross_rerank_scores = cross_encoder.predict(cross_inputs)
  df['Reranked_scores'] = cross_rerank_scores
  return df

In [95]:
# Apply Cross Encoder
cache_results_df = apply_cross_encoder(query, cache_results_df)
cache_results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Chunk_No.': 0, 'Page_No.': 'Page 49'}",Payment of benefits will be subject to the Ben...,0.268416,46,1.616178
1,"{'Chunk_No.': 0, 'Page_No.': 'Page 38'}",Section D - Continuation Article 1 - Member Li...,0.288877,35,0.167269
2,"{'Chunk_No.': 0, 'Page_No.': 'Page 51'}",Coverage During Disability will cease on the e...,0.293669,48,-1.497242
3,"{'Chunk_No.': 0, 'Page_No.': 'Page 8'}",Section A - Member Life Insurance Schedule of ...,0.307591,5,-1.504811
4,"{'Chunk_No.': 0, 'Page_No.': 'Page 50'}",The Principal may require that a ADL Disabled ...,0.308653,47,-1.73845
5,"{'Chunk_No.': 0, 'Page_No.': 'Page 46'}",PART IV - BENEFITS Section A - Member Life Ins...,0.309539,43,-0.025323
6,"{'Chunk_No.': 0, 'Page_No.': 'Page 7'}",Section A – Eligibility Member Life Insurance ...,0.331267,4,-2.980693
7,"{'Chunk_No.': 0, 'Page_No.': 'Page 47'}","M ember's death, the Death Benefits Payable ma...",0.331802,44,-7.231838
8,"{'Chunk_No.': 0, 'Page_No.': 'Page 56'}","If a Member sustains an injury, and as a resul...",0.34136,53,-6.30383
9,"{'Chunk_No.': 0, 'Page_No.': 'Page 54'}","f . claim requirements listed in PART IV, Sect...",0.341982,51,-5.840656


In [96]:
# Function to return top 3 results after reranking

def get_topn(n, df):
  top_3_rerank = df.sort_values(by='Reranked_scores', ascending=False)
  return top_3_rerank[["Documents", "Metadatas"]][:n]

In [97]:
top_3_RAG = get_topn(3, cache_results_df)


In [98]:
top_3_RAG

Unnamed: 0,Documents,Metadatas
0,Payment of benefits will be subject to the Ben...,"{'Chunk_No.': 0, 'Page_No.': 'Page 49'}"
1,Section D - Continuation Article 1 - Member Li...,"{'Chunk_No.': 0, 'Page_No.': 'Page 38'}"
5,PART IV - BENEFITS Section A - Member Life Ins...,"{'Chunk_No.': 0, 'Page_No.': 'Page 46'}"


### <font color = red> Approach 4:
**Chunking:** Semantics Based

**`Embedding Model:`** text-embedding-ada-002

#### <font color = red> Chunking and Embedding

In [99]:
# Delete previous collections
client.delete_collection(name="RAG_on_Insurance")
client.delete_collection(name="Insurance_Cache")

# Optionally, clear all collections in ChromaDB
for collection_name in client.list_collections():
    client.delete_collection(name=collection_name)

# Reset variables
documents_list = []
metadata_list = []

In [100]:
# Function to split text into semantic chunks
def split_by_semantic(text, n_clusters=5):
    """Splits text into semantic chunks using KMeans clustering."""
    sentences = [sent.text for sent in nlp(text).sents]
    embeddings = np.array([nlp(sent).vector for sent in sentences])

    if len(sentences) > n_clusters:
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        labels = kmeans.fit_predict(embeddings)

        clustered_chunks = {i: [] for i in range(n_clusters)}
        for idx, label in enumerate(labels):
            clustered_chunks[label].append(sentences[idx])

        return [" ".join(clustered_chunks[i]) for i in range(n_clusters)]
    else:
        return [" ".join(sentences)]

In [101]:
def process_pages(insurance_pdf_data):
    """Processes all pages, chunking them intelligently, and handling short pages."""
    all_chunks = []

    # Iterate through all pages
    for idx, row in insurance_pdf_data.iterrows():
        page_no = row["Page No."]
        page_text = row["Page_Text"]
        metadata = row["Metadata"]

        # Get chunks for the page
        text_chunks = split_by_semantic(page_text)

        # Merge small pages with the next one if possible
        if len(text_chunks) == 1 and idx < len(insurance_pdf_data) - 1:
            next_page_text = insurance_pdf_data.iloc[idx + 1]["Page_Text"]
            merged_text = page_text + " " + next_page_text  # Combine pages
            text_chunks = split_by_semantic(merged_text)

        # Create structured data for each chunk
        for i, chunk in enumerate(text_chunks):
            all_chunks.append({
                "Title": page_no,
                "Chunk Text": chunk,
                "Metadata": {**metadata, "Chunk_No.": i}
            })

    return pd.DataFrame(all_chunks)

In [102]:
# Process pages
semantic_chunk_df = process_pages(insurance_pdf_data)
semantic_chunk_df.head()

Unnamed: 0,Title,Chunk Text,Metadata
0,Page 1,ALL MEMBERS Group Member Life Insurance Print ...,"{'Page_No.': 'Page 1', 'Chunk_No.': 0}"
1,Page 1,Effective on the later of the Date of Issue of...,"{'Page_No.': 'Page 1', 'Chunk_No.': 1}"
2,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,"{'Page_No.': 'Page 1', 'Chunk_No.': 2}"
3,Page 1,EXCEPT AS SPECIFICALLY DESCRIBED IN THIS RIDER...,"{'Page_No.': 'Page 1', 'Chunk_No.': 3}"
4,Page 1,RHODE ISLAND JOHN DOE,"{'Page_No.': 'Page 1', 'Chunk_No.': 4}"


In [103]:
# Set up the embedding function using the OpenAI embedding model

model = "text-embedding-ada-002"
embedding_function = OpenAIEmbeddingFunction(api_key=openai.api_key, model_name=model)

In [104]:
# Initialise a collection in chroma and pass the embedding_function to it so that it used OpenAI embeddings to embed the documents

insurance_collection = client.get_or_create_collection(name='RAG_on_Insurance', embedding_function=embedding_function)

In [105]:
# Convert the page text and metadata from your dataframe to lists to be able to pass it to chroma

documents_list = semantic_chunk_df["Chunk Text"].tolist()
metadata_list = semantic_chunk_df['Metadata'].tolist()

In [106]:
# Add the documents and metadata to the collection alongwith generic integer IDs. You can also feed the metadata information as IDs by combining the policy name and page no.
insurance_collection.add(
    documents= documents_list,
    ids = [str(i) for i in range(0, len(documents_list))],
    metadatas = metadata_list
)

In [107]:
# Let's take a look at the first few entries in the collection
insurance_collection.get(
    ids = ['0','1','2'],
    include = ['embeddings', 'documents', 'metadatas']
)

{'ids': ['0', '1', '2'],
 'embeddings': array([[-2.24272590e-02,  7.31233216e-04, -4.84769745e-03, ...,
         -2.13618316e-02,  2.83170794e-03,  2.68021744e-04],
        [-6.65525673e-03,  9.92765464e-03,  6.10295311e-03, ...,
          3.65901040e-03, -3.21716769e-03,  6.38169411e-04],
        [-1.33487480e-02,  1.36058349e-02, -1.42782163e-02, ...,
         -3.34081240e-02, -6.68755779e-03, -5.71647761e-05]]),
 'documents': ['ALL MEMBERS Group Member Life Insurance Print Date: 07/16/2014 POLICY RIDER GROUP INSURANCE POLICY NO: S655 COVERAGE: Life EMPLOYER:',
  'Effective on the later of the Date of Issue of this Group Policy or March 1, 2005, the following will apply to your Policy: From time to time The Principal may offer or provide certain employer groups who apply for coverage with The Principal a Financial Services Hotline and Grief Support Services or any other value added service for the employees of that employer group. In addition, The Principal may arrange for third part

In [108]:
cache_collection = client.get_or_create_collection(name='Insurance_Cache', embedding_function=embedding_function)

In [109]:
cache_collection.peek()

{'ids': [],
 'embeddings': array([], dtype=float64),
 'documents': [],
 'uris': None,
 'data': None,
 'metadatas': [],
 'included': [<IncludeEnum.embeddings: 'embeddings'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

#### <font color = red> Semantic Search with Cache
In this section, we will perform a semantic search of a query in the collections embeddings to get several top semantically similar results.

In [110]:
# Read the user query

query = 'What is the life insurance coverage for disability?'

In [111]:
# Searh the Cache collection first
# Query the collection against the user query and best result

cache_results = cache_collection.query(
    query_texts=query,
    n_results=1
)

In [112]:
cache_results

{'ids': [[]],
 'embeddings': None,
 'documents': [[]],
 'uris': None,
 'data': None,
 'metadatas': [[]],
 'distances': [[]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [113]:
results = insurance_collection.query(
query_texts=query,
n_results=10
)
print("Result size is : " + str(len(results.items())))
results.items()

Result size is : 8


dict_items([('ids', [['225', '169', '230', '224', '235', '29', '171', '23', '93', '227']]), ('embeddings', None), ('documents', [["Payment of benefits will be subject to the Beneficiary and Facility of Payment provisions of this PART IV, Section A. Article 6 - Member Life Insurance - Coverage During Disability A Member may be eligible to continue his or her Member Life and Member Accidental Death and Dismemberment Insurance and Dependent Life Insurance coverage during the Member's ADL Disability or Total Disability. Proof of ADL Disability or Total Disability Written proof of ADL Disability or Total Disability must be sent to The Principal within one year of the date ADL Disability or Total Disability begins.", 'Section D - Continuation Article 1 - Member Life Insurance a. Sickness or Injury (Other Than ADL Disability or Total Disability)', "After ADL Disability or Total Disability has continued two years from the date first proof of ADL Disability or Total Disability is received, exam

In [114]:

def search(query):

  # Set a threshold for cache search
  threshold = 0.2

  ids = []
  documents = []
  distances = []
  metadatas = []
  results_df = pd.DataFrame()

  # try to find from cache
  cache_results = cache_collection.query(
      query_texts=query,
      n_results=1
  )

  # If the distance is greater than the threshold, then return the results from the main collection.
  if cache_results['distances'][0] == [] or cache_results['distances'][0][0] > threshold:
        # Query the collection against the user query and return the top 10 results
        results = insurance_collection.query(
        query_texts=query,
        n_results=10
        )

        # Store the query in cache_collection as document w.r.t to ChromaDB so that it can be embedded and searched against later
        # Store retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches to a query in cache
        Keys = []
        Values = []

        size = len(results.items())

        for key, val in results.items():
          if val is None:
            continue
          for i in range(size):
            Keys.append(str(key)+str(i))
            Values.append(str(val[0][i]))


        cache_collection.add(
            documents= [query],
            ids = [query],  # Or if you want to assign integers as IDs 0,1,2,.., then you can use "len(cache_results['documents'])" as will return the no. of queries currently in the cache and assign the next digit to the new query."
            metadatas = dict(zip(Keys, Values))
        )

        #print("Not found in cache. Found in main collection.")

        result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs":results["ids"][0]}
        results_df = pd.DataFrame.from_dict(result_dict)
        return results_df


  # If the distance is, however, less than the threshold, you can return the results from cache

  elif cache_results['distances'][0][0] <= threshold:
        cache_result_dict = cache_results['metadatas'][0][0]

        # Loop through each inner list and then through the dictionary
        for key, value in cache_result_dict.items():
            if 'ids' in key:
                ids.append(value)
            elif 'documents' in key:
                documents.append(value)
            elif 'distances' in key:
                distances.append(value)
            elif 'metadatas' in key:
                metadatas.append(value)

        #print("Found in cache!")

        # Create a DataFrame
        return pd.DataFrame({
          'IDs': ids,
          'Documents': documents,
          'Distances': distances,
          'Metadatas': metadatas
        })


In [115]:
cache_results_df = search(query)
cache_results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs
0,"{'Chunk_No.': 3, 'Page_No.': 'Page 49'}",Payment of benefits will be subject to the Ben...,0.251154,225
1,"{'Chunk_No.': 2, 'Page_No.': 'Page 38'}",Section D - Continuation Article 1 - Member Li...,0.277428,169
2,"{'Chunk_No.': 3, 'Page_No.': 'Page 50'}",After ADL Disability or Total Disability has c...,0.278113,230
3,"{'Chunk_No.': 2, 'Page_No.': 'Page 49'}",a. Coverage Qualification To be qualified for ...,0.281015,224
4,"{'Chunk_No.': 3, 'Page_No.': 'Page 51'}",Coverage During Disability will cease on the e...,0.288639,235
5,"{'Chunk_No.': 4, 'Page_No.': 'Page 8'}",Section A - Member Life Insurance Schedule of ...,0.303325,29
6,"{'Chunk_No.': 4, 'Page_No.': 'Page 38'}",Article 2 - Dependent Insurance - Developmenta...,0.304696,171
7,"{'Chunk_No.': 3, 'Page_No.': 'Page 7'}",Section A – Eligibility Member Life Insurance ...,0.311051,23
8,"{'Chunk_No.': 3, 'Page_No.': 'Page 21'}",c. Dependent Life Insurance,0.311441,93
9,"{'Chunk_No.': 0, 'Page_No.': 'Page 50'}",The Principal may require that a ADL Disabled ...,0.316999,227


#### <font color = red>  Re-Ranking with a Cross Encoder

In [116]:
# Import the CrossEncoder library from sentence_transformers

from sentence_transformers import CrossEncoder, util

In [117]:
# Initialise the cross encoder model

cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [118]:
# Test the cross encoder model

scores = cross_encoder.predict([['Does the insurance cover diabetic patients?', 'The insurance policy covers some pre-existing conditions including diabetes, heart diseases, etc. The policy does not howev'],
                                ['Does the insurance cover diabetic patients?', 'The premium rates for various age groups are given as follows. Age group (<18 years): Premium rate']])

In [119]:
scores

array([  3.8467607, -11.252879 ], dtype=float32)

In [120]:
def apply_cross_encoder(query, df):
  cross_inputs = [[query, response] for response in df['Documents']]
  cross_rerank_scores = cross_encoder.predict(cross_inputs)
  df['Reranked_scores'] = cross_rerank_scores
  return df

In [121]:
# Apply Cross Encoder
cache_results_df = apply_cross_encoder(query, cache_results_df)
cache_results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Chunk_No.': 3, 'Page_No.': 'Page 49'}",Payment of benefits will be subject to the Ben...,0.251154,225,4.128472
1,"{'Chunk_No.': 2, 'Page_No.': 'Page 38'}",Section D - Continuation Article 1 - Member Li...,0.277428,169,2.199855
2,"{'Chunk_No.': 3, 'Page_No.': 'Page 50'}",After ADL Disability or Total Disability has c...,0.278113,230,1.79793
3,"{'Chunk_No.': 2, 'Page_No.': 'Page 49'}",a. Coverage Qualification To be qualified for ...,0.281015,224,1.509498
4,"{'Chunk_No.': 3, 'Page_No.': 'Page 51'}",Coverage During Disability will cease on the e...,0.288639,235,0.433305
5,"{'Chunk_No.': 4, 'Page_No.': 'Page 8'}",Section A - Member Life Insurance Schedule of ...,0.303325,29,-0.182894
6,"{'Chunk_No.': 4, 'Page_No.': 'Page 38'}",Article 2 - Dependent Insurance - Developmenta...,0.304696,171,1.303159
7,"{'Chunk_No.': 3, 'Page_No.': 'Page 7'}",Section A – Eligibility Member Life Insurance ...,0.311051,23,-1.904588
8,"{'Chunk_No.': 3, 'Page_No.': 'Page 21'}",c. Dependent Life Insurance,0.311441,93,-2.345396
9,"{'Chunk_No.': 0, 'Page_No.': 'Page 50'}",The Principal may require that a ADL Disabled ...,0.316999,227,-3.351392


In [122]:
# Function to return top 3 results after reranking

def get_topn(n, df):
  top_3_rerank = df.sort_values(by='Reranked_scores', ascending=False)
  return top_3_rerank[["Documents", "Metadatas"]][:n]

In [123]:
top_3_RAG = get_topn(3, cache_results_df)


In [124]:
top_3_RAG

Unnamed: 0,Documents,Metadatas
0,Payment of benefits will be subject to the Ben...,"{'Chunk_No.': 3, 'Page_No.': 'Page 49'}"
1,Section D - Continuation Article 1 - Member Li...,"{'Chunk_No.': 2, 'Page_No.': 'Page 38'}"
2,After ADL Disability or Total Disability has c...,"{'Chunk_No.': 3, 'Page_No.': 'Page 50'}"


## <font color = red> Retrieval Augmented Generation

In [125]:
# Define the function to generate the response. Provide a comprehensive prompt that passes the user query and the top 3 results to the model

def generate_response(query, top_3_RAG):
    """
    Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
    """
    messages = [
                {"role": "system", "content":  "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
                {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
                                                You have a question asked by the user in '{query}' and you have some search results from an insurance documents in the dataframe '{top_3_RAG}'. These search results are essentially one page of an insurance document that may be relevant to the user query.

                                                The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the source page. The text inside the document may also contain tables in the format of a list of lists where each of the nested lists indicates a row.

                                                Use the documents in '{top_3_RAG}' to answer the query '{query}'. Frame an informative answer and also, use the dataframe to return the relevant page numbers as citations.

                                                Follow the guidelines below when performing the task.
                                                1. Try to provide relevant/accurate numbers if available.
                                                2. You don’t have to necessarily use all the information in the dataframe. Only choose information that is relevant.
                                                3. If the document text has tables with relevant information, please reformat the table and return the final information in a tabular in format.
                                                3. Use the Metadatas columns in the dataframe to retrieve page numbers(s) as citation.
                                                4. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
                                                5. You are a customer facing assistant, so do not provide any information on internal workings, just answer the query directly.

                                                The generated response should answer the query directly addressing the user and avoiding additional information. If you think that the query is not relevant to the document, reply that the query is irrelevant. Provide the final response as a well-formatted and easily readable text along with the citation. Provide your complete response first with all information, and then provide the citations.
                                                Example:
                                                **Answer:**
                                                The accidental death insurance policy covers the following benefits:
                                                1. Lump-sum payment to the beneficiary in the event of the insured person's accidental death.
                                                2. Additional benefits may include coverage for accidental dismemberment, where a partial payment is made based on the loss suffered.

                                                **Citations:**
                                                1. Page 49: Payment of benefits will be subject to the Benefit Maximum.

                                                **Answer:**
                                                The life insurance coverage for disability includes benefits such as Payment of benefits subject to the Benefit Maximum, Continuing Coverage in case of Member Life, and benefits after ADL or Total Disability has occurred.

                                                **Citations:**
                                                1. Page 49: Payment of benefits will be subject to the Benefit Maximum.
                                                2. Page 38: Section D - Continuation Article 1 - Member Life.
                                                3. Page 50: Benefits after ADL Disability or Total Disability has occurred.
                                                """},
              ]

    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages
    )

    return response.choices[0].message.content.split('\n')

In [126]:
# Generate the response

response = generate_response(query, top_3_RAG)

In [127]:
# Print the response

print("\n".join(response))

**Answer:**  
The life insurance coverage for disability includes the following benefits:
1. Payment of benefits subject to the Benefit Maximum.
2. Continuing Coverage in case of Member Life.
3. Benefits provided after ADL Disability or Total Disability has occurred.

**Citations:**  
1. Page 49: Payment of benefits will be subject to the Benefit Maximum.
2. Page 38: Section D - Continuation Article 1 - Member Life.  
3. Page 50: Benefits after ADL Disability or Total Disability has occurred.


## <font color = red> Queries

In [128]:
queries = [
    "What are the conditions for death coverage if the insured was not wearing a seat belt?",
    "What benefits are covered under accidental death insurance?",
    "How can I reinstate my insurance after it was terminated?"
]

for query in queries:
    print(f"\n\033[1m\033[34mQuery: {query}\033[0m\n" + "-" * 50)

    # Step 1: Retrieve chunks from ChromaDB
    df = search(query)
    print("\n\033[1m\033[94mTop 3 Retrieved Chunks (Search Layer):\033[0m")
    print(df.head(3))  # Display the top 3 retrieved results

    # Step 2: Apply cross-encoder for re-ranking
    df = apply_cross_encoder(query, df)

    # Step 3: Select top 3 final results
    df = get_topn(3, df)

    # Step 4: Generate response using LLM
    response = generate_response(query, df)

    print("\n\033[1m\033[94mFinal RAG-generated Response (Generation Layer):\033[0m")
    print("\n".join(response))

    print("=" * 50)  # Separator for better readability


[1m[34mQuery: What are the conditions for death coverage if the insured was not wearing a seat belt?[0m
--------------------------------------------------

[1m[94mTop 3 Retrieved Chunks (Search Layer):[0m
                                 Metadatas  \
0  {'Chunk_No.': 2, 'Page_No.': 'Page 55'}   
1  {'Chunk_No.': 0, 'Page_No.': 'Page 55'}   
2  {'Chunk_No.': 1, 'Page_No.': 'Page 54'}   

                                           Documents  Distances  IDs  
0  Exposure Exposure to the elements will be pres...   0.271287  254  
1  the position of the Seat Belt is certified in ...   0.355079  252  
2  claim requirements listed in PART IV, Section ...   0.371645  248  

[1m[94mFinal RAG-generated Response (Generation Layer):[0m
**Answer:**  
The conditions for death coverage if the insured was not wearing a seat belt are as follows:
- If the insured was not wearing a seat belt at the time of the incident, the death coverage benefits may be impacted or reduced based on the insuran