In [None]:
!pip install pdfplumber
!pip install tiktoken
!pip install openai
!pip install chromaDB
!pip install sentence-transformers
!pip install PyPDF2


In [2]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import pdfplumber
from pathlib import Path
import pandas as pd
from operator import itemgetter
import json
import tiktoken
import openai
import chromadb
import re

In [3]:
# Open the PDF file
with pdfplumber.open("/content/Principal-Sample-Life-Insurance-Policy.pdf") as pdf:

    # Get one of the pages from the PDF and examine it
    single_page = pdf.pages[6]

    # Extract text from the first page
    text = single_page.extract_text()

    # Extract tables from the first page
    tables = single_page.extract_tables()

    # Print the extracted text
    print(text)

Section A – Eligibility
Member Life Insurance Article 1
Member Accidental Death and Dismemberment Insurance Article 2
Dependent Life Insurance Article 3
Section B - Effective Dates
Member Life Insurance Article 1
Member Accidental Death and Dismemberment Insurance Article 2
Dependent Life Insurance Article 3
Section C - Individual Terminations
Member Life Insurance Article 1
Member Accidental Death and Dismemberment Insurance Article 2
Dependent Life Insurance Article 3
Termination for Fraud Article 4
Coverage While Outside of the United States Article 5
Section D - Continuation
Member Life Insurance Article 1
Dependent Insurance - Developmentally Disabled or
Physically Handicapped Children Article 2
Section E - Reinstatement
Reinstatement Article 1
Federal Required Family and Medical Leave Act (FMLA) Article 2
Reinstatement of Coverage for a Member or Dependent When
Coverage Ends due to Living Outside of the United States Article 3
Section F - Individual Purchase Rights
Member Life In

In [4]:
pdf_path='/content/Principal-Sample-Life-Insurance-Policy.pdf'

In [6]:
extracted_text=extract_text_from_pdf(pdf_path)

In [None]:
def extract_pdf_sections_to_df(pdf_file):
    sections = []
    current_section = None
    section_number = 0

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            lines = text.split('\n')

            for line in lines:
                if re.match(r'^PART [IVXLCDM]+ - .+$', line):
                    if current_section:
                        sections.append(current_section)
                    section_number += 1
                    current_section = {"section_no": section_number, "section_name": line, "content": []}
                elif current_section:
                    current_section["content"].append(line)

    if current_section:
        sections.append(current_section)

    # Create a DataFrame
    df = pd.DataFrame(sections)

    # Post-process the section content for "PART IV - BENEFITS"
    for i, section in df.iterrows():
        if "PART IV - BENEFITS" in section['section_name']:
            content = "\n".join(section['content'])
            subsections = re.split(r'(,?This policy has been updated effective [^,]+,)', content)
            section_name = "PART IV - BENEFITS"
            for j, subsection in enumerate(subsections):
                if subsection.strip():
                    df = df.append({"section_no": len(df) + j + 1, "section_name": section_name, "content": subsection}, ignore_index=True)
            df.drop(index=i, inplace=True)

    return df


pdf_path = '/content/Principal-Sample-Life-Insurance-Policy.pdf'
df = extract_pdf_sections_to_df(pdf_path)




In [6]:
df.head()

Unnamed: 0,section_no,section_name,content
0,1,PART I - DEFINITIONS,[]
1,2,PART II - POLICY ADMINISTRATION,"[Section A – Contract, Entire Contract Article..."
2,3,PART III - INDIVIDUAL REQUIREMENTS AND RIGHTS,[This policy has been updated effective Januar...
3,5,PART II - POLICY ADMINISTRATION,"[Section A - Contract, Article 1 - Entire Cont..."
4,6,PART II - POLICY ADMINISTRATION,"[GC 6003 Section A - Contract, Page 1, a. be a..."


In [7]:
df['Document Name']=pdf_path.split('/')[2]

In [8]:
# Store the metadata for each page in a separate column

df['Metadata'] = df.apply(lambda x: {'Policy_Name': x['Document Name'][:-4], 'Section_No.': x['section_no']}, axis=1)

In [9]:
df['Metadata'][1]

{'Policy_Name': 'Principal-Sample-Life-Insurance-Policy', 'Section_No.': 2}

In [10]:
with open("/content/openapi_key.txt", "r") as f:
  openai.api_key = ' '.join(f.readlines())

In [11]:
# Import the OpenAI Embedding Function into chroma

from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

In [12]:
# Define the path where chroma collections will be stored

chroma_data_path = '/content/'

In [13]:
# Call PersistentClient()

client = chromadb.PersistentClient(path=chroma_data_path)

In [14]:
# Set up the embedding function using the OpenAI embedding model

model = "text-embedding-ada-002"
embedding_function = OpenAIEmbeddingFunction(api_key=openai.api_key, model_name=model)

In [15]:
# Initialise a collection in chroma and pass the embedding_function to it so that it used OpenAI embeddings to embed the documents

policy_collection = client.get_or_create_collection(name='RAG_on_Policy', embedding_function=embedding_function)

In [16]:
# Convert the page text and metadata from your dataframe to lists to be able to pass it to chroma

documents_list = df["content"].tolist()
metadata_list = df['Metadata'].tolist()

In [17]:
# Ensure all elements in documents_list are text strings
documents_list = [str(document) for document in documents_list]

# Add the text content of the documents to the collection
policy_collection.add(
    documents=documents_list,
    ids=[str(i) for i in range(len(documents_list))],  # Use integer IDs
    metadatas=metadata_list
)


In [112]:
results_df

Unnamed: 0,IDs,Documents,Distances,Metadatas,Reranked_scores
0,4,"['GC 6003 Section A - Contract, Page 1', 'a. b...",0.2918824553489685,{'Policy_Name': 'Principal-Sample-Life-Insuran...,3.7065
1,9,"['GC 6004 Section B - Premiums, Page 2', 'The ...",0.3092226386070251,{'Policy_Name': 'Principal-Sample-Life-Insuran...,-3.816668
2,8,"['GC 6004 Section B - Premiums, Page 1', 'b . ...",0.3197951316833496,{'Policy_Name': 'Principal-Sample-Life-Insuran...,0.486126
3,74,"GC 6013 Section A - Member Life Insurance, Pag...",0.3464842140674591,{'Policy_Name': 'Principal-Sample-Life-Insuran...,-3.257565
4,71,"GC 6013 Section A - Member Life Insurance, Pag...",0.3507596850395202,{'Policy_Name': 'Principal-Sample-Life-Insuran...,-1.551533
5,77,GC 6015 Section B - Member Accidental Death an...,0.3532353639602661,{'Policy_Name': 'Principal-Sample-Life-Insuran...,-2.070832
6,53,Section A - Member Life Insurance\nArticle 1 -...,0.3536171913146972,{'Policy_Name': 'Principal-Sample-Life-Insuran...,-3.336583
7,92,GC 6015 Section B - Member Accidental Death an...,0.3635528683662414,{'Policy_Name': 'Principal-Sample-Life-Insuran...,-4.79327
8,7,"['GC 6003 Section A - Contract, Page 4', 'Sect...",0.3641311526298523,{'Policy_Name': 'Principal-Sample-Life-Insuran...,-3.427002
9,56,"GC 6013 Section A - Member Life Insurance, Pag...",0.3712063431739807,{'Policy_Name': 'Principal-Sample-Life-Insuran...,-6.442597


In [18]:
# Let's take a look at the first few entries in the collection

policy_collection.get(
    ids = ['0','1','2'],
    include = ['embeddings', 'documents', 'metadatas']
)

{'ids': ['0', '1', '2'],
 'embeddings': [[-0.020535271614789963,
   -0.018955636769533157,
   -0.005249145440757275,
   -0.016481338068842888,
   -0.03187230974435806,
   0.028125913813710213,
   -0.02668606862425804,
   -0.041489917784929276,
   -0.02566559612751007,
   -0.0419931635260582,
   0.015586677007377148,
   0.01223868690431118,
   -0.01250428892672062,
   -0.01961265318095684,
   -0.005388936493545771,
   0.009988054633140564,
   0.029132407158613205,
   -0.007569673005491495,
   0.02491072565317154,
   -0.012581174261868,
   -0.017110396176576614,
   0.01680285669863224,
   -0.006521242205053568,
   -0.020772917196154594,
   -0.018382493406534195,
   0.0048752049915492535,
   0.00814281590282917,
   -0.02943994849920273,
   -0.010561197064816952,
   -0.015418928116559982,
   0.015293116681277752,
   -0.0014048977755010128,
   -0.0067763603292405605,
   -0.025106431916356087,
   -0.02073097974061966,
   -0.008107868023216724,
   -0.0090444665402174,
   -0.004959079436957836

In [19]:
cache_collection = client.get_or_create_collection(name='Policy_Cache', embedding_function=embedding_function)

In [20]:
cache_collection.peek()

{'ids': [], 'embeddings': [], 'metadatas': [], 'documents': []}

In [159]:
# Read the user query

query = input()

What happens if a beneficiary is found guilty of the Member's death regarding benefit payment?


In [160]:
# Searh the Cache collection first
# Query the collection against the user query and return the top 20 results

cache_results = cache_collection.query(
    query_texts=query,
    n_results=1
)

In [161]:
# Implementing Cache in Semantic Search

# Set a threshold for cache search
threshold = 0.2

ids = []
documents = []
distances = []
metadatas = []
results_df = pd.DataFrame()


# If the distance is greater than the threshold, then return the results from the main collection.

if cache_results['distances'][0] == [] or cache_results['distances'][0][0] > threshold:
      # Query the collection against the user query and return the top 10 results
      results = policy_collection.query(
      query_texts=query,
      n_results=10
      )

      # Store the query in cache_collection as document w.r.t to ChromaDB so that it can be embedded and searched against later
      # Store retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches to a query in cache
      Keys = []
      Values = []

      for key, val in results.items():
        if key != 'embeddings':
          for i in range(10):
            Keys.append(str(key)+str(i))
            Values.append(str(val[0][i]))


      cache_collection.add(
          documents= [query],
          ids = [query],  # Or if you want to assign integers as IDs 0,1,2,.., then you can use "len(cache_results['documents'])" as will return the no. of queries currently in the cache and assign the next digit to the new query."
          metadatas = dict(zip(Keys, Values))
      )

      print("Not found in cache. Found in main collection.")

      result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs":results["ids"][0]}
      results_df = pd.DataFrame.from_dict(result_dict)
      results_df


# If the distance is, however, less than the threshold, you can return the results from cache

elif cache_results['distances'][0][0] <= threshold:
      cache_result_dict = cache_results['metadatas'][0][0]

      # Loop through each inner list and then through the dictionary
      for key, value in cache_result_dict.items():
          if 'ids' in key:
              ids.append(value)
          elif 'documents' in key:
              documents.append(value)
          elif 'distances' in key:
              distances.append(value)
          elif 'metadatas' in key:
              metadatas.append(value)

      print("Found in cache!")

      # Create a DataFrame
      results_df = pd.DataFrame({
        'IDs': ids,
        'Documents': documents,
        'Distances': distances,
        'Metadatas': metadatas
      })


Not found in cache. Found in main collection.


In [162]:
results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs
0,{'Policy_Name': 'Principal-Sample-Life-Insuran...,"GC 6013 Section A - Member Life Insurance, Pag...",0.254876,56
1,{'Policy_Name': 'Principal-Sample-Life-Insuran...,"GC 6013 Section A - Member Life Insurance, Pag...",0.273604,59
2,{'Policy_Name': 'Principal-Sample-Life-Insuran...,"GC 6016 Section C - Dependent Life Insurance, ...",0.273691,95
3,{'Policy_Name': 'Principal-Sample-Life-Insuran...,"['GC 6013 Section A - Member Life Insurance, P...",0.288896,36
4,{'Policy_Name': 'Principal-Sample-Life-Insuran...,GC 6015 Section B - Member Accidental Death an...,0.2947,77
5,{'Policy_Name': 'Principal-Sample-Life-Insuran...,['GC 6016 Section C - Dependent Life Insurance...,0.302508,42
6,{'Policy_Name': 'Principal-Sample-Life-Insuran...,GC 6015 Section B - Member Accidental Death an...,0.309415,86
7,{'Policy_Name': 'Principal-Sample-Life-Insuran...,['GC 6015 Section B - Member Accidental Death ...,0.326645,39
8,{'Policy_Name': 'Principal-Sample-Life-Insuran...,"GC 6013 Section A - Member Life Insurance, Pag...",0.33051,71
9,{'Policy_Name': 'Principal-Sample-Life-Insuran...,"GC 6013 Section A - Member Life Insurance, Pag...",0.340633,68


In [25]:
# Import the CrossEncoder library from sentence_transformers

from sentence_transformers import CrossEncoder, util

In [26]:
# Initialise the cross encoder model

cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

Downloading (…)lve/main/config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [27]:
# Test the cross encoder model

scores = cross_encoder.predict([['What does "Date of Issue" refer to?', 'For the purposes of this Group Policy, the term "spouse" will include Civil Union,Partner, except as otherwise provided in this Group Policy.,Date of Issue,The date this Group Policy is placed in force: November 1, 2007'],
                                ['What does "Date of Issue" refer to?', "Dependent,a. A Member's spouse, if that spouse:,(1) is legally married to the Member;"]])

In [28]:
scores

array([  0.37614805, -10.741692  ], dtype=float32)

In [163]:
# Input (query, response) pairs for each of the top 20 responses received from the semantic search to the cross encoder
# Generate the cross_encoder scores for these pairs

cross_inputs = [[query, response] for response in results_df['Documents']]
cross_rerank_scores = cross_encoder.predict(cross_inputs)

In [164]:
cross_rerank_scores

array([ 3.985478 ,  3.1822052,  4.6619864,  3.021525 ,  1.3928044,
        2.4622703, -2.215438 ,  0.9788705, -1.6525159, -5.8169374],
      dtype=float32)

In [165]:
# Store the rerank_scores in results_df

results_df['Reranked_scores'] = cross_rerank_scores

In [166]:
results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,{'Policy_Name': 'Principal-Sample-Life-Insuran...,"GC 6013 Section A - Member Life Insurance, Pag...",0.254876,56,3.985478
1,{'Policy_Name': 'Principal-Sample-Life-Insuran...,"GC 6013 Section A - Member Life Insurance, Pag...",0.273604,59,3.182205
2,{'Policy_Name': 'Principal-Sample-Life-Insuran...,"GC 6016 Section C - Dependent Life Insurance, ...",0.273691,95,4.661986
3,{'Policy_Name': 'Principal-Sample-Life-Insuran...,"['GC 6013 Section A - Member Life Insurance, P...",0.288896,36,3.021525
4,{'Policy_Name': 'Principal-Sample-Life-Insuran...,GC 6015 Section B - Member Accidental Death an...,0.2947,77,1.392804
5,{'Policy_Name': 'Principal-Sample-Life-Insuran...,['GC 6016 Section C - Dependent Life Insurance...,0.302508,42,2.46227
6,{'Policy_Name': 'Principal-Sample-Life-Insuran...,GC 6015 Section B - Member Accidental Death an...,0.309415,86,-2.215438
7,{'Policy_Name': 'Principal-Sample-Life-Insuran...,['GC 6015 Section B - Member Accidental Death ...,0.326645,39,0.978871
8,{'Policy_Name': 'Principal-Sample-Life-Insuran...,"GC 6013 Section A - Member Life Insurance, Pag...",0.33051,71,-1.652516
9,{'Policy_Name': 'Principal-Sample-Life-Insuran...,"GC 6013 Section A - Member Life Insurance, Pag...",0.340633,68,-5.816937


In [167]:
# Return the top 3 results from semantic search

top_3_semantic = results_df.sort_values(by='Distances')
top_3_semantic[:3]

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,{'Policy_Name': 'Principal-Sample-Life-Insuran...,"GC 6013 Section A - Member Life Insurance, Pag...",0.254876,56,3.985478
1,{'Policy_Name': 'Principal-Sample-Life-Insuran...,"GC 6013 Section A - Member Life Insurance, Pag...",0.273604,59,3.182205
2,{'Policy_Name': 'Principal-Sample-Life-Insuran...,"GC 6016 Section C - Dependent Life Insurance, ...",0.273691,95,4.661986


In [168]:
# Return the top 3 results after reranking

top_3_rerank = results_df.sort_values(by='Reranked_scores', ascending=False)
top_3_rerank[:3]

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
2,{'Policy_Name': 'Principal-Sample-Life-Insuran...,"GC 6016 Section C - Dependent Life Insurance, ...",0.273691,95,4.661986
0,{'Policy_Name': 'Principal-Sample-Life-Insuran...,"GC 6013 Section A - Member Life Insurance, Pag...",0.254876,56,3.985478
1,{'Policy_Name': 'Principal-Sample-Life-Insuran...,"GC 6013 Section A - Member Life Insurance, Pag...",0.273604,59,3.182205


In [169]:
top_3_RAG = top_3_rerank[["Documents", "Metadatas"]][:3]

In [170]:
top_3_RAG

Unnamed: 0,Documents,Metadatas
2,"GC 6016 Section C - Dependent Life Insurance, ...",{'Policy_Name': 'Principal-Sample-Life-Insuran...
0,"GC 6013 Section A - Member Life Insurance, Pag...",{'Policy_Name': 'Principal-Sample-Life-Insuran...
1,"GC 6013 Section A - Member Life Insurance, Pag...",{'Policy_Name': 'Principal-Sample-Life-Insuran...


In [None]:
# Define the function to generate the response. Provide a comprehensive prompt that passes the user query and the top 3 results to the model

def generate_response(query, results_df):
    """
    Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
    """
    messages = [
                {"role": "system", "content":  "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
                {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
                                                You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents in the dataframe '{top_3_RAG}'. These search results are essentially sections of the insurance document that is relevant to the user query.

                                                The column 'Documents' inside this dataframe contains the actual text from the policy document and the column 'Metadatas' contains the policy name and source section no.

                                                Use the Documents in '{top_3_RAG}' to answer the query '{query}'. Frame an informative answer from the 1st row in Documents column in {top_3_RAG} based on query and also, return the relevant policy names and section numbers as citations.

                                                Follow the guidelines below when performing the task.
                                                1. Provide relevant numbers or answers.
                                                2. Use all the information from the Documents column in {top_3_RAG} and then provide the shortlisted and concise answer based on {query}.
                                                3. Use the Metadatas columns in the dataframe to retrieve and cite the policy name(s) and section numbers(s) as citation.
                                                4. You are a customer facing assistant, so do not provide any information on internal workings, just answer the query directly.

                                                The generated response should answer the query directly addressing the user and avoiding additional information. If you think that the query is not relevant to the document, reply that the query is irrelevant. Provide the final response as a well-formatted and easily readable text, it should have the answer to the query from the documents in {top_3_RAG} along with the citation. Provide your complete response first with all information, and then provide the citations.

                                                """},
              ]

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
    )

    return response['choices'][0]['message']['content'].split('\n')

In [131]:
def generate_response(query, results_df):
    """
    Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
    """
    # Combine relevant information from the results_df with the user's query.
    documents = top_3_RAG['Documents'][:2].tolist()  # Extract the content of the Documents column
    prompt = f"You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents. Use the documents to answer the query '{query}'."

    # Prepare the messages input for the model.
    messages = [
        {"role": "system", "content": "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": "\n".join(documents)}  # Include the documents in the conversation
    ]

    # Generate a response from the model.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=150  # You can adjust the max_tokens to limit the response length
    )

    return response['choices'][0]['message']['content'].split('\n')


In [171]:
# Generate the response

response = generate_response(query, top_3_RAG)

In [172]:
response="\n".join(response)

In [173]:
response

"If a beneficiary is found guilty of the member's death, they may be disqualified from receiving any benefit due. According to GC 6016 Section C - Dependent Life Insurance and GC 6013 Section A - Member Life Insurance, if the member is suspected or charged with the dependent's death, the death benefits payable may be withheld until additional information has been received or the trial has been held. If the member is found guilty, they may be disqualified from receiving any benefit due, and payment may then be made to any contingent beneficiary or to the executor or administrator of the dependent's estate."