In [1]:
# Commented-out alternative embedding model id, kept for reference (nothing assigns `transformer_name`):
###transformer_name = 'sentence-transformers/all-MiniLM-L6-v2'
# Model id handed to HuggingFaceEmbeddings to embed both the document chunks and the queries.
embedding_model_name = "sentence-transformers/all-mpnet-base-v2" # TODO(review): original note asks "open source?" - confirm the model's license
# Hugging Face id of the instruction-following LLM.
llm_name = "databricks/dolly-v2-2-8b"

# Folder whose files are loaded as PDFs and indexed.
KB_doc_folder = "KnowledgeBaseDocs"
# Directory where the Chroma vector store is persisted and later re-opened from.
vector_db_path = "AIProjectDB/vector_db"

# chunk_size given to RecursiveCharacterTextSplitter when splitting the documents.
text_chunk_size_nchar = 500

In [2]:
!pip -q install git+https://github.com/huggingface/transformers # need to install from github
!pip -q install accelerate>=0.12.0
!pip install langchain
!pip install transformers
!pip install sentence-transformers
!pip install unstructured
!pip install chromadb==0.3.22 transformers==4.29.0

Collecting transformers==4.29.0
  Using cached transformers-4.29.0-py3-none-any.whl (7.1 MB)
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.32.0.dev0
    Uninstalling transformers-4.32.0.dev0:
      Successfully uninstalled transformers-4.32.0.dev0
Successfully installed transformers-4.29.0


In [3]:
import os
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings

# Commented-out alternative that loads the model directly through sentence-transformers.
# NOTE: `transformer_name` is not defined anywhere (its assignment is also commented out),
# so this line cannot simply be re-enabled.
###model = SentenceTransformer(transformer_name)
# LangChain embedding wrapper for `embedding_model_name`; this object is what gets
# passed to Chroma both when building the index and when re-opening it for queries.
embed_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

In [4]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os.path
from langchain.vectorstores import Chroma

# Every chunk produced from the knowledge-base documents (stays empty when the
# persisted index is reused).
completetextlist = []

# Don't recompute the embeddings if they're already available: the index is only
# built when the persist directory is missing or empty.
if not os.path.isdir(vector_db_path) or len(os.listdir(vector_db_path)) == 0:
    if not os.path.isdir(vector_db_path):
        print(f"creating folder {vector_db_path}")
        os.makedirs(vector_db_path, exist_ok=True)

    # The splitter holds no per-document state, so one instance serves every file.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=text_chunk_size_nchar, chunk_overlap=0)

    # sorted() makes the indexing order independent of the filesystem's listing order.
    for filename in sorted(os.listdir(KB_doc_folder)):
        # "/" is kept (rather than os.path.join) so the `source` metadata stored
        # with each chunk has the same form as in previously built indexes.
        path = KB_doc_folder + "/" + filename
        # The loader only understands PDFs; sub-folders and stray files
        # (e.g. .DS_Store) would otherwise abort the whole build.
        if not (os.path.isfile(path) and filename.lower().endswith(".pdf")):
            print(f"Skipping non-PDF entry {path}")
            continue
        data = UnstructuredPDFLoader(path).load()
        completetextlist.extend(text_splitter.split_documents(data))

    if completetextlist:
        # Embed and persist once for the whole corpus instead of opening a new
        # Chroma client on the same directory for every file.
        print(f"Saving document embeddings under {vector_db_path}")
        db = Chroma.from_documents(collection_name="public_docs", documents=completetextlist, embedding=embed_model, persist_directory=vector_db_path)
        db.persist()
    else:
        print(f"No PDF documents found under {KB_doc_folder}; nothing was indexed")

In [5]:
# Open the "public_docs" collection from the persist directory, so `db` is defined
# for querying even on runs where the index-building cell found an existing index
# and skipped its own `db` assignment.
db = Chroma(collection_name="public_docs", embedding_function=embed_model, persist_directory=vector_db_path)

def get_similar_docs(question, similar_doc_count):
  """Look up `question` in the module-level vector store `db`.

  Args:
    question: Text of the query.
    similar_doc_count: Passed to `similarity_search` as `k`, the number of
      results requested.

  Returns:
    Whatever `db.similarity_search` returns for this query.
  """
  matches = db.similarity_search(question, k=similar_doc_count)
  return matches

# Retrieval check: fetch the single closest chunk for a sample question and
# print it, followed by a blank line and a separator.
sim_doc_count = 1
for doc in get_similar_docs("What is the most cost effective dental plan?", sim_doc_count):
  print(doc, "\n*****************************New Chunk:", sep="\n")

Using embedded DuckDB with persistence: data will be stored in: AIProjectDB/vector_db


page_content='“Affordable dental and vision plans?”\n\n2016 Dental & Vision Plans for Individuals & Families\n\nHORIZON DENTAL\n\nYour sight and smile are important parts of your total health.\n\nThat’s why Horizon Blue Cross Blue Shield of New Jersey offers affordable vision and dental plans, so you can get the total coverage and savings you need for yourself or your whole family.' metadata={'source': 'KnowledgeBaseDocs/HorizonDentalandVisionPlans.pdf'}

*****************************New Chunk:


In [6]:
# Retrieval check with three results: each chunk is printed, then a blank line
# and a separator.
for doc in get_similar_docs("What is the waiting period before vision claims are paid?", 3):
  print(doc, end="\n\n*****************************New Chunk:\n")

page_content='There is a 7-day waiting period after your “effective date” (the date your coverage begins) before vision claims will be paid. All Horizon Vision plans can be purchased with a medical plan or by themselves. You must have a primary residence in New Jersey and be age 19 or older.\n\nLearn more about Horizon Vision plans and enroll online at HorizonBlue.com/Plans.\n\nHORIZON DENTAL & VISION\n\nCall 1-844-826-5528 to learn more or enroll online at HorizonBlue.com/Plans.\n\nGET ANSWERS & ENROLL' metadata={'source': 'KnowledgeBaseDocs/HorizonDentalandVisionPlans.pdf'}

*****************************New Chunk:
page_content='Horizon Dental plans do not have waiting periods for diagnostic or preventive coverage. However, some dental plans have waiting periods for certain coverage and procedures. For example, if you enroll in Horizon Healthy Smiles, you would have to wait 6 months before amalgam (silver) fillings would be covered. See the dental plan guide for details.\n\nE F P T O 

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from langchain import PromptTemplate
from langchain.llms import HuggingFacePipeline
from langchain.chains.question_answering import load_qa_chain

def build_qa_chain(model_name=None):
  """Build the "stuff" question-answering chain on top of a local Hugging Face pipeline.

  Args:
    model_name: Hugging Face model id to load. When omitted, the notebook-level
      `llm_name` setting is used, so the model is configured in one place
      instead of being hard-coded here a second time.

  Returns:
    The chain created by `load_qa_chain`. It is invoked with a dict holding
    "input_documents" (the retrieved chunks) and "question".
  """
  if model_name is None:
    model_name = llm_name

  torch.cuda.empty_cache()

  instruct_pipeline = pipeline(model=model_name, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto", return_full_text=True, max_new_tokens=256, top_p=0.95, top_k=50)

  # Prompt content. LangChain substitutes the retrieved documents for {context}
  # and the user's question for {question}.
  # The text is written out line by line so the exact whitespace sent to the
  # model is visible; the resulting string is unchanged.
  template = (
      "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n"
      " \n"
      "  Instruction:\n"
      "  You are a customer service representative at a health insurance company and your job is to help providing the best accurate answer to a customer. \n"
      "  Use only information in the following paragraphs to answer the question at the end. Explain the answer with reference to these paragraphs. If you don't know, say that you do not know.\n"
      " \n"
      "  {context}\n"
      " \n"
      "  Question: {question}\n"
      " \n"
      "  Response:\n"
      "  "
  )
  prompt = PromptTemplate(input_variables=['context', 'question'], template=template)

  hf_pipe = HuggingFacePipeline(pipeline=instruct_pipeline)
  return load_qa_chain(llm=hf_pipe, chain_type="stuff", prompt=prompt, verbose=True)

# Build the chain once at cell level. build_qa_chain() constructs the Hugging Face
# pipeline for the LLM, and answer_question() reads this module-level `qa_chain`.
qa_chain = build_qa_chain()

In [8]:
def session():
    """Ask which plan the customer has and echo the matching search scope.

    The answer is matched case-insensitively, ignoring surrounding whitespace,
    against the short codes and the full plan names.

    Returns:
        "braven", "medicare", "general" or "all" when the answer is recognised
        (the same word is printed); otherwise None, after printing an apology.
        Callers that ignore the return value see the same output as before.
    """
    plan = input("What plan does the customer have? Enter BR for braven, MED for medicare, GEN for general, or ALL for everything.\n")

    # Normalise once instead of calling .upper() in every comparison; strip() keeps
    # an answer such as "br " from being rejected.
    choice = plan.strip().upper()

    scopes = {
        "BR": "braven",
        "BRAVEN": "braven",
        "MED": "medicare",
        "MEDICARE": "medicare",
        "GEN": "general",
        "GENERAL": "general",
        "ALL": "all",
    }
    scope = scopes.get(choice)
    if scope is None:
        print("Sorry, I'm not sure what you're saying")
        return None

    # TODO: restrict the document search to the chosen plan; for now the scope
    # is only echoed and returned.
    print(scope)
    return scope

In [9]:
def continueprompt():
    """Ask whether the user wants a follow-up question until the reply is clear.

    Returns:
        True for "Y"/"YES", False for "N"/"NO" (any letter case). Any other
        reply prints an apology and the question is asked again.
    """
    affirmative = ("Y", "YES")
    negative = ("N", "NO")
    while True:
        reply = input("Would you like to ask a related question? Enter Y for yes and N for no.").upper()
        if reply in affirmative:
            return True
        if reply in negative:
            return False
        print("I'm sorry, I didn't understand. Would you like to ask a related question? Enter Y for yes and N for no.")

In [10]:
def answer_question(question):
    """Answer `question` from the knowledge base, then offer follow-up questions.

    For every question asked, this:
      1. asks for the customer's plan via session();
      2. retrieves the two most similar chunks with get_similar_docs();
      3. runs the module-level `qa_chain` over those chunks;
      4. prints each distinct source, then the question and the answer.
    It repeats for as long as continueprompt() returns True.

    Args:
        question: The first question to answer.

    Returns:
        None. All results are printed.
    """
    # A loop rather than recursion: a long conversation no longer deepens the
    # call stack with every follow-up question.
    while True:
        session()
        similar_docs = get_similar_docs(question, similar_doc_count=2)
        result = qa_chain({"input_documents": similar_docs, "question": question})

        # Show every distinct source in retrieval order. Printing after the loop,
        # as before, showed only the last one and raised NameError when no
        # document came back.
        sources = dict.fromkeys(
            d.metadata.get("source", "<unknown source>") for d in result["input_documents"]
        )
        for source_id in sources:
            print(source_id)

        # Put the answer on its own line instead of gluing it to the question.
        print(f"{question}\n{result['output_text']}")

        if not continueprompt():
            break
        question = input("Please enter your next question.")

In [None]:
# Example query: eligibility for a Horizon Dental plan.
# Interactive cell: answer_question() prompts through input() for the plan and follow-ups.
answer_question("What is the criteria to buy a Horizon Dental insurance plan?")

In [None]:
# Example query: contact phone number for dental plans (prompts for input while running).
answer_question("What is a phone number for dental plans?")

In [None]:
# Example query: dental coverage for children (prompts for input while running).
answer_question("Is there a dental plan for kids?")

In [None]:
# Example query: copay for an MRI (prompts for input while running).
answer_question("Will I owe a copay if I get an MRI?")

In [None]:
# Example query: copay for lab services (prompts for input while running).
answer_question("Will I owe a copay for lab services?")

In [None]:
# Example query: coverage of physical therapy (prompts for input while running).
answer_question("My doctor recommended physical therapy. Would it be covered?")

In [None]:
# Example query: worldwide coverage under a Braven plan (prompts for input while running).
# NOTE(review): the plan entered at the session() prompt is currently only echoed, so it
# does not narrow which documents are searched.
answer_question("I am travelling out of the country and would like to know if I have worldwide coverage with Braven.")