## Communicating with your own knowledge base using `OpenAI`

In [1]:
import credentials
import time
import re
import shutil
import glob
import os
os.environ["OPENAI_API_KEY"] = credentials.openai_api

import openai
import tiktoken

from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from llama_index import StorageContext, load_index_from_storage

# for low-level API calls
from llama_index import VectorStoreIndex, ResponseSynthesizer
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.indices.postprocessor import SimilarityPostprocessor

# langchain helpers

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS, Chroma
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, Language
from langchain.document_loaders import PyPDFDirectoryLoader, PyPDFLoader, UnstructuredHTMLLoader

### VectorDB from documents

In [2]:
# Folder that holds the raw PDF / HTML files to be indexed.
path = '../docs/docs_to_index/'
docs = []

# Alternative kept for reference: PyPDFDirectoryLoader(path).load() reads the whole
# folder in one call, but the splitting could not be configured that way.

# --- PDF files: loaded one by one so the splitter below is applied to each ---
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
list_of_pdf_files = glob.glob(path + "*.pdf")
docs.extend(
    pdf_chunk
    for pdf_file in list_of_pdf_files
    for pdf_chunk in PyPDFLoader(pdf_file).load_and_split(text_splitter=text_splitter)
)

# --- HTML files: same chunk size / overlap, splitter built via from_language(Language.MARKDOWN) ---
html_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN,
    chunk_size=1500,
    chunk_overlap=200,
)
list_of_html_files = glob.glob(path + "*.html")
docs.extend(
    html_chunk
    for html_file in list_of_html_files
    for html_chunk in UnstructuredHTMLLoader(html_file).load_and_split(text_splitter=html_splitter)
)


In [13]:
# Before the smaller chunks were introduced there were only 206 docs (206 pages in total).
# With the smaller chunks there are more docs, which should make the similarity search more relevant.
len(docs)

649

In [10]:
# Context trick: prefix every chunk with the name of the document it came from,
# so the document name is part of the text that gets embedded and retrieved.

for i in docs:
    # Sources may use "\" (Windows) or "/" (POSIX) separators; normalise before taking
    # the last path component, otherwise a "../docs/..." path yields an empty name.
    file_name = i.metadata['source'].replace("\\", "/").rsplit("/", 1)[-1]
    prefix = file_name.split('.')[0] + ' --- '
    # Re-running this cell must not stack the prefix a second time.
    if not i.page_content.startswith(prefix):
        i.page_content = prefix + i.page_content

In [32]:
### The data sources are heterogeneous: chunks that carry no 'page' entry in their
### metadata get the placeholder 'ALL' so that every chunk exposes the key.

for chunk in docs:
    chunk.metadata.setdefault('page', 'ALL')

Apply OpenAI Embedder

In [33]:
embeddings = OpenAIEmbeddings()
persist_dir = "../docs/docs_to_index_vectorDB/"

# Build the new index first: if embedding fails, the previously persisted
# index is still on disk instead of having been deleted up front.
db = FAISS.from_documents(docs, embeddings)

# Replace whatever was persisted before with a clean, existing directory.
if os.path.exists(persist_dir):
    shutil.rmtree(persist_dir)
os.makedirs(persist_dir, exist_ok=True)  # also covers the very first run

db.save_local(persist_dir)

# Chroma alternative, kept for reference:
#db = Chroma.from_documents(docs, embeddings, persist_directory=persist_dir)
#db.persist()

In [34]:
# Sanity check: size of the FAISS index; it should equal len(docs) (both show 649 in this run).
db.index.ntotal

649

In [35]:
# Ad-hoc retrieval check; swap the active question to try another one.
#query = 'Are dogs allowed on WizzAir?'
query = 'What is Booking.com cancellation policy?'

# Embed the question once so the *_by_vector search variants can reuse the vector.
query_embedded = embeddings.embed_query(query)

### As the chunk size is smaller, more relevant results can be included in the PROMPT (k = 5).

# Alternatives that also return a score (for FAISS with L2 distance, lower is better):
#sim_docs = db.similarity_search_with_score(query, k = 5) #score the lower the better for FAISS (L2)
#sim_docs = db.similarity_search_with_score_by_vector(query_embedded, k = 5)
sim_docs = db.max_marginal_relevance_search_by_vector(query_embedded, k = 5)

# Same calls without an explicit k.
# NOTE(review): the author observed that lower scores were better even for cosine similarity
# (possibly inverted for minimisation) - not verified here.
#sim_docs = db.similarity_search_with_score(query) # for some reason even for cos_sim the lower here the better (probably inverted for minimization)
#sim_docs = db.max_marginal_relevance_search_by_vector(query_embedded)

# Show only the first two hits to keep the output short.
sim_docs[:2]

[Document(page_content="Booking --- 3. Some Bookings can’t be canceled for free, while others can only be canceled for free before a deadline.\n\n4. If you book a Travel Experience by paying in advance (including all price components and/or a damage deposit if applicable), the Service Provider may cancel the Booking without notice if they can't collect the balance on the date specified. If they do, any non-refundable payment you’ve made will only be refunded at their discretion. It's your responsibility to make sure the payment goes through on time, that your bank, debit card, or credit card details are correct, and that there's enough money available in your account.\n\n5. If you think you won’t arrive on time, contact your Service Provider and tell them when they can expect you so they don't cancel your Booking. If you’re late, we are not liable for the consequences (e.g. the cancellation of your Booking or any fees the Service Provider may charge).\n\n6. As the person making the Boo

### Construct prompt with sources

#### ChatGPT code - packed into function

In [23]:
# Transcript shared by default across calls; re-run this cell to start a fresh one.
history = []


def _format_source(doc):
    """Render one retrieved chunk as '<file name>-page-<page>: <text on a single line>'."""
    # Sources may use "\" (Windows) or "/" (POSIX) separators; keep only the file name.
    file_name = doc.metadata['source'].replace("\\", "/").rsplit("/", 1)[-1]
    # Chunks without page information are labelled 'ALL' instead of raising KeyError.
    page = doc.metadata.get('page', 'ALL')
    # Line breaks become a single space so neighbouring words are not glued together.
    text = re.sub(r"[\r\n]+", " ", doc.page_content)
    return file_name + "-page-" + str(page) + ": " + text


def Ask_Your_Knowledge_Base(user_input = "What is RyanAir's cancellation policy?",
                            model = 'gpt-3.5-turbo', #can either be 3.5-turbo or 4
                            db = None,
                            embeddings = None,
                            history_list = None):
    """Answer one question from the indexed documents and print the transcript.

    The question is embedded, the 5 chunks returned by the max-marginal-relevance
    search are pasted as sources into the system message, and the chat completion
    is requested with temperature 0.0 and at most 512 tokens.

    This function does not have memory: follow-up capabilities are not implemented
    and every question is a 'new' context. ``history_list`` is only a transcript
    for display; earlier turns are never sent to the model.

    Parameters
    ----------
    user_input : str
        The question to ask.
    model : str
        Chat model name passed to ``openai.ChatCompletion.create``.
    db : vector store, optional
        Object exposing ``max_marginal_relevance_search_by_vector``. When omitted,
        the notebook-level ``db`` is looked up at call time, so a rebuilt index is
        picked up without re-defining this function.
    embeddings : embedder, optional
        Object exposing ``embed_query``. Defaults to the notebook-level
        ``embeddings``, resolved at call time.
    history_list : list of str, optional
        Transcript the new question/answer pair is appended to. Defaults to the
        notebook-level ``history`` list, resolved at call time.

    Returns
    -------
    None
        The full transcript is printed instead of returned.
    """

    # Resolve omitted collaborators lazily; binding them as default values would
    # freeze whatever objects existed when this cell was last executed.
    notebook_globals = globals()
    if db is None:
        db = notebook_globals['db']
    if embeddings is None:
        embeddings = notebook_globals['embeddings']
    if history_list is None:
        history_list = notebook_globals['history']


    #### PROMPT PARTS ####

    main_system_message = """
    Assistant helps people answer questions about a wide variety of documents. 
    Answer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question. 
    Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, e.g. [info1.txt]. Don't combine sources, list each source separately, e.g. [info1.txt][info2.pdf].

    Sources:
    {sources}

    """

    turn_prefix = """

    user:
    """

    turn_suffix = """

    assistant:
    """


    #### PROCESS QUERY; RUN SIMILARITY SEARCH ####

    query_embedded = embeddings.embed_query(user_input)

    print("Searching:", user_input)
    print("-------------------")

    sim_docs = db.max_marginal_relevance_search_by_vector(query_embedded, k = 5)
    content = "\n".join(_format_source(doc) for doc in sim_docs)


    #### CRAFT MESSAGE TO CHATCOMPLETION ####

    message = [{"role": "system", "content": main_system_message.format(sources=content)},
               {'role' : 'user', 'content' : turn_prefix + user_input + turn_suffix}]


    #### CALL CHATGPT API ####

    completion = openai.ChatCompletion.create(
        model=model,
        messages=message,
        temperature=0.0,
        max_tokens=512,)
    answer = completion.choices[0]['message']['content']


    #### APPEND TO HISTORY ####

    history_list.append("user: " + user_input)
    history_list.append("assistant: " + answer)

    print("\n-------------------\n".join(history_list))

In [24]:
# No arguments: asks the function's default question (RyanAir's cancellation policy).
Ask_Your_Knowledge_Base()

Searching: What is RyanAir's cancellation policy?
-------------------
user: What is RyanAir's cancellation policy?
-------------------
assistant: According to the General Terms & Conditions of Carriage listed on Ryanair's website [ryanair_conditions.pdf-page-16], if Ryanair cancels a flight, fails to operate the flight according to schedule, or cancels the route, passengers may be entitled to the rights set out in the Montreal Convention 1999 or the applicable Passenger Rights Regulations. If a flight is cancelled or delayed by two hours or more, Ryanair will give passengers information on their rights, including their rights to compensation and assistance. If a passenger cancels their booking up to 14 days prior to the scheduled time of departure of their flight, they will be entitled to a refund of the total fare after deduction of the cancellation fee. If a passenger cancels their booking within 14 days prior to the scheduled time of departure of their flight, they will be refunded 

In [25]:
# The function prints the whole accumulated history, so this question's answer is the last entry of the output.
Ask_Your_Knowledge_Base('Are dogs allowed on WizzAir?')

Searching: Are dogs allowed on WizzAir?
-------------------
user: What is RyanAir's cancellation policy?
-------------------
assistant: According to the General Terms & Conditions of Carriage listed on Ryanair's website [ryanair_conditions.pdf-page-16], if Ryanair cancels a flight, fails to operate the flight according to schedule, or cancels the route, passengers may be entitled to the rights set out in the Montreal Convention 1999 or the applicable Passenger Rights Regulations. If a flight is cancelled or delayed by two hours or more, Ryanair will give passengers information on their rights, including their rights to compensation and assistance. If a passenger cancels their booking up to 14 days prior to the scheduled time of departure of their flight, they will be entitled to a refund of the total fare after deduction of the cancellation fee. If a passenger cancels their booking within 14 days prior to the scheduled time of departure of their flight, they will be refunded the amount

In [26]:
# Third question appended to the same shared history; earlier turns are reprinted first.
Ask_Your_Knowledge_Base("What is Easy Jet's refund policy?")

Searching: What is Easy Jet's refund policy?
-------------------
user: What is RyanAir's cancellation policy?
-------------------
assistant: According to the General Terms & Conditions of Carriage listed on Ryanair's website [ryanair_conditions.pdf-page-16], if Ryanair cancels a flight, fails to operate the flight according to schedule, or cancels the route, passengers may be entitled to the rights set out in the Montreal Convention 1999 or the applicable Passenger Rights Regulations. If a flight is cancelled or delayed by two hours or more, Ryanair will give passengers information on their rights, including their rights to compensation and assistance. If a passenger cancels their booking up to 14 days prior to the scheduled time of departure of their flight, they will be entitled to a refund of the total fare after deduction of the cancellation fee. If a passenger cancels their booking within 14 days prior to the scheduled time of departure of their flight, they will be refunded the a

In [27]:
# Another question on the shared history; only the final entry of the printed output is new.
Ask_Your_Knowledge_Base('Can I carry a gun on RyanAir?')

Searching: Can I carry a gun on RyanAir?
-------------------
user: What is RyanAir's cancellation policy?
-------------------
assistant: According to the General Terms & Conditions of Carriage listed on Ryanair's website [ryanair_conditions.pdf-page-16], if Ryanair cancels a flight, fails to operate the flight according to schedule, or cancels the route, passengers may be entitled to the rights set out in the Montreal Convention 1999 or the applicable Passenger Rights Regulations. If a flight is cancelled or delayed by two hours or more, Ryanair will give passengers information on their rights, including their rights to compensation and assistance. If a passenger cancels their booking up to 14 days prior to the scheduled time of departure of their flight, they will be entitled to a refund of the total fare after deduction of the cancellation fee. If a passenger cancels their booking within 14 days prior to the scheduled time of departure of their flight, they will be refunded the amoun

In [36]:
# Question aimed at a different document (AirBNB); the printed output still starts with the earlier turns.
Ask_Your_Knowledge_Base('How far in advance can I cancel on AirBNB?')

Searching: How far in advance can I cancel on AirBNB?
-------------------
user: What is RyanAir's cancellation policy?
-------------------
assistant: According to the General Terms & Conditions of Carriage listed on Ryanair's website [ryanair_conditions.pdf-page-16], if Ryanair cancels a flight, fails to operate the flight according to schedule, or cancels the route, passengers may be entitled to the rights set out in the Montreal Convention 1999 or the applicable Passenger Rights Regulations. If a flight is cancelled or delayed by two hours or more, Ryanair will give passengers information on their rights, including their rights to compensation and assistance. If a passenger cancels their booking up to 14 days prior to the scheduled time of departure of their flight, they will be entitled to a refund of the total fare after deduction of the cancellation fee. If a passenger cancels their booking within 14 days prior to the scheduled time of departure of their flight, they will be refun

In [37]:
# Question about Booking.com refunds; as before, the answer is the last entry of the printed history.
Ask_Your_Knowledge_Base('If I cannot travel, does Booking.com offer refunds?')

Searching: If I cannot travel, does Booking.com offer refunds?
-------------------
user: What is RyanAir's cancellation policy?
-------------------
assistant: According to the General Terms & Conditions of Carriage listed on Ryanair's website [ryanair_conditions.pdf-page-16], if Ryanair cancels a flight, fails to operate the flight according to schedule, or cancels the route, passengers may be entitled to the rights set out in the Montreal Convention 1999 or the applicable Passenger Rights Regulations. If a flight is cancelled or delayed by two hours or more, Ryanair will give passengers information on their rights, including their rights to compensation and assistance. If a passenger cancels their booking up to 14 days prior to the scheduled time of departure of their flight, they will be entitled to a refund of the total fare after deduction of the cancellation fee. If a passenger cancels their booking within 14 days prior to the scheduled time of departure of their flight, they will

In [141]:
#print('\n'.join([i.page_content for i in docs if 'easyjet_conditions.pdf' in i.metadata['source'] and i.metadata['page'] in ([5, 6])]))

#### Old - davinci code

In [77]:
# Prompt building blocks for the completion-style (davinci) variant below.
# Turns are delimited with <|im_start|> / <|im_end|> markers, which are also passed as stop sequences.
# NOTE(review): these markers look like ChatML delimiters - confirm text-davinci-003 treats them
# as special tokens rather than plain text.

# System part of the prompt; {sources} is filled with the retrieved chunks.
prompt_prefix = """<|im_start|>system
Assistant helps people answer questions about a wide variety of documents. 
Answer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question. 
Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, e.g. [info1.txt]. Don't combine sources, list each source separately, e.g. [info1.txt][info2.pdf].

Sources:
{sources}

<|im_end|>"""

# Opens a user turn.
turn_prefix = """
<|im_start|>user
"""

# Closes the user turn and opens the assistant turn.
turn_suffix = """
<|im_end|>
<|im_start|>assistant
"""

# Running conversation text; the execution cell appends every completed turn to it.
prompt_history = turn_prefix

# Transcript used for printing. This rebinds the same `history` name that the
# ChatGPT section of this notebook defines.
history = []

# Template for turning the conversation so far plus a new question into a search query.
# It is only referenced from the commented-out block in the execution cell.
summary_prompt_template = """Below is a summary of the conversation so far, and a new question asked by the user that needs to be answered by searching in a knowledge base. 
Generate a search query based on the conversation and the new question. 
Do not include source names, file names in the search query.
Pay more attention to the new question, rather than the summary.

Summary:
{summary}

Question:
{question}

Search query:
"""

In [80]:
# Execute this cell multiple times, changing user_input, to accumulate chat history.
# Only one question is active per run; the others are kept as ready-made examples.
#user_input = "What is RyanAir's cancellation policy?"
#user_input = "What is WizzAir's cancellation policy?"
user_input = "What is EasyJet's cancellation policy?"

# Embed the user input for the relevancy search.
query_embedded = embeddings.embed_query(user_input)


# Disabled: derive the search query from the conversation summary instead of
# using the raw question.
# Exclude category, to simulate scenarios where there's a set of docs you can't see
#exclude_category = None

#if len(history) > 0:
#    completion = openai.Completion.create(
#        engine='text-davinci-003',
#        prompt=summary_prompt_template.format(summary="\n".join(history), question=user_input),
#        temperature=0.7,
#        max_tokens=32,
#        stop=["\n"])
#    search = completion.choices[0].text
#else:
#    search = user_input

search = user_input

# Alternatively simply use search_client.search(q, top=3) if not using semantic search
print("Searching:", search)
print("-------------------")


sim_docs = db.max_marginal_relevance_search_by_vector(query_embedded, k = 3)

# One "<file name>-page-<page>: <text>" line per retrieved chunk.
results = []
for doc in sim_docs:
    # Sources may use "\" (Windows) or "/" (POSIX) separators; keep only the file name.
    file_name = doc.metadata['source'].replace("\\", "/").rsplit("/", 1)[-1]
    # Chunks without page information are labelled 'ALL' instead of raising KeyError.
    page = doc.metadata.get('page', 'ALL')
    # Line breaks become a single space so neighbouring words are not glued together.
    text = re.sub(r"[\r\n]+", " ", doc.page_content)
    results.append(file_name + "-page-" + str(page) + ": " + text)
content = "\n".join(results)

prompt = prompt_prefix.format(sources=content) + prompt_history + user_input + turn_suffix

completion = openai.Completion.create(
    engine='text-davinci-003',
    prompt=prompt,
    temperature=0.0,
    max_tokens=1024,
    stop=["<|im_end|>", "<|im_start|>"])

# Record the finished turn so the next run of this cell sends it as context.
prompt_history += user_input + turn_suffix + completion.choices[0].text + "\n<|im_end|>" + turn_prefix
history.append("user: " + user_input)
history.append("assistant: " + completion.choices[0].text)

print("\n-------------------\n".join(history))
print("\n-------------------\nPrompt:\n" + prompt)

Searching: What is EasyJet's cancellation policy?
-------------------
user: What is RyanAir's cancellation policy?
-------------------
assistant: According to [ryanair_conditions.pdf-page-16], if your flight is cancelled or delayed by two hours or more, RyanAir will give you information on your rights, including your rights to compensation and assistance. If we do not let you board the plane for a flight you have booked (as set out in the booking confirmation or itinerary) because a space is no longer available, we will compensate you in line with any relevant law that applies. We will give you information about your rights in these circumstances, particularly your rights relating to practical help and compensation. If an immediate family member who is not travelling with you dies within 28 days of your booked flight, you may claim a refund equal to the fare you paid for the particular flight (or flights) you don't take as a result, plus any associated taxes, fees and charges that you 

### LlamaIndex on documents

In [None]:
# Load every document from the same folder used for the FAISS index above
# and build a LlamaIndex vector store index from them.

documents = SimpleDirectoryReader('../docs/docs_to_index/').load_data()
index = VectorStoreIndex.from_documents(documents)

In [None]:
# Persist the index to disk.
# NOTE(review): this is the same directory the FAISS cell uses as persist_dir, and that cell
# removes the directory when it exists - confirm sharing it is intended.
index.storage_context.persist(persist_dir="../docs/docs_to_index_vectorDB/")

# To restore later, rebuild the storage context (replace "<persist_dir>" with the real path) ...
#storage_context = StorageContext.from_defaults(persist_dir="<persist_dir>")

# ... and load the index from it.
#index = load_index_from_storage(storage_context)

In [None]:
# Inspect how the documents are split into nodes.
# `nodes` is only for inspection; no other visible cell reads it.

parser = SimpleNodeParser()
nodes = parser.get_nodes_from_documents(documents)

In [None]:
# Query the index through a query engine created with default settings (no arguments passed).

query_engine = index.as_query_engine()
response = query_engine.query("Who are company representatives?")

In [None]:
# Metadata attached to the response; in this run it maps ids to page_label / file_name.
response.extra_info

{'e8e24678-1ea5-45e7-a108-2b822e220ad2': {'page_label': '7',
  'file_name': 'Law_Insider_eidos-therapeutics-inc_contract_Filed_23-03-2018_Contract.pdf'},
 'a7c216ee-a8ce-421f-8464-9d2d181b4045': {'page_label': '8',
  'file_name': 'Law_Insider_eidos-therapeutics-inc_contract_Filed_23-03-2018_Contract.pdf'}}

In [None]:
# Human-readable listing of the sources used for the answer.
print(response.get_formatted_sources())

> Source (Doc id: e8e24678-1ea5-45e7-a108-2b822e220ad2): page_label: 7
file_name: Law_Insider_eidos-therapeutics-inc_contract_Filed_23-03-2018_Contract.pd...

> Source (Doc id: a7c216ee-a8ce-421f-8464-9d2d181b4045): page_label: 8
file_name: Law_Insider_eidos-therapeutics-inc_contract_Filed_23-03-2018_Contract.pd...


In [None]:
# Score of each source node behind the response.
[i.score for i in response.source_nodes]

[0.7528456148084678, 0.7514728258301441]

In [None]:
# Print only the answer text of the response.
print(response.response)


Company representatives are Christine Siu, Chief Financial Officer of Eidos Therapeutics, Inc., and Neil Kumar, Chief Executive Officer of BridgeBio Pharma LLC.


Low level API

In [None]:
# Response synthesizer: retrieved nodes are post-processed with a similarity cutoff of 0.6.
response_synthesizer = ResponseSynthesizer.from_args(
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.6)],
)

# Retriever: top 3 matches from the index; 'embedding' is noted as the default mode.
retriever = VectorIndexRetriever(index=index, similarity_top_k=3, retriever_mode='embedding')

# Wire both pieces into a query engine, ask the same question as before and show the answer.
query_engine = RetrieverQueryEngine(retriever=retriever, response_synthesizer=response_synthesizer)
response = query_engine.query("Who are company representatives?")
print(response)


Company representatives are Christine Siu (Chief Financial Officer) and Neil Kumar (Chief Executive Officer).


In [None]:
# Scores of the source nodes for the low-level query (three values, matching similarity_top_k=3).
[i.score for i in response.source_nodes]

[0.7528456148084678, 0.7514728258301441, 0.7402034007180792]

In [None]:
# Same retriever configuration as in the low-level API cell above, re-created on its own.
retriever = VectorIndexRetriever(
    index=index, 
    similarity_top_k=3,
    retriever_mode = 'embedding' # default
)