## Communicating with own knowledge base using `OpenAI`

In [1]:
import credentials
import time
import re
from copy import deepcopy
import shutil
import glob
import os
os.environ["OPENAI_API_KEY"] = credentials.openai_api

import openai
import tiktoken
encoding = tiktoken.get_encoding("cl100k_base")

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS, Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
from langchain.document_loaders import PyPDFLoader, UnstructuredHTMLLoader

from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document

### VectorDB from documents

In [2]:
path = '../docs/'
docs = []

#### USING PDF DIR LOADER - CANNOT SET SPLITTING ####
#loader = PyPDFDirectoryLoader(path)
#docs = loader.load()


#### USING SIMPLE PDF LOADER IN LOOP - SETTING SPLITTING ####
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1500, chunk_overlap = 200)
list_of_pdf_files = glob.glob(path + "*.pdf")

for pdf_file in list_of_pdf_files:
    pdf_loaded = PyPDFLoader(pdf_file).load_and_split(text_splitter = text_splitter)
    docs.extend(pdf_loaded)


#### LOAD & SPLIT HTML CONTENT ####
html_splitter = RecursiveCharacterTextSplitter.from_language(language=Language.MARKDOWN, chunk_size=1500, chunk_overlap=200)
list_of_html_files = glob.glob(path + "*.html")

for html_file in list_of_html_files:
    html_loaded = UnstructuredHTMLLoader(html_file).load_and_split(text_splitter = html_splitter)
    docs.extend(html_loaded)


In [3]:
len(docs)

649

In [4]:
docs_original = deepcopy(docs)

In [5]:
# trick to help with context
# add to each page the document name

for i in docs:
    i.page_content = i.metadata['source'].split("\\")[-1].split('.')[0] + ' --- ' + i.page_content

In [6]:
### to handle heterogenity of datasources, need to append page information to non-PDF and doc files

for i in docs:
    if 'page' not in i.metadata.keys():
        i.metadata['page'] = 'ALL'

Apply OpenAI Embedder

In [140]:
embeddings = OpenAIEmbeddings()
persist_dir = "../docs/persist_dir/"
if os.path.exists(persist_dir): 
    shutil.rmtree(persist_dir)
    os.makedirs(persist_dir)

db = FAISS.from_documents(docs, embeddings)
db.save_local(persist_dir)

#db = Chroma.from_documents(docs, embeddings, persist_directory=persist_dir)
#db.persist()

In [141]:
db.index.ntotal

649

In [142]:
#query = 'Are dogs allowed on WizzAir?'
query = 'What is Booking.com cancellation policy?'

query_embedded = embeddings.embed_query(query)

### as chuck size is smaller, we can include more relevant results in the PROMPT

#sim_docs = db.similarity_search_with_score(query, k = 5) #score the lower the better for FAISS (L2)
#sim_docs = db.similarity_search_with_score_by_vector(query_embedded, k = 5)
sim_docs = db.max_marginal_relevance_search_by_vector(query_embedded, k = 5)

#sim_docs = db.similarity_search_with_score(query) # for some reason even for cos_sim the lower here the better (probably inverted for minimization)
#sim_docs = db.max_marginal_relevance_search_by_vector(query_embedded)

sim_docs[:2]

[Document(page_content="Booking --- 3. Some Bookings can’t be canceled for free, while others can only be canceled for free before a deadline.\n\n4. If you book a Travel Experience by paying in advance (including all price components and/or a damage deposit if applicable), the Service Provider may cancel the Booking without notice if they can't collect the balance on the date specified. If they do, any non-refundable payment you’ve made will only be refunded at their discretion. It's your responsibility to make sure the payment goes through on time, that your bank, debit card, or credit card details are correct, and that there's enough money available in your account.\n\n5. If you think you won’t arrive on time, contact your Service Provider and tell them when they can expect you so they don't cancel your Booking. If you’re late, we are not liable for the consequences (e.g. the cancellation of your Booking or any fees the Service Provider may charge).\n\n6. As the person making the Boo

### Construct prompt with sources

In [7]:
def ChatOAI(user_input = "Add two numbers and return their 4th log",
            model = 'gpt-3.5-turbo'):
    
    PROMPT = """
    Act as en experienced senior Python software engineer and data scientist.
    Your job is to write a function given a user's instruction.
    Include basic documentation and comments to explain the steps.
    If needed, provide parameter type and range expectations.
    Your function should look like it was written by a senior data scientist.

    Instruction: {instruction}

    Answer:
    
    """
    
    message = [{'role' : 'user', 'content' : PROMPT.format(instruction = user_input)}]
    
    completion = openai.ChatCompletion.create(
        model=model, 
        messages=message, 
        temperature=0.0)
    
    return completion.choices[0]['message']['content']

In [10]:
instruction = """
Write a function that checks a string, and until it starts with a zero, it removes 1 character from the beginning and the end until the starting character is not zero
"""

completion = ChatOAI(user_input=instruction, model = 'gpt-4')

print(completion)

```python
def trim_string_until_zero(s: str) -> str:
    """
    This function checks a string, and until it starts with a zero, 
    it removes 1 character from the beginning and the end until the starting character is not zero.

    Parameters:
    s (str): The input string. It is expected to be a non-empty string.

    Returns:
    str: The trimmed string that starts with a zero.

    """
    
    # Check if the input string is empty
    if not s:
        raise ValueError("Input string cannot be empty")
    
    # While the first character of the string is not '0'
    while s and s[0] != '0':
        # Remove the first and the last character from the string
        s = s[1:-1]
        
    return s
```

This function uses Python's string slicing feature to remove the first and the last character from the string. The while loop continues until the first character of the string is '0' or the string is empty. If the string becomes empty during the process, it means that there was no '0

#### ChatGPT code - packed into function

In [7]:
#history = []

def Ask_Your_Knowledge_Base(user_input = "What is RyanAir's cancellation policy?",
                            model = 'gpt-3.5-turbo',
                            db = db,
                            embeddings = embeddings):
    
    # history_list = history
    ### This function does not have memory, meaning follow-up capabilities are not yet implemented
    ### Every question is a 'new' context


    #### PROMPT PARTS ####

    PROMPT = """
    You are an intelligent assistant helping humans with their questions related to a wide variety of documents. 
    Answer ONLY with the facts listed in the list of sources below. 
    If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. 
    If asking a clarifying question to the user would help, ask the question. 
    Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. 
    Use square brackets to reference the source, e.g. [info1.txt]. Don't combine sources, list each source separately, e.g. [info1.txt][info2.pdf].

    Sources:
    {sources}

    Question: {question}

    Answer:
    
    """


    #### PROCESS QUERY; RUN SIMILARITY SEARCH ####

    query_embedded = embeddings.embed_query(user_input)

    #search = user_input
    #print("Searching:", search)
    #print("-------------------")

    sim_docs = db.max_marginal_relevance_search_by_vector(query_embedded, k = 5)
    results = [doc.metadata['source'].split("\\")[-1] + "-page-" + str(doc.metadata['page'] )+ ": " + doc.page_content.replace("\n", "").replace("\r", "") for doc in sim_docs]
    content = "\n".join(results)


    #### CRAFT MESSAGE TO CHATCOMPLETION ####

    message = [{'role' : 'user', 'content' : PROMPT.format(sources = content, question = user_input)}]
    

    #### CALL CHATGPT API ####

    completion = openai.ChatCompletion.create(
        model=model, 
        messages=message, 
        temperature=0.0, 
        max_tokens=512,)
    

    #### APPEND TO HISTORY ####
    #history_list.append("Human: " + user_input)
    #history_list.append("AI: " + completion.choices[0]['message']['content'])
    #print("\n-------------------\n".join(history_list))  
    #  
    return completion.choices[0]['message']['content']

NameError: name 'db' is not defined

In [162]:
Ask_Your_Knowledge_Base()

"If RyanAir cancels a flight, fails to operate the flight according to schedule, or cancels the route, passengers may be entitled to rights set out in the Montreal Convention 1999 or the applicable Passenger Rights Regulations. If a flight is cancelled or delayed by two hours or more, RyanAir will provide information on passengers' rights to compensation and assistance. If a passenger cancels their booking up to 14 days prior to the scheduled time of departure of their flight, they will be entitled to a refund of the total fare after deduction of the cancellation fee. If a passenger cancels their booking within 14 days prior to the scheduled time of departure of their flight, they will be refunded the amount of the total fare after deduction of the fee for other services and the seat protection fee. [ryanair_conditions.pdf-page-16][ryanair_conditions.pdf-page-114][wizzair_conditions.pdf-page-18]"

In [15]:
Ask_Your_Knowledge_Base('Are dogs allowed on WizzAir?')

Recognised Assistance Dogs are allowed on WizzAir flights with no additional fee, subject to a limit of one dog per passenger/flight. However, therapy dogs or any emotional support animals are not allowed. The passenger needs to inform WizzAir at least 48 hours before the scheduled departure and provide official documentation confirming that the dog meets the requirements of the Hungarian SZMM decree of 27/2009 and EU Pet Travel Scheme Regulation and the vaccination and treatment requirements of the country they are traveling to. [wizzair_conditions.pdf-page-22][wizzair_conditions.pdf-page-29][wizzair_conditions.pdf-page-37]


In [16]:
Ask_Your_Knowledge_Base("What is Easy Jet's refund policy?")

If you cancel, miss or do not take your flight, this does not affect the other flights on your Booking. You can claim a full refund of Government Tax (as listed in Fees and Charges) for the flight(s) you do not take. You can do this by contacting their Customer Services Team. They will only pay refunds to the Booker using the original payment method or original easyJet account, as applicable. If the original payment method or account is no longer available, they will refund to an alternative payment method in the Booker’s name. If you have booked through an Agent, they will be responsible for passing on any refund to you. They are not responsible for any fees the Agent may apply for processing the refund. Refunds will normally be made in the original currency in which the Booking was made. [easyjet_conditions.pdf-page-6]


In [17]:
Ask_Your_Knowledge_Base('Can I carry a gun on RyanAir?')

No, you cannot carry a gun on Ryanair. Guns are not listed as an item that can be carried on board [ryanair_conditions.pdf-page-11][ryanair_conditions.pdf-page-13]. Additionally, items that could be used as a weapon, such as knives, are prohibited [wizzair_conditions.pdf-page-36].


In [18]:
Ask_Your_Knowledge_Base('How far in advance can I cancel on AirBNB?')

According to the Terms of Service on the Airbnb Help Center [Terms of Service - Airbnb Help Center.html-page-ALL], the cancellation policy that applies to a reservation depends on the situation. If something outside of your control requires you to cancel a reservation, you may be entitled to a partial or full refund under the Extenuating Circumstances Policy. If the host cancels or you experience a Travel Issue, you may be entitled to rebooking assistance or a partial or full refund under the Rebooking and Refund Policy. Different policies apply to certain categories of listings. It is recommended to check each Additional Legal Term or Policy for details about what is covered and what refund applies in each situation. There is no specific information about how far in advance you can cancel.


In [19]:
Ask_Your_Knowledge_Base('If I cannot travel, does Booking.com offer refunds?')

If you cannot travel and you have booked additional travel services for your trip or vacation via a link provided to you by Booking.com no later than 24 hours after receiving confirmation of your initial Booking with them, you will not benefit from rights applying to packages under the EU’s Directive (EU) 2015/2302 or the UK's Package Travel and Linked Travel Arrangements Regulations 2018. Therefore, Booking.com will not be responsible for the proper performance of those travel services. For any issues, contact the relevant Service Provider. If you want to apply for a refund, you must do so in writing no more than 14 days after your pick-up time. [Booking.com_ Terms and Conditions..html-page-ALL]


## Summarization prompts

In [8]:
glob.glob(path + "*.*")

['../docs\\Booking.com_ Terms and Conditions..html',
 '../docs\\easyjet_conditions.pdf',
 '../docs\\ryanair_conditions.pdf',
 '../docs\\Terms of Service - Airbnb Help Center.html',
 '../docs\\wizzair_conditions.pdf']

In [127]:
def Summarize_Short_Doc(input_doc = '',  model = 'gpt-3.5-turbo'):

    #### PROMPT PARTS ####

    PROMPT = """
    Given the below document, your job is to create a summary of the text.
    Do not add any of your internal knowledge to the context.
    Feel free to create longer summaries, including all relevant information found in the original text.

    Document: {document}

    Summary:
    
    """

    message = [{"role": "system", "content": "You are an intelligent assistant helping humans understand contents of long documents."},
               {'role' : 'user', 'content' : PROMPT.format(document = input_doc)}]
    

    #### CALL CHATGPT API ####

    completion = openai.ChatCompletion.create(
        model=model, 
        messages=message, 
        temperature=0.0, 
        max_tokens=1024,)
    
    return completion.choices[0]['message']['content']
    
    
    
def Summarize_Given_Document(input_doc = '',
                             model = 'gpt-3.5-turbo',
                             temperature = 0.0,
                             docs_original = docs_original,
                             tiktoken_encoding = encoding):
    
    llm_summ = ChatOpenAI(model_name=model, temperature=temperature)
    chain = load_summarize_chain(llm_summ, chain_type="map_reduce")

    if input_doc not in glob.glob(path + "*.*"):
        print('Document not found - choose existing document')
        return

    WHOLE_DOC = ' '.join([i.page_content for i in docs_original if i.metadata['source'] == input_doc])
    input_tokens = tiktoken_encoding.encode(WHOLE_DOC)

    print('Given', str(len(input_tokens)), 'number of input tokens we should expect a summary in', str(round(len(input_tokens) / 1000 * 4.2 / 60, 2)), 'minutes')
    print('Approximate cost:', str(round(0.0015 * len(input_tokens) / 1000, 5)), 'USD')

    want_to_continue = input('Do you want to proceed? (y | n)', )
    if want_to_continue == 'n':
        print('Stopping before calling OpenAI API')
        return

    if len(input_tokens) <= 2700:
        summary = Summarize_Short_Doc(input_doc)

    else:
        text_splitter_for_summ = RecursiveCharacterTextSplitter(chunk_size = 7500, chunk_overlap = 200)
        texts_for_summ = text_splitter_for_summ.split_text(WHOLE_DOC)
        docs_for_summ = [Document(page_content=t) for t in texts_for_summ]
        #summary = chain.run(docs_for_summ)
        summary = ' '

        return summary

In [130]:
easyjet_summary = Summarize_Given_Document(input_doc=r"../docs\easyjet_conditions.pdf")

Given 25469 number of input tokens we should expect a summary in 1.78 minutes
Approximate cost: 0.0382 USD


In [126]:
easyjet_summary

"EasyJet's terms and conditions cover bookings, cancellations, refunds, changes, baggage, conduct, liability, and claims. Passengers must comply with policies and regulations, and EasyJet reserves the right to refuse carriage or take action to protect their systems. Passengers are responsible for their children and must provide accurate information. EasyJet is not liable for missed flights or costs incurred, except as set out in section 5.4. Passengers must check-in online and arrive at the airport in advance. EasyJet's liability is governed by the rules and limitations under the Convention, unless such carriage is not international carriage to which the Convention applies, in which case any other Applicable Law shall apply. Passengers must have adequate travel insurance and claims must be made in writing with supporting documentation. EasyJet controls the processing of personal data and is governed by the laws of England and Wales."

In [132]:
_ = Summarize_Given_Document(input_doc=r"../docs\ryanair_conditions.pdf")

Given 87948 number of input tokens we should expect a summary in 6.16 minutes
Approximate cost: 0.13192 USD


Stopping before calling OpenAI API


### Test reading in files for `streamlit`

In [2]:
from io import BytesIO
from pypdf import PdfReader

In [35]:
f = '../docs/easyjet_conditions.pdf'
ff = PdfReader(f)

In [3]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1500, chunk_overlap = 200)

In [39]:
def load_pdf(pdf_as_bytes, splitter = text_splitter):

    text = ''
    DOCS = []

    for pagenum, page in enumerate(pdf_as_bytes.pages):

        page_text = page.extract_text()

        text += page_text

        text_splitted = splitter.split_text(page_text)
        docs = [Document(page_content=t, metadata={'source' : 'f', 'page' : str(pagenum+1)}) for t in text_splitted]
        DOCS.append(docs)

    DOCS = [item for sublist in DOCS for item in sublist]

    return DOCS, text

In [40]:
DOCS, whole_text = load_pdf(ff)

In [4]:
import docx2txt

In [5]:
def load_docx(file, splitter = text_splitter, filename = 'txt'):

    DOCS = []

    text = docx2txt.process(file) 
    text = re.sub(r"\n\s*\n", "\n\n", text)

    text_splitted = splitter.split_text(text)
    docs = [Document(page_content=t, metadata={'source' : filename, 'page' : 'all'}) for t in text_splitted]
    DOCS.append(docs)

    DOCS = [item for sublist in DOCS for item in sublist]

    return DOCS

In [6]:
path = '../../openAI_experiments/docs/FakeRewrittenPressRelease.docx'

In [7]:
i = load_docx(path)

In [9]:
i=docx2txt.process(path) 

In [10]:
docx2txt.process()

"June 8, 2023\n\n\nTOZO Introduces Zivpad 17 OLED: An AI-Powered Marvel Fueled by BNE Tiger G-Series Chips\n\nA 17-inch laptop designed for daily use, featuring striking visuals and seamless performance, powered by up to four BNE Tiger 8000 G-Series AI-enabled processors.\n\nPowerhouse Performance: Four BNE Tiger™ 8000 G-Series AI-enabled processors, 32GB RAM, 1.5TB SSD, ColdKnife fan, dual-vent cooling, WiFi 6E\n\nExceptional Visuals: Pioneering 3.4K, 120Hz OLED Pantone® Validated 560-nit NanoEdge display with a complete DCI-P3 color gamut.\n\nErgonomic Design: Compact and lightweight; metallic finish; 180° lay-flat hinge; webcam shield; TOZO Ergo keyboard; TOZO Super Antimicrobial Guard\n\nEnhanced Conferencing: Superior video and audio via TOZO 3D Noise Reduction and TOZO AI-powered noise-canceling audio technology\n\nEco-friendly and Robust: 100% recyclable packaging; exceeds ENERGY STAR® requirements; US MIL-STD-810H military-grade toughness\n\nDenver, Colorado, June 8, 2023 —\xa0

In [53]:
from langchain.document_loaders import PyPDFLoader, UnstructuredHTMLLoader, UnstructuredFileLoader

In [24]:
from langchain.text_splitter import MarkdownTextSplitter

In [28]:
html_splitter = RecursiveCharacterTextSplitter.from_language(language=Language.MARKDOWN, chunk_size=1500, chunk_overlap=200)
markdown_splitter = MarkdownTextSplitter()

In [29]:
def load_HTML(file, splitter = html_splitter, filename = 'html'):

    DOCS = []

    text = open(file, 'r', encoding = 'utf-8').read()
    text = re.sub(r"\n\s*\n", "\n\n", text)

    text_splitted = splitter.split_text(text)
    docs = [Document(page_content=t, metadata={'source' : filename, 'page' : 'all'}) for t in text_splitted]
    DOCS.append(docs)

    DOCS = [item for sublist in DOCS for item in sublist]

    return DOCS

In [30]:
path = '../../openAI_experiments/docs/docs_to_index/Booking.com_ Terms and Conditions..html'

In [33]:
text = open(path, 'r', encoding = 'utf-8').read()

In [64]:
from bs4 import BeautifulSoup

In [65]:
def clean_HTML(html):

    soup = BeautifulSoup(html, features="html.parser")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    print(text)

In [None]:
clean_HTML(text)

In [33]:
PRICING = {'gpt-3.5-turbo' : {'input_tokens' : 0.0015, 'output_tokens' : 0.002},
           'gpt-3.5-turbo-16k' : {'input_tokens' : 0.003, 'output_tokens' : 0.004},
           'gpt-4' : {'input_tokens' : 0.03, 'output_tokens' : 0.06},}

NUM_OF_FILES = 100
PAGES_PER_FILE = 10

WORDS_PER_PAGE = 500
TOKEN_TO_WORDS = 0.75

AVG_PROMPT_MESSAGE = 250
AVG_COMPLETION_LENGTH = 300

TOTAL_INPUT_TOKENS = NUM_OF_FILES * PAGES_PER_FILE * (WORDS_PER_PAGE / TOKEN_TO_WORDS) + AVG_PROMPT_MESSAGE
TOTAL_OUTPUT_TOKENS = AVG_COMPLETION_LENGTH

In [5]:
print('Total number input words:', NUM_OF_FILES * PAGES_PER_FILE * WORDS_PER_PAGE)

Total number input words: 500000


In [18]:
plot_dict = {'Models': PRICING.keys(), 
             'Input price ($)': [round(i['input_tokens'] * TOTAL_INPUT_TOKENS / 1000, 4) for i in PRICING.values()], 
             'Output price ($)': [round(i['output_tokens'] * TOTAL_OUTPUT_TOKENS / 1000, 4) for i in PRICING.values()]}

In [19]:
import plotly.express as px

In [32]:
fig = px.bar(plot_dict, y="Models", 
             x=["Input price ($)", "Output price ($)"], 
             title="Expected cost ($) associated with `OpenAI` models", 
             text_auto=True, log_x=True, template = 'plotly_white',
             height = 400, width = 800, labels = {'value' : 'Price in USD'})
fig.update_layout(legend_title_text='Token category', 
                  legend = {'orientation' : 'h', 'xanchor' : 'right', 'yanchor' : 'bottom', 'y' : -0.5, 'x' : 1})
fig.show()