## Build Helper Functions

In [18]:
import os
import pinecone

#vector stores
from langchain.vectorstores import Pinecone

#Embeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Document Loaders
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader, UnstructuredMarkdownLoader, WikipediaLoader

# Text Splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# LLMs, Memory & Chains
from langchain.chat_models import ChatOpenAI
from langchain.llms import HuggingFaceHub
from langchain.memory import ConversationBufferMemory
from langchain.chains import RetrievalQA
from langchain.chains import ConversationalRetrievalChain


### 1. Load env files

In [24]:
# `os` is used further down to read configuration values through os.environ.
import os
from dotenv import load_dotenv, find_dotenv

# Load variables from the .env file located by find_dotenv(); override=True
# lets values from the file replace variables already set in the environment.
load_dotenv(find_dotenv(), override=True)

True

### 2. Load Documents (from a single file or from a directory)

#### 2.1 Load from File
##### Supported formats : .pdf, .docx, .txt, .md

In [3]:
def load_document(file_path):
    """Load a single file into a list of documents.

    The loader is chosen from the file extension. The comparison is
    case-insensitive, so names such as ``REPORT.PDF`` or ``notes.Txt`` are
    handled the same way as their lower-case equivalents.

    Args:
        file_path: Path of the file to read.

    Returns:
        The result of the selected loader's ``load()`` call, or an empty list
        when the extension is not one of .pdf, .docx, .txt, .md.
    """
    _, extension = os.path.splitext(file_path)
    # Normalise case: extensions on disk are often upper- or mixed-case.
    extension = extension.lower()

    if extension == '.pdf':
        loader = PyPDFLoader(file_path)
    elif extension == '.docx':
        loader = Docx2txtLoader(file_path)
    elif extension == '.txt':
        loader = TextLoader(file_path)
    elif extension == '.md':
        loader = UnstructuredMarkdownLoader(file_path)
    else:
        # Unsupported files are reported and skipped rather than raising, so a
        # directory scan can continue past them.
        print(f"Unsupported File Format {file_path}. Supported formats: .pdf, .docx, .txt, .md")
        return []

    print(f"Reading {file_path}")
    return loader.load()

#### 2.2 Load from a directory
##### Loads all files from a directory. Supported formats : .pdf, .docx, .txt, .md

In [4]:
##### function that either takes a file path & reads the text from the file 
##### or a folder path & reads the text from each file 
def load_from(path, nested=False):
    """Load documents from a single file or from every file in a folder.

    Args:
        path: File or directory path.
        nested: When True, sub-directories are searched recursively at every
            depth. When False, only files directly inside ``path`` are read.

    Returns:
        A list of loaded documents. The list is empty when ``path`` is neither
        a file nor a directory, or when nothing loadable was found.
    """
    # A file path is delegated straight to the single-file loader.
    if os.path.isfile(path):
        return load_document(path)

    # Neither a file nor a directory: report it and return an empty list so
    # callers always receive a list (never None).
    if not os.path.isdir(path):
        print(f"Path not found: {path}")
        return []

    print(f"Reading from folder {path}")
    loaded_docs = []
    # sorted() gives a reproducible load order; os.listdir() order is arbitrary.
    for entry in sorted(os.listdir(path)):
        item_path = os.path.join(path, entry)
        if os.path.isfile(item_path):
            loaded_docs += load_document(item_path)
        elif nested and os.path.isdir(item_path):
            # Pass `nested` along so recursion continues below the first level.
            loaded_docs += load_from(item_path, nested=nested)

    return loaded_docs

#### 2.3 Load from External sources
##### Supported Websites: Wikipedia

In [5]:
def load_from_wiki(query, load_max_docs=1, max_chars_per_doc=5000):
    """Load Wikipedia content for a search query via ``WikipediaLoader``.

    Args:
        query: Search text handed to the loader.
        load_max_docs: Forwarded as the loader's ``load_max_docs``.
        max_chars_per_doc: Forwarded as the loader's ``doc_content_chars_max``.

    Returns:
        The result of the loader's ``load()`` call.
    """
    return WikipediaLoader(
        query=query,
        load_max_docs=load_max_docs,
        doc_content_chars_max=max_chars_per_doc,
    ).load()


### 3. Make Chunks from Documents 

In [6]:
def chunk_data(data, chunk_size=400, chunk_overlap=80):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        data: Documents passed to the splitter's ``split_documents``.
        chunk_size: Forwarded as the splitter's ``chunk_size``.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(data)
    

### 4. Map chunks into embeddings & upload to Pinecone

In [29]:
def print_embedding_cost(texts):
    """Print the total token count of ``texts`` and a cost figure derived from it.

    Tokens are counted with the tiktoken encoding for
    ``text-embedding-ada-002``. Nothing is returned; both figures are printed.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
    """
    import tiktoken

    encoder = tiktoken.encoding_for_model('text-embedding-ada-002')
    token_count = 0
    for doc in texts:
        token_count += len(encoder.encode(doc.page_content))
    print(f'Total Tokens: {token_count}')
    # Rate applied below: 0.0004 USD per 1000 tokens.
    estimated_usd = token_count / 1000 * 0.0004
    print(f'Embedding Cost in USD: {estimated_usd:.6f}')

def insert_or_fetch_embeddings(index_name, chunks, embeddings_type='instruct'):
    """Return a Pinecone vector store for ``index_name``, creating it if absent.

    If the index is already listed by Pinecone it is opened as-is and
    ``chunks`` is not used. Otherwise the index is created and ``chunks`` are
    embedded and uploaded into it.

    Args:
        index_name: Name of the Pinecone index to open or create.
        chunks: Documents to embed when a new index has to be created.
        embeddings_type: ``'instruct'`` (hkunlp/instructor-xl, dimension 768)
            or ``'openai'`` (OpenAIEmbeddings, dimension 1536).

    Returns:
        The vector store, or None when ``embeddings_type`` is not recognised.
    """
    # Reject unsupported embedding backends before any model or network work.
    if embeddings_type not in ('instruct', 'openai'):
        print("Unknown Embeddings type.")
        return None

    if embeddings_type == 'instruct':
        embedder = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl", )
        vector_size = 768
    else:
        embedder = OpenAIEmbeddings()
        vector_size = 1536

    # Initiate the connection to Pinecone with credentials from the environment.
    pinecone.init(
        api_key=os.environ.get('PINECONE_API_KEY'),
        environment=os.environ.get('PINECONE_ENV'),
    )

    index_missing = index_name not in pinecone.list_indexes()
    if index_missing:
        print(f'Creating index {index_name} and mapping chunks into embeddings ...', end='')
        pinecone.create_index(index_name, dimension=vector_size, metric='cosine')
        vector_store = Pinecone.from_documents(chunks, embedder, index_name=index_name)
    else:
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = Pinecone.from_existing_index(index_name, embedder)
    print('Done')

    return vector_store
        
        



### 5. Build LLM Chain

In [8]:
def get_llm_chain(vector_store, llm_type = 'google', has_memory=False,):
    """Build a retrieval question-answering chain on top of ``vector_store``.

    Args:
        vector_store: Object exposing ``as_retriever``; used as the retrieval
            backend.
        llm_type: ``'google'`` for HuggingFaceHub ``google/flan-t5-xxl`` or
            ``'openai'`` for ChatOpenAI ``gpt-3.5-turbo``.
        has_memory: When False a ``RetrievalQA`` "stuff" chain is returned.
            When True a ``ConversationalRetrievalChain`` with a
            ``ConversationBufferMemory`` keyed ``chat_history`` is returned.

    Returns:
        The constructed chain.

    Raises:
        ValueError: If ``llm_type`` is not one of the supported values.
    """
    # 1. LLM
    if llm_type == 'google':
        llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.1}) # repo_id = 'meta-llama/Llama-2-70b-chat'
    elif llm_type == 'openai':
        llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=0.1)
    else:
        # Fail early and clearly; otherwise `llm` would stay unbound and the
        # error would surface later as an unrelated exception.
        raise ValueError(f"Unknown llm_type {llm_type!r}. Supported values: 'google', 'openai'.")

    # 2. Vector Store retriever: similarity search returning the top 3 matches.
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

    # 3. Define Chain
    if not has_memory:
        chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
    else:
        llm_memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
        chain = ConversationalRetrievalChain.from_llm(llm, retriever, memory=llm_memory)

    return chain
    
    
    

### 6. Asking Questions & Getting Answers

In [16]:
def ask(query, llm_chain, has_memory=False, chat_history = []):
    """Send ``query`` to ``llm_chain`` and return the answer with its history.

    Args:
        query: The question text.
        llm_chain: Chain to query. Without memory its ``run`` method is used;
            with memory the chain is called with ``{"question": query}``.
        has_memory: Selects which of the two calling conventions is used.
        chat_history: Accepted for compatibility; its value is not read.

    Returns:
        ``(answer, chat_history)``. Without memory the history is always an
        empty list; with memory it is the ``chat_history`` entry of the chain
        output.
    """
    if not has_memory:
        # Use the `query` argument, not a global variable from the calling cell.
        answer = llm_chain.run(query)
        chat_history = []
    else:
        chain_output = llm_chain({"question": query})
        answer, chat_history = chain_output['answer'], chain_output['chat_history']
    return answer, chat_history
        

# Main Program Flow

In [None]:
# def main():
# Interactive driver: Phase 1 collects the configuration, Phase 2 builds (or
# fetches) the vector store, then a question/answer loop runs until the user quits.
import time

print("-"*70,"\n--"," "*64, "--","\n--"," "*9," Welcome to your private chat application. !"," "*9, "--", "\n--"," "*64, "--\n"+"-"*70)
print("Before you start using your private chatbot, please complete Phase 1 & Phase 2 setiup.!")
print("------- Phase 1: Configure your chat agent ")
print("\n|***| Select the LLM you want to use? Enter the corrresponding number or Press enter to use the deault one")
print("\t1. Google/flan-t5-xxl (default)\n\t2. OpenAI")
# Menu answers are stripped so that "2 " is still recognised as option 2.
llm_choice = input().strip()
llm_type = 'openai' if llm_choice == '2' else 'google'
print(f"\tSelected {'OpenAI' if llm_choice=='2' else 'Google/flan-t5-xxl (default)'}")

print("\n|***| Select the embeddings you want to use? Enter the corrresponding number or Press enter to use the deault one")
print("\t1. instructor-xl (default)\n\t2. OpenAI")
embedding_choice = input().strip()
embeddings_type = 'openai' if embedding_choice=='2' else 'instruct'
print(f"\tSelected {'OpenAI' if embedding_choice=='2' else 'instructor-xl (default)'}")

print("\n|***| Do you want your agent to have memory and remember your older conversations?")
print("\t1. No (default)\n\t2. Yes")
memory_choice = input().strip()
has_memory = memory_choice == '2'
print(f"\tSelected {'Yes' if memory_choice=='2' else 'No (default)'}")
print("------- Phase 1: Configuration complete -------")

print("\n")
print("------- Phase 2: Build Knowledge Base -------")
print("\n|***| Select where should the LLM build the knowledge base from? Enter the corrresponding number or Press enter to use the deault one")
print("\t1. Local files (Supported formats: '.pdf', '.txt', '.md' (default)\n\t2. Web (Wikipedia)")
input_choice = input().strip()
print(f"\tSelected {'Web (Wikipedia)' if input_choice=='2' else 'Local files'}\n")
if input_choice == '2':
    query=input("Select the topic you want to search.")
    docs = load_from_wiki(query)
else:
    input_path = input("Enter the path (file/ folder) to read documents from. (Supported formats: '.pdf', '.txt', '.md')")
    docs = load_from(input_path)

# The loaders can come back with nothing (bad path, unsupported file). Use an
# empty list so chunking does not fail; an existing index can still be queried.
if not docs:
    print("Warning: no documents were loaded; only an existing index can be used.")
    docs = []

print("\n---- Chunking the Data ... ", end='')
chunks = chunk_data(docs)
print("Done\n")

index_name = input("|***| Enter the name of name of Pinecone index that you want to create or fetch from.")
print("---- Upload Chunks to Vector Store\n")
if embeddings_type == 'openai':
    # print_embedding_cost prints its own figures and returns nothing, so it
    # must not be wrapped in print() (that would display "None").
    print("Estimated cost:")
    print_embedding_cost(chunks)
vector_store = insert_or_fetch_embeddings(index_name= index_name, chunks= chunks, embeddings_type=embeddings_type)
print("------- Phase 2: Setting up knowledge based vector store complete -------\n")


# 4. Build LLM Chain
llmchain = get_llm_chain(vector_store=vector_store, llm_type = llm_type, has_memory=has_memory,)

#5. Asking Questions & Getting Answers
num = 1
print("*"*77)
print('****************** Your private chatbot is ready for use! ******************')
print("*"*77+"\nEnter Q/ Quit/ Exit to quit.")
while True:
    print(f'{"-" * 50} \n')
    q = input(f'Question #{num}: ')
    num += 1
    if q.lower() in ['q', 'quit', 'exit']:
        print('Conversation ended...')
        time.sleep(2)
        break

    answer, chat_history = ask(query=q, llm_chain=llmchain, has_memory=has_memory, chat_history = [])
    print(f'Answer: {answer}')
    if has_memory:
        print(f"Chat History: {chat_history}")
    

    

---------------------------------------------------------------------- 
--                                                                  -- 
--            Welcome to your private chat application. !           -- 
--                                                                  --
----------------------------------------------------------------------
Before you start using your private chatbot, please complete Phase 1 & Phase 2 setiup.!
------- Phase 1: Configure your chat agent 

|***| Select the LLM you want to use? Enter the corrresponding number or Press enter to use the deault one
	1. Google/flan-t5-xxl (default)
	2. OpenAI
2
	Selected OpenAI

|***| Select the embeddings you want to use? Enter the corrresponding number or Press enter to use the deault one
	1. instructor-xl (default)
	2. OpenAI
2
	Selected OpenAI

|***| Do you want your agent to have memory and remember your older conversations?
	1. No (default)
	2. Yes
2
	Selected Yes
------- Phase 1: Configuration complete -

In [21]:
# Ad-hoc call to the Wikipedia helper; the returned documents are not assigned to a variable.
load_from_wiki("Mia Khalifa")



In [13]:
# Calls the chain object directly with a question and keeps the returned dict in `d`.
# NOTE(review): relies on `llmchain` created by the main-flow cell, and the execution
# counts are out of order — confirm this cell survives Restart & Run All.
d = llmchain({"question": "Who is harry"})

In [15]:
# Display the 'chat_history' entry of the chain output stored in `d`.
d['chat_history']

[HumanMessage(content='Who is harry', additional_kwargs={}, example=False),
 AIMessage(content='Harry Potter', additional_kwargs={}, example=False),
 HumanMessage(content='Who is harry', additional_kwargs={}, example=False),
 AIMessage(content='a wizard', additional_kwargs={}, example=False),
 HumanMessage(content='Who is harry', additional_kwargs={}, example=False),
 AIMessage(content='Harry is a boy.', additional_kwargs={}, example=False)]

In [None]:
harry-potter-part1

In [99]:
../../../Code/


