# Import Libraries

In [60]:
import os
import pandas as pd
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import BedrockEmbeddings
from langchain.vectorstores import FAISS
from langchain.indexes import VectorstoreIndexCreator
import pinecone
from langchain.vectorstores import Pinecone
from pinecone.index import Index
from langchain.llms.bedrock import Bedrock
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings.openai import OpenAIEmbeddings


# Setup requires the following steps:
1. Document Loading and Chunking 
2. Pinecone initiation for index creation
3. Chunk embeddings 
4. Upload to Pinecone


In [61]:
# SETUP STEP 1: Load >> Chunk document
# Data transformation and prepping: load a publicly available leave-policy PDF
# and cut its text into overlapping, character-sized chunks.
data_load = PyPDFLoader('https://www.upl-ltd.com/images/people/downloads/Leave-Policy-India.pdf')

# Data chunking -> split the text into chunks (think of a paragraph, but sized
# by number of characters rather than by meaning).
splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', ' ', ''],
    chunk_size=1000,
    chunk_overlap=150,
)

# First pass: load_and_split() is called without arguments, so `splitter`
# is not applied here; it is applied in the second pass below.
data_chunks = data_load.load_and_split()
df = pd.DataFrame({
    'page_content': [chunk.page_content for chunk in data_chunks],
    'page_source': [chunk.metadata['page'] for chunk in data_chunks],
    'document_source': [chunk.metadata['source'] for chunk in data_chunks],
})

# Second pass: re-split every first-pass chunk with the custom splitter.
df['split'] = df.page_content.apply(splitter.split_text)

# Build one small frame per row and concatenate once at the end; calling
# pd.concat inside the loop would re-copy the accumulated frame every time.
page_frames = []
for page_source, page_splits in zip(df['page_source'], df['split']):
    page_frame = pd.DataFrame({'split_page': page_splits})
    # Stored page index + 1 (presumably turns a 0-based index into a
    # human-readable page number -- confirm against the loader).
    page_frame['page_number'] = page_source + 1
    # The positional index of each piece within its row becomes `index_chunk`.
    page_frames.append(
        page_frame.reset_index().rename(columns={'index': 'index_chunk'})
    )

if page_frames:
    final_df = pd.concat(page_frames, axis=0)
else:
    # Nothing was loaded: keep the expected columns so the steps below still run.
    final_df = pd.DataFrame(columns=['index_chunk', 'split_page', 'page_number'])

# "<page_number>#<index_chunk>" identifies each piece within its page.
final_df['unique_id'] = final_df.page_number.astype(str) + '#' + final_df.index_chunk.astype(str)

# Turn each row into a LangChain document; `split_page` is the text and the
# remaining columns are carried along by the loader.
docs = DataFrameLoader(
    final_df,
    page_content_column="split_page"
).load()


In [62]:
# SETUP STEP 2: Initialise Pinecone and create the index if it does not exist yet.
# You need a Pinecone account for this (a free-tier account is used here).
# NOTE(review): the two assignments below overwrite any PINECONE_API_KEY /
# OPENAI_API_KEY already set in the environment with placeholder strings;
# replace the placeholders with real keys before running.
os.environ['PINECONE_API_KEY'] = 'PINECONE_API_KEY'
os.environ['OPENAI_API_KEY'] = "OPENAI_API_KEY"

pinecone.init(
    api_key = os.environ['PINECONE_API_KEY'],
    environment = 'gcp-starter'
    
)

# Show the indexes that currently exist for this account.
print(pinecone.list_indexes())

index_name = 'hr-policy'

# Only create the index when it is missing, so re-running this cell does not
# attempt to create it again.
# NOTE(review): dimension = 1536 presumably has to match the output size of
# the embedding model used below -- confirm.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name = index_name,
        metric = 'cosine',
        dimension = 1536
    )

# SETUP STEP 3: Document embeddings
# Bedrock embedding model; this is the embedding object passed to Pinecone below.
data_embeddings = BedrockEmbeddings(
        credentials_profile_name = 'default',
        model_id = 'amazon.titan-embed-text-v1'
    )

# NOTE(review): openai_embeddings is created here but not used anywhere in
# the visible code.
openai_embeddings = OpenAIEmbeddings(api_key= os.environ['OPENAI_API_KEY'])

index = Index(index_name)

# SETUP STEP 4: Upload to Pinecone
# If the index already holds vectors, attach to it; otherwise embed and upload.
# NOTE(review): the upload branch sends `data_chunks` (first-pass chunks), not
# the re-split `docs` built in step 1 -- confirm this is intended.
if index.describe_index_stats()['total_vector_count'] > 0:
    docsearch = Pinecone.from_existing_index(
                    index_name,
                    data_embeddings
    )
else:
    docsearch = Pinecone.from_documents(
        data_chunks,
        data_embeddings, 
        index_name = index_name
    )


['hr-policy']


# Retrieval Steps: 
1. Initialise the LLM; we are using Llama 2 from Bedrock in this use case. 
2. Set up the question prompt and the document prompt.
3. LangChain's RetrievalQAWithSourcesChain forwards the question prompt together with the retrieved sources to the LLM and returns the answer in natural language. 
4. Ask questions

In [63]:
# Sanity check of the Pinecone-backed `docsearch`: fetch the documents relevant to a sample question.
# The result shows what the retrieved documents look like before they are passed to the LLM for further processing.
docsearch.as_retriever().get_relevant_documents('how many leave am I entitled to?')

[Document(page_content='employee must apply the sick leave in the prescribed online platform immediately on return to \nwork. \n \nAny misuse of the sick leave provision will attract disciplinary action and may lead to termination \nof an employee. \n \n4.5 Maternity Leave  \n \nEligibility - Total of 26 weeks twice in the service tenure as governed by ‘The Maternity \nBenefit Act” & Maternity Benefit (amendment) Bill 2017.  \nApart from the standard Maternity leaves mentioned above, women will be \nentitled for additional 6 weeks of Maternity leaves. \nAccumulation -   Not applicable. \nGuidelines: \n \n\uf0b7 Existing provisions as per Maternity Benefit Act, 1961 & Maternity Benefit (amendment) \nBill 2017 and ESI Act, 1948 will be continued.  \n\uf0b7 All women employees are eligible for a total of 32 weeks of maternity leave for 2 surviving \nchildren. The woman employee can proceed on maternity leave up to a maximum of 26 \nweeks but not more than 8 weeks before the expected deliv

In [71]:
def hr_llm():
    """Build the Bedrock LLM client used to answer HR questions.

    Returns:
        A ``Bedrock`` instance for the ``meta.llama2-13b-chat-v1`` model,
        created with the ``default`` credentials profile.
    """
    # Generation settings handed to Bedrock unchanged via ``model_kwargs``.
    generation_settings = {"temperature": 0.1, 'top_p': 0.9, 'max_gen_len': 2000}
    return Bedrock(
        credentials_profile_name='default',
        model_id='meta.llama2-13b-chat-v1',
        model_kwargs=generation_settings,
    )

In [80]:
# Template rendered once per retrieved document. Placeholders:
#   {page_content} - the text of the retrieved chunk
#   {source}       - the document link
#   {page}         - the page value stored with the chunk
# The placeholders follow the order announced by the label ("Link and Page")
# so the model is not shown a page value where the label promises a link.
CONTEXT_PROMPT = """
{page_content}

Document Link and Page: {source} and {page}
=========

"""

# Template for the final question sent to the LLM. Placeholders:
#   {question}  - the user's question
#   {summaries} - the rendered document snippets
QUESTION_PROMPT = """
You are an intelligent chatbot with a limited knowledge base. 
Given the following extracted parts of an HR policy document and a question, create a final answer with the source of document and the page ("SOURCE").
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCE" part in your answer. Source should contain the source link and the page number. 

QUESTION: {question}
=========
{summaries}
ANSWER:"""

In [81]:
# Wrap both raw template strings in PromptTemplate objects (document prompt
# first, then question prompt); the chain below consumes these objects.
context_prompt, question_prompt = (
    PromptTemplate.from_template(raw_template)
    for raw_template in (CONTEXT_PROMPT, QUESTION_PROMPT)
)

In [83]:
def embedding():
    """Create the Bedrock embedding client.

    Returns:
        A ``BedrockEmbeddings`` instance for ``amazon.titan-embed-text-v1``,
        created with the ``default`` credentials profile.
    """
    return BedrockEmbeddings(
        credentials_profile_name='default',
        model_id='amazon.titan-embed-text-v1',
    )

In [84]:
def pinecone_retriever():
    """Attach to the existing ``hr-policy`` Pinecone index.

    Returns:
        The ``Pinecone`` vector store built from the existing index, using
        the embedding client returned by ``embedding()``.
    """
    embedder = embedding()
    return Pinecone.from_existing_index(
        index_name='hr-policy',
        embedding=embedder,
    )

In [85]:
def hr_rag_response(question):
    """Answer an HR-policy question using retrieval plus the Bedrock LLM.

    Builds a ``RetrievalQAWithSourcesChain`` over the Pinecone retriever,
    runs it on ``question`` and returns the cleaned-up ``answer`` field.

    Args:
        question: The question text to pass to the chain.

    Returns:
        The chain's answer as a string, with blank-line paragraph breaks
        collapsed into single spaces and surrounding whitespace removed.
    """
    docsearch = pinecone_retriever()

    qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
        chain_type='stuff',
        llm=hr_llm(),
        chain_type_kwargs={
            'prompt': question_prompt,
            'document_prompt': context_prompt,
        },
        retriever=docsearch.as_retriever(),
    )

    raw_answer = qa_with_sources(question)['answer']

    # Join paragraphs with a space rather than nothing, so the last sentence
    # of one paragraph is not glued to the first word of the next; strip()
    # keeps a leading or trailing break from turning into a stray space.
    return raw_answer.replace("\n\n", " ").strip()

In [86]:
# Testing the HR LLM response: ask a sample question through hr_rag_response.
hr_rag_response(question = 'how many leaves am I entitled to?')

'According to the leave policy document provided, an employee is entitled to 21 working days of privilege leave in a calendar year. The source of this information is page 4.2 of the document, which can be found at the following link: <https://www.upl-ltd.com/images/people/downloads/Leave-Policy-India.pdf>.Please note that the document states that employees must avail a minimum of four days of privilege leave in the year, for which L.T.A. is claimed. Additionally, the document specifies that employees must inform their line manager in advance for any leave availed.'