In [9]:
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter,TokenTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from uuid import uuid4
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
AZURE_OPENAI_DEPLOYMENT_ID = os.getenv('AZURE_OPENAI_DEPLOYMENT_ID')
AZURE_OPENAI_KEY = os.getenv('AZURE_OPENAI_KEY')
AZURE_API_VERSION = os.getenv('AZURE_API_VERSION')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=PINECONE_API_KEY)

In [3]:
llm = AzureChatOpenAI(
            azure_endpoint=AZURE_OPENAI_ENDPOINT,
            azure_deployment=AZURE_OPENAI_DEPLOYMENT_ID,
            api_version=AZURE_API_VERSION,
            api_key=AZURE_OPENAI_KEY,
            temperature=0.0,
            verbose=True,
        )

embedding_llm = AzureOpenAIEmbeddings(
            azure_endpoint=AZURE_OPENAI_ENDPOINT,
            azure_deployment='embedding-ada-crayon',
            api_key=AZURE_OPENAI_KEY,
            api_version=AZURE_API_VERSION,
        )

In [4]:
index_name = "ocbc-hr-gpt"  # change if desired
index = pc.Index(index_name)

In [5]:
loader = PyPDFLoader(r"c:\Users\san\Downloads\Dummy - CB Policy.pdf")
pages = loader.load_and_split()

In [13]:
pc = Pinecone(api_key=PINECONE_API_KEY)
text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(pages)

In [14]:
splits

[Document(metadata={'source': 'c:\\Users\\san\\Downloads\\Dummy - CB Policy.pdf', 'page': 0}, page_content='1 \n \nOCBC Information Classification: Internal   \n \n \n \n \n \n \n \n \nCompensation & Benefits \nPolicy  \nPT CRAYON SHINCHAN  \n \nPolicy Effective Date: < 07/08/24> \n \n \n \n \n \n \nNo part of this documentation may be reproduced or transmitted in any form or by any means, electronic \nor mechanical, including photocopying or recording, for any purpose without express written permission \nof the CEO of PT CRAYON  SHINCHAN.  \n \n© 2021, <Company Name Here>. All Rights Reserved'),
 Document(metadata={'source': 'c:\\Users\\san\\Downloads\\Dummy - CB Policy.pdf', 'page': 1}, page_content='2 \n \nOCBC Information Classification: Internal   \nRevision History  \n  \nVer \nNo. Change \nDescription  Prepared \nBy Reviewed By  Approved \nBy Date  \n      \n  \n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \nTABLE OF CONTENTS  \n \n

In [42]:
pc = Pinecone(api_key=PINECONE_API_KEY)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(pages)
vector_store = PineconeVectorStore(index=index, embedding=embedding_llm)
vector_store.add_documents(documents=splits)

['69d8ab8c-cad2-4544-8b5b-2354f0a8daf1',
 'd1215168-88ea-4d52-bb10-d452289cb95c',
 '95a7d87b-0a33-4083-acf8-c88385ca26e9',
 'dba68d81-2c0e-4c3f-b7c3-04e7e302f1e1',
 'f78c0f00-2400-420f-83ee-cd07fbf2cb5c',
 '6635ba5b-2944-4863-8c24-1ace90594c06',
 'a4df0f04-8222-4aa5-829d-3823d58eef64',
 '999e5651-e0c6-4f55-95f9-6cbb7be246bf',
 '879c2b20-2fc0-40c3-bbb4-614478b152f4',
 '2f8a1314-d7b9-4ede-96e9-9bd38157b302',
 'c6247b2a-d7f2-4873-9566-bda85f9f7996',
 '9185ea85-4bef-4e51-a25d-b17737e8c51e',
 'e944747b-715f-4dd5-b9cb-ed8cd0bbce03',
 'd777420f-430c-4b44-aed5-a7ff872c7649',
 '54528cc8-917d-4de7-9ae1-0d21e3f07f4e',
 'addee4d2-5aaf-40c9-9ed7-e5f460736fb6',
 '4b34959f-b380-4914-883a-003a8dfa15d0',
 '8b1aebe6-8b36-41da-b6c4-20826a8d5cb6',
 '666d54c1-811e-46d7-a37c-fcfe66fab5bf',
 '207b85c4-5339-4418-8458-2662c7516cfa',
 '792c2740-2a78-4874-81f0-547e4ebbd614',
 '4bdb6459-2897-4632-9be0-c1cfb7a102ba']

In [43]:
retriever = vector_store.as_retriever()
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("What is Definition & Glossary?")

'The Definition & Glossary section provides definitions and expansions for various terms and abbreviations used within the organization. It includes definitions for terms such as EoSB (End of Service Benefits), GCC (Gulf Cooperating Countries), and GPSSA (General Pension Social Security Authority).'

In [44]:
retriever.invoke("What is Scope & Applicability ?")

[Document(metadata={'page': 2.0, 'source': 'c:\\Users\\san\\Downloads\\Dummy - CB Policy.pdf'}, page_content='3 \n \nOCBC Information Classification: Internal  Scope & Applicability  4 \nDefinition & Glossary  4 \nPolicy / Process  Error! Bookmark not def ined.  \n4.1 Policy Definition:  Error! Bookmark not defined.  \n4.2. Procedures  Error! Bookmark not defined.  \n4.3. Responsibility – HR department / Finance department  Error! Bookmark not \ndefined.  \nNon-compliance and consequences  1 \nSpecial Circumstances and Exceptions  Error! Bookmark not defined.'),
 Document(metadata={'page': 3.0, 'source': 'c:\\Users\\san\\Downloads\\Dummy - CB Policy.pdf'}, page_content="4 \n \nOCBC Information Classification: Internal   \n1. Objective  \n \nThe objective of the Compensation and Benefits Policy (The Policy) is aimed \ntowards building a strong framework of pay structure for the organization to \ncreate a  competitive work environment for its employees.  \nThe policy aims  to be transpar