In [22]:
# Build a sample vectorDB
from langchain.text_splitter import RecursiveCharacterTextSplitter, KonlpyTextSplitter
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader, Docx2txtLoader, TextLoader
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.retrievers  import BM25Retriever
from langchain.retrievers import EnsembleRetriever



from pathlib import Path
import shutil

# Load docs (local PDF files listed in doc_paths)
from dotenv import load_dotenv

load_dotenv()  # Calling this loads the contents of the .env file into environment variables.

# PDF files to index.
doc_paths = [
    "docs/가스계통_운영규정.pdf",
    "docs/여비규정.pdf",
    "docs/취업규칙.pdf",
]

# Load every listed document into `docs`; a failure on one file is reported
# and does not stop the remaining files from being loaded.
docs = []
for doc_file in doc_paths:
    file_path = Path(doc_file)

    print("doc_file", doc_file)
    try:
        if doc_file.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        else:
            # Without this branch `loader` would be undefined (first iteration)
            # or still point at the previous file, silently loading it twice.
            print(f"Skipping unsupported document type: {file_path.name}")
            continue

        docs.extend(loader.load())

    except Exception as e:
        # `doc_file` is a plain str and has no `.name`; use the Path object so
        # the handler itself cannot raise AttributeError and mask the real error.
        print(f"Error loading document {file_path.name}: {e}")


# Split the loaded documents into chunks.
# SemanticChunker is handed an OpenAI embedding model as its only argument.
text_splitter = SemanticChunker(OpenAIEmbeddings())
document_chunks = text_splitter.split_documents(docs)

doc_file docs/가스계통_운영규정.pdf
doc_file docs/여비규정.pdf
doc_file docs/취업규칙.pdf


In [23]:
# Embed the chunks and index them in a Chroma vector store.
# NOTE(review): no collection_name / persist_directory is given — confirm that
# re-running this cell in the same kernel does not append the same chunks to an
# already existing in-memory collection.
vector_db = Chroma.from_documents(documents=document_chunks, embedding=OpenAIEmbeddings())

# Keyword (BM25) retriever over the same chunks, limited to the top 5 hits.
bm25_retriever = BM25Retriever.from_documents(document_chunks)
bm25_retriever.k = 5

# Multi-query retriever on top of the vector store (top 5 hits per query).
llm = ChatOpenAI(temperature=0)
multi_query_retriever = MultiQueryRetriever.from_llm(
    llm=llm,
    retriever=vector_db.as_retriever(search_kwargs={'k': 5}),
)

In [24]:
# Retrieve

def _get_context_retriever_chain(vector_db, llm):
    """Build a history-aware retriever over the vector store and BM25 index.

    The returned chain asks ``llm`` to turn the chat history plus the latest
    user input into a search query and runs that query against an ensemble of
    the Chroma retriever and the BM25 retriever, weighted equally.

    Args:
        vector_db: Vector store exposing ``as_retriever``; queried for 5 hits.
        llm: Chat model used to generate the search query.

    Returns:
        The chain produced by ``create_history_aware_retriever``.

    Note:
        Reads the module-level ``bm25_retriever``; the cell defining it must
        have been executed before this function is called.
    """
    chroma_retriever = vector_db.as_retriever(search_kwargs={'k': 5})

    ensemble_retriever = EnsembleRetriever(
        retrievers=[chroma_retriever, bm25_retriever],
        weights=[0.5, 0.5],
    )

    prompt = ChatPromptTemplate.from_messages([
        MessagesPlaceholder(variable_name="messages"),
        ("user", "{input}"),
        ("user", "Given the above conversation, generate a search query to look up in order to get information relevant to the conversation, focusing on the most recent messages."),
    ])
    retriever_chain = create_history_aware_retriever(llm, ensemble_retriever, prompt)

    return retriever_chain

In [25]:
def get_conversational_rag_chain(llm):
    """Assemble the conversational RAG chain for company-regulation questions.

    Combines the history-aware retriever (built from the module-level
    ``vector_db``) with a stuff-documents answer chain whose system prompt
    receives the retrieved documents through ``{context}``.

    Args:
        llm: Chat model used both for query generation and for answering.

    Returns:
        The chain produced by ``create_retrieval_chain``.
    """
    system_instructions = """You are an assistant designed specifically for answering queries based on company regulations. Always respond strictly according to the company's internal regulations, ensuring your answers are aligned with these rules. 
        When providing an answer, first cite the most relevant regulation in detail, including chapter and section numbers if applicable. If multiple regulations apply, list all relevant ones before giving your response. 
        Your goal is to provide the user with clear guidance based on the regulations, so be as specific as possible with the details of the rules and regulations before proceeding with the final answer.
        If no regulation directly applies, inform the user and give your best guidance based on your knowledge of the company's practices.\n
        {context}"""

    history_aware_retriever = _get_context_retriever_chain(vector_db, llm)

    answer_prompt = ChatPromptTemplate.from_messages([
        ("system", system_instructions),
        MessagesPlaceholder(variable_name="messages"),
        ("user", "{input}"),
    ])
    answer_chain = create_stuff_documents_chain(llm, answer_prompt)

    return create_retrieval_chain(history_aware_retriever, answer_chain)

In [26]:
# Augmented Generation: chat model that streams the final answer.
# The original note suggested "o1-preview" or "o1-mini" as alternatives if you
# already have access to them.
# NOTE(review): confirm those models accept temperature=0 and streaming=True
# before switching.
llm_stream_openai = ChatOpenAI(model="gpt-4o", temperature=0, streaming=True)

In [27]:


llm_stream = llm_stream_openai  # Model used for the response (only an OpenAI model is configured here)

# Conversation so far; the last entry is the question to answer.
# Built directly as LangChain message objects so `messages` never changes type.
messages = [
    HumanMessage(content="Hi"),
    AIMessage(content="Hi there! How can I assist you today?"),
    HumanMessage(content="일이 너무 많아서 연차휴가를 할당된 만큼 다 쓰지 못할거 같아. 그럼 남는 연차휴가는 어떻게 되지? 남는 연차 휴가에 대해 돈으로 받을수 있어?"),
]

conversation_rag_chain = get_conversational_rag_chain(llm_stream)

# Stream only the "answer" field: earlier messages are the history, the last
# message's text is the input. Chunks are echoed as they arrive and accumulated.
response_message = "*(RAG Response)*\n"
for chunk in conversation_rag_chain.pick("answer").stream({"messages": messages[:-1], "input": messages[-1].content}):
    response_message += chunk
    print(chunk, end="", flush=True)

# Record the reply as an AIMessage so the history stays a homogeneous list of
# message objects (previously a raw dict was appended to a list of messages).
messages.append(AIMessage(content=response_message))

제29조(연차유급휴가)에 따르면, 연차휴가는 직원의 자유의사에 따라 적치하여 계산기간 만료 익일부터 1년 이내에 사용해야 합니다. 또한, 공사가 제29조 제1항, 제3항 및 제4항에 따른 연차유급휴가 중 12일의 범위 내에 근로기준법 제61조에 따라 사용촉진조치를 시행하며 그 미사용 휴가는 연간 10일 한도로 5년간 저축 사용 할 수 있으나 5년 이내 또는 퇴직 시까지 사용하지 아니한 저축 연차휴가는 자동소멸됩니다. 

제29조 제6항에 따르면, 공사의 형편에 따라 연차휴가를 사용할 수 없거나 적치하지 아니한 때에는 보수규정이 정하는 바에 따라 수당을 지급한다고 명시되어 있습니다.

따라서, 연차휴가를 다 사용하지 못할 경우, 회사의 형편에 따라 수당으로 받을 수 있는 가능성이 있습니다. 다만, 이는 보수규정에 따라 결정되므로 구체적인 사항은 해당 규정을 참고하시기 바랍니다.

In [28]:
# Build a sample vectorDB
from langchain.text_splitter import RecursiveCharacterTextSplitter, KonlpyTextSplitter
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader, Docx2txtLoader, TextLoader
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.retrievers  import BM25Retriever
from langchain.retrievers import EnsembleRetriever

from pathlib import Path
import shutil

# Load docs

# PDF files to index.
doc_paths = [
    "docs/가스계통_운영규정.pdf",
    "docs/여비규정.pdf",
    "docs/취업규칙.pdf",
]

# Load every listed document into `docs`; a failure on one file is reported
# and does not stop the remaining files from being loaded.
docs = []
for doc_file in doc_paths:
    file_path = Path(doc_file)

    print("doc_file", doc_file)
    try:
        if doc_file.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        else:
            # Without this branch `loader` would be undefined (first iteration)
            # or still point at the previous file, silently loading it twice.
            print(f"Skipping unsupported document type: {file_path.name}")
            continue

        docs.extend(loader.load())

    except Exception as e:
        # `doc_file` is a plain str and has no `.name`; use the Path object so
        # the handler itself cannot raise AttributeError and mask the real error.
        print(f"Error loading document {file_path.name}: {e}")

# Split the loaded documents into chunks with SemanticChunker, which is handed
# an OpenAI embedding model as its only argument.
text_splitter = SemanticChunker(OpenAIEmbeddings())
document_chunks = text_splitter.split_documents(docs)

# Embed the chunks and index them in a Chroma vector store.
# NOTE(review): no collection_name / persist_directory is given — confirm that
# running this cell after an earlier Chroma.from_documents call in the same
# kernel does not append the same chunks to an existing in-memory collection.
vector_db = Chroma.from_documents(documents=document_chunks, embedding=OpenAIEmbeddings())

# Keyword (BM25) retriever over the same chunks, top 5 hits.
bm25_retriever = BM25Retriever.from_documents(document_chunks)
bm25_retriever.k = 5

# Vector-store retriever, top 5 hits.
chroma_retriever = vector_db.as_retriever(search_kwargs={'k': 5})

# Combine both retrievers with equal weight.
ensemble_retriever = EnsembleRetriever(
    weights=[0.5, 0.5],
    retrievers=[chroma_retriever, bm25_retriever],
)

# Wrap the ensemble in a MultiQueryRetriever driven by a temperature-0 chat model.
llm = ChatOpenAI(temperature=0)
multi_query_retriever = MultiQueryRetriever.from_llm(llm=llm, retriever=ensemble_retriever)

def _get_context_retriever_chain(vector_db, ensemble_retriever, llm):
    """Build a history-aware retriever around ``ensemble_retriever``.

    The returned chain asks ``llm`` to turn the chat history plus the latest
    user input into a search query and runs that query against
    ``ensemble_retriever``.

    Args:
        vector_db: Unused; kept so existing callers keep working.
        ensemble_retriever: Retriever that executes the generated query.
        llm: Chat model used to generate the search query.

    Returns:
        The chain produced by ``create_history_aware_retriever``.
    """
    # The previous version also built a MultiQueryRetriever here that was never
    # used; it was dead code and has been removed. Callers that want
    # multi-query behaviour pass such a retriever in as `ensemble_retriever`.
    prompt = ChatPromptTemplate.from_messages([
        MessagesPlaceholder(variable_name="messages"),
        ("user", "{input}"),
        ("user", "Given the above conversation, generate a search query to look up in order to get information relevant to the conversation, focusing on the most recent messages."),
    ])
    retriever_chain = create_history_aware_retriever(llm, ensemble_retriever, prompt)

    return retriever_chain

def get_conversational_rag_chain(llm, retriever):
    """Assemble the conversational RAG chain for company-regulation questions.

    Combines a history-aware retriever built around ``retriever`` with a
    stuff-documents answer chain. The system prompt receives the retrieved
    documents through ``{context}`` and asks for a "Source Regulations:"
    section with exact quotes.

    Args:
        llm: Chat model used both for query generation and for answering.
        retriever: Retriever handed to ``_get_context_retriever_chain``.

    Returns:
        The chain produced by ``create_retrieval_chain``.
    """
    system_instructions = """You are an assistant designed specifically for answering queries based on company regulations. Always respond strictly according to the company's internal regulations, ensuring your answers are aligned with these rules. 
        When providing an answer, first cite the most relevant regulation in detail, including chapter and section numbers if applicable. If multiple regulations apply, list all relevant ones before giving your response. 
        Your goal is to provide the user with clear guidance based on the regulations, so be as specific as possible with the details of the rules and regulations before proceeding with the final answer.
        If no regulation directly applies, inform the user and give your best guidance based on your knowledge of the company's practices.
        
        After your explanation, provide the exact quotes from the relevant regulations under a "Source Regulations:" section. Format each quote as follows:
        [Document Name] Chapter X, Section Y: "Exact quote from the regulation"

        {context}"""

    # `vector_db` is the module-level vector store; it is forwarded unchanged.
    history_aware_retriever = _get_context_retriever_chain(vector_db, retriever, llm)

    answer_prompt = ChatPromptTemplate.from_messages([
        ("system", system_instructions),
        MessagesPlaceholder(variable_name="messages"),
        ("user", "{input}"),
    ])
    answer_chain = create_stuff_documents_chain(llm, answer_prompt)

    return create_retrieval_chain(history_aware_retriever, answer_chain)

# Chat model that streams the final answer.
llm_stream_openai = ChatOpenAI(
    model="gpt-4",  # Here you could use "gpt-4-1106-preview" or "gpt-4-0125-preview" if you have access to them
    temperature=0,
    streaming=True,
)

# Conversation so far; the last entry is the question to answer.
# Built directly as LangChain message objects so `messages` never changes type.
messages = [
    HumanMessage(content="Hi"),
    AIMessage(content="Hi there! How can I assist you today?"),
    HumanMessage(content="일이 너무 많아서 연차휴가를 할당된 만큼 다 쓰지 못할거 같아. 그럼 남는 연차휴가는 어떻게 되지? 남는 연차 휴가에 대해 돈으로 받을수 있어?"),
]

conversation_rag_chain = get_conversational_rag_chain(llm_stream_openai, multi_query_retriever)

# Stream only the "answer" field: earlier messages are the history, the last
# message's text is the input. Chunks are echoed as they arrive and accumulated.
response_message = "*(RAG Response)*\n"
for chunk in conversation_rag_chain.pick("answer").stream({"messages": messages[:-1], "input": messages[-1].content}):
    response_message += chunk
    print(chunk, end="", flush=True)

# Record the reply as an AIMessage so the history stays a homogeneous list of
# message objects (previously a raw dict was appended to a list of messages).
messages.append(AIMessage(content=response_message))

doc_file docs/가스계통_운영규정.pdf
doc_file docs/여비규정.pdf
doc_file docs/취업규칙.pdf
회사 규정에 따르면, 연차휴가는 직원의 자유의사에 따라 적치하여 계산기간 만료 익일부터 1년 이내에 사용해야 합니다. 또한, 회사는 제29조 제1항, 제3항 및 제4항에 따른 연차유급휴가 중 12일의 범위 내에 근로기준법 제61조에 따라 사용촉진조치를 시행하며 그 미사용 휴가는 연간 10일 한도로 5년간 저축 사용 할 수 있습니다. 하지만, 5년 이내 또는 퇴직 시까지 사용하지 아니한 저축 연차휴가는 자동소멸됩니다.

또한, 공사의 형편에 따라 연차휴가를 사용할 수 없거나 적치하지 아니한 때에는 보수규정이 정하는 바에 따라 수당을 지급한다는 규정이 있습니다. 이는 특정 상황에서만 적용되며, 일반적으로 연차휴가를 돈으로 환산하여 받는 것은 허용되지 않습니다.

Source Regulations:
[근