## Env

In [None]:
#!pip -V
# Install the packages imported below: python-dotenv (dotenv), RAGChain
# (the HWP loader) and bs4 (BeautifulSoup).
!pip install python-dotenv RAGChain bs4

In [None]:
# Read the .env configuration file and load its entries as environment variables
from dotenv import load_dotenv
load_dotenv()

## HWP 문서 가져오기

https://nomadamas.gitbook.io/ragchain-docs/ragchain-structure/file-loader/win32-hwp-loader

In [None]:
from RAGchain.preprocess.loader import Win32HwpLoader

# Load the HWPX announcement file. Later cells index `documents` and read each
# item's `page_content` and `metadata`.
loader = Win32HwpLoader("rawdata/(첨부5) 2024년도 하반기 중남미 지역기구 파견인턴 선발 공고.hwpx")
documents = loader.load()

# Number of documents the loader produced
len(documents)

In [None]:
from pprint import pprint
pprint(documents[0].page_content)

In [None]:
pprint(documents[1].page_content)

## Simple RAG

#### Indexing

In [None]:
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.chat_models import ChatOllama
from langchain.prompts import ChatPromptTemplate

In [None]:
# Load the HuggingFace embedding model.
# Imported from langchain_community, like the other integrations used in this
# notebook; the `langchain.embeddings` import path is deprecated.
from langchain_community.embeddings import HuggingFaceEmbeddings

embedding_model_name = "BM-K/KoSimCSE-roberta-multitask"

embeddings_model = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
)

# Load the tokenizer of the same embedding model; it is used below to measure
# chunk sizes in tokens.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)

In [None]:
embeddings_model

In [None]:
# Split the documents into chunks, measuring length in tokens of the embedding
# model's tokenizer (chunk_size=500, chunk_overlap=50).
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer = tokenizer,
    chunk_size = 500,
    chunk_overlap  = 50,
)

split_docs = text_splitter.split_documents(documents)
# Show how many chunks were produced and what the first one looks like
print(len(split_docs))
print(split_docs[0])

In [None]:
# Embed the chunks and index them in Chroma.
# An explicit collection name keeps this index apart from any other
# Chroma.from_documents call in the same session that relies on the library's
# default collection, so the stores do not mix their chunks.
# NOTE: re-running this cell adds the same chunks to the collection again.
vectorstore = Chroma.from_documents(
    documents=split_docs,
    embedding=embeddings_model,
    collection_name="hwp_html_chunks",
)

#### Retrieval

In [None]:
# Create a retriever on top of the vector store (search_kwargs: k=2)
retriever = vectorstore.as_retriever(search_kwargs={'k': 2})

# Prompt: the model is told to answer from the supplied context only
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# Chat Model
llm = ChatOllama(model="llama3.1", temperature=0.1, verbose=True)


def format_docs(docs):
    """Join the text of retrieved documents into a single context string.

    Documents with identical ``page_content`` are included only once. The
    first occurrence is kept and the incoming order is preserved, so the same
    input always yields the same context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The distinct ``page_content`` values joined by a blank line (two
        newline characters); an empty string when ``docs`` is empty.
    """
    # dict.fromkeys removes duplicates while keeping insertion order; a set
    # would let the order of the joined chunks change from run to run.
    unique_contents = dict.fromkeys(doc.page_content for doc in docs)

    return "\n\n".join(unique_contents)


# RAG Chain: the input question is sent to the retriever (whose results are
# formatted by format_docs) and is also passed through unchanged as
# "question"; the filled prompt goes to the LLM and the reply is parsed to str.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Run the chain
response = rag_chain.invoke("파견 기간은 얼마인가요?")
print(response)

In [None]:
# Inspect the chunks retrieved for this question (`invoke` replaces the
# deprecated `get_relevant_documents`).
retriever.invoke("파견 기간은 얼마인가요?")

In [None]:
# Ask the chain a second question and print the answer
response = rag_chain.invoke("필기시험이 있는 날짜는 언제인가요?")
print(response)

In [None]:
# Inspect the chunks retrieved for this question (`invoke` replaces the
# deprecated `get_relevant_documents`).
retriever.invoke("필기시험이 있는 날짜는 언제인가요?")

In [None]:
# Ask the chain a third question and print the answer
response = rag_chain.invoke("국제이주기구 파견은 언제 하나요?")
print(response)

In [None]:
# Inspect the chunks retrieved for this question (`invoke` replaces the
# deprecated `get_relevant_documents`).
retriever.invoke("국제이주기구 파견은 언제 하나요?")

In [None]:
# Inspect the chunks retrieved for the bare keyword "IOM" (`invoke` replaces
# the deprecated `get_relevant_documents`).
retriever.invoke("IOM")

## Table Data

In [None]:
# Take the first chunk whose text mentions "IOM" as the sample context for the
# cells below.
sample_text = next(
    (d.page_content for d in split_docs if "IOM" in d.page_content),
    None,
)
if sample_text is None:
    # Fail here instead of letting later cells hit a NameError or silently
    # reuse a stale `sample_text` left over from an earlier run.
    raise LookupError('No chunk in split_docs contains "IOM"')

print(sample_text)

#### HTML 테이블 구조를 컨텍스트로 전달

In [None]:
sample_text

In [None]:
len(tokenizer.encode(sample_text))

In [None]:
# Query the LLM directly, using the raw sample chunk as the only context
llm.invoke(f"""Answer the question based only on the following context:
{sample_text}

Question: 국제이주기구 파견은 언제 하나요?
""")

#### Markdown 형식으로 테이블 데이터를 변환하여 컨텍스트로 전달

In [None]:
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(sample_text, 'html.parser')

# Find the first table
table = soup.find('table')

# Convert to a pandas DataFrame. The markup is wrapped in StringIO because
# passing a literal HTML string to read_html is deprecated in pandas.
df = pd.read_html(StringIO(str(table)))[0]

# Convert the DataFrame to Markdown
markdown_table = df.to_markdown(index=False)

print(markdown_table)

In [None]:
# Query the LLM directly, using the Markdown-rendered table as the only context
llm.invoke(f"""Answer the question based only on the following context:
{markdown_table}

Question: 국제이주기구 파견은 언제 하나요?
""")

In [None]:
len(tokenizer.encode(markdown_table))

## Text 와 Table 구분하여 청크로 분할

In [None]:
import pandas as pd
from bs4 import BeautifulSoup

def convert_html_table_to_markdown(html_table_text: str) -> str:
    """Convert the first HTML ``<table>`` found in a string to Markdown.

    Args:
        html_table_text: Text containing at least one HTML ``<table>`` element.

    Returns:
        The first table rendered with ``DataFrame.to_markdown`` and without
        the index column.

    Raises:
        ValueError: If ``html_table_text`` contains no ``<table>`` element.
            pandas may raise its own errors for markup it cannot parse.
    """
    from io import StringIO

    # Parse the HTML and pick the first table element.
    table = BeautifulSoup(html_table_text, 'html.parser').find('table')
    if table is None:
        # Report the real problem instead of handing the string "None" to pandas.
        raise ValueError("No <table> element found in the given HTML text")

    # Passing a literal HTML string to read_html is deprecated in pandas, so
    # the markup is wrapped in a file-like object.
    df = pd.read_html(StringIO(str(table)))[0]

    # Render the DataFrame as a Markdown table.
    return df.to_markdown(index=False)

In [None]:
documents[0].metadata

In [None]:
# Build the document list for re-indexing: documents whose metadata marks them
# as tables are converted from HTML to Markdown, all others pass through as-is.
markdown_documents = []

for doc in documents:
    if doc.metadata.get("page_type") == 'table':
        try:
            # NOTE: this overwrites page_content on the same objects that
            # `documents` holds; no copy is made.
            doc.page_content = convert_html_table_to_markdown(doc.page_content)
        except Exception as exc:
            # Best effort: keep the original content, but report the failure
            # instead of hiding it (and KeyboardInterrupt) behind a bare except.
            print(f"Table conversion failed; keeping original content: {exc!r}")
    markdown_documents.append(doc)


len(markdown_documents)
            

In [None]:
# Re-split the converted documents with the same token-based splitter
new_split_docs = text_splitter.split_documents(markdown_documents)
# Show how many chunks were produced and what the first one looks like
print(len(new_split_docs))
print(new_split_docs[0])

In [None]:
# Index the Markdown-converted chunks in their own Chroma collection.
# An explicit collection name keeps this index separate from other in-memory
# Chroma stores created in the same session, so retriever2 only returns chunks
# from new_split_docs.
# NOTE: re-running this cell adds the same chunks to the collection again.
vectorstore2 = Chroma.from_documents(
    documents=new_split_docs,
    embedding=embeddings_model,
    collection_name="hwp_markdown_chunks",
)


retriever2 = vectorstore2.as_retriever(search_kwargs={'k': 2})

# Same chain as before, but retrieving from the Markdown-based index
rag_chain2 = (
    {"context": retriever2 | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

response = rag_chain2.invoke("국제이주기구 파견은 언제 하나요?")
response

In [None]:
# Inspect the chunks retrieved from the Markdown-based index (`invoke`
# replaces the deprecated `get_relevant_documents`).
retriever2.invoke("국제이주기구 파견은 언제 하나요?")