# RAG Pipeline with LangChain

## 1. Setup and Imports

In [1]:
import os
import json
import ast
from dotenv import load_dotenv
from konlpy.tag import Okt

# LangChain components
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Zilliz # FAISS 대신 Zilliz 임포트
from langchain_community.retrievers import BM25Retriever
from langchain_classic.retrievers import EnsembleRetriever

In [2]:
# Load variables from the .env file into the process environment.
load_dotenv()

# Connection settings for Zilliz Cloud; both are passed to the vector store
# when the retrievers are built.
zilliz_uri = os.getenv("ZILLIZ_CLOUD_URI")
zilliz_token = os.getenv("ZILLIZ_CLOUD_TOKEN")

# Fail fast: a missing credential stops the notebook here with a clear
# message instead of being printed and ignored, which would let later cells
# run with None and fail somewhere less obvious.
if not zilliz_uri:
    raise ValueError("ZILLIZ_CLOUD_URI not set in .env file.")
if not zilliz_token:
    raise ValueError("ZILLIZ_CLOUD_TOKEN not set in .env file.")
print("API keys loaded successfully.")

API keys loaded successfully.


## 2. Data Loading and Document Preparation
Load the audit cases and prepare them in LangChain's `Document` format. We create two sets of documents for our hybrid search strategy:
1.  **Summary-based documents:** For semantic search (Zilliz).
2.  **Keyword-focused documents:** For keyword search (BM25), built from the raw 'title', 'problem', and 'action' fields.

In [3]:
def load_and_prepare_docs(filepath="../audit_cases.json"):
    """
    Load audit cases from a JSON file and build two parallel document lists.

    The roles of semantic search and keyword search are separated:
      1. Semantic-search documents are built from the parsed
         'contents_summary' field (title, keywords, problem/action summaries,
         standards), preceded by the case's site, category and date.
      2. Keyword-search documents are built from the raw 'title', 'problem'
         and 'action' fields.

    Both documents of a case carry equal metadata: the case's position in the
    file ("index"), "title", "site", "category" and "date".

    Args:
        filepath: Path to a UTF-8 JSON file; it is expected to hold a sequence
            of dict-like case records.

    Returns:
        A tuple ``(semantic_docs, keyword_docs)`` with one ``Document`` per
        case in each list, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    print(f"Loading data from {filepath} and preparing optimized documents...")
    with open(filepath, 'r', encoding='utf-8') as f:
        audit_cases = json.load(f)

    semantic_docs = []  # summary-based documents for semantic search
    keyword_docs = []   # raw-field documents for BM25 keyword search

    for i, case in enumerate(audit_cases):
        site = case.get('site', '알 수 없음')
        category = case.get('category', '알 수 없음')
        date = case.get('date', '알 수 없음')
        original_title = case.get('title', '')

        metadata = {
            "index": i, "title": original_title, "site": site,
            "category": category, "date": date
        }

        # 1. Semantic-search document.
        # 'contents_summary' is parsed as a Python literal. Anything that
        # cannot be parsed, or that parses to something other than a dict,
        # falls back to an empty summary so the .get() lookups below are safe.
        summary_dict = {}
        summary_str = case.get('contents_summary')
        if summary_str:
            try:
                parsed_summary = ast.literal_eval(summary_str)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                parsed_summary = None
            if isinstance(parsed_summary, dict):
                summary_dict = parsed_summary

        title = summary_dict.get('title_str', original_title)

        # Tolerate a missing/None keyword list, a single bare string (which
        # would otherwise be joined character by character) and non-string
        # entries.
        keyword_list = summary_dict.get('keyword_list') or []
        if isinstance(keyword_list, str):
            keyword_list = [keyword_list]
        keywords = ", ".join(str(keyword) for keyword in keyword_list)

        problems = summary_dict.get('problems_str', '')
        action = summary_dict.get('action_str', '')
        standards = summary_dict.get('standards_str', '')

        # Fields are separated by real newlines ("\n"); the previous "\\n"
        # embedded a literal backslash followed by 'n' in every document.
        summary_based_text = (
            f"출처: {site}\n"
            f"분류: {category}\n"
            f"일자: {date}\n"
            f"제목: {title}\n"
            f"핵심 키워드: {keywords}\n"
            f"문제 요약: {problems}\n"
            f"조치 요약: {action}\n"
            f"관련 규정: {standards}"
        )
        # Each Document gets its own copy of the metadata so that mutating one
        # document's metadata cannot affect its counterpart.
        semantic_docs.append(Document(page_content=summary_based_text, metadata=dict(metadata)))

        # 2. Keyword-search document, built from the raw fields.
        problem_raw = case.get('problem', '')
        action_raw = case.get('action', '')
        keyword_optimized_text = f"제목: {original_title}\n문제점: {problem_raw}\n조치사항: {action_raw}"
        keyword_docs.append(Document(page_content=keyword_optimized_text, metadata=dict(metadata)))


    print(f"  - Created {len(semantic_docs)} documents for semantic search.")
    print(f"  - Created {len(keyword_docs)} documents for keyword search.")
    return semantic_docs, keyword_docs

# Build both document sets with the default path. Of the two, `keyword_documents`
# feeds the BM25 index; `full_text_documents` (the summary-based set) is not
# referenced by the other cells visible in this notebook.
full_text_documents, keyword_documents = load_and_prepare_docs()

Loading data from ../audit_cases.json and preparing optimized documents...
  - Created 4961 documents for semantic search.
  - Created 4961 documents for keyword search.


## 3. Retriever Setup (Hybrid Search)
We set up two retrievers — a Zilliz semantic retriever and a BM25 keyword retriever — and combine them with equal weights using `EnsembleRetriever`.

In [4]:
# Bring up the embedding model and the Okt tokenizer
print("Initializing models and retrievers...")
embeddings = OllamaEmbeddings(
    model="nomic-embed-text",
    base_url="http://localhost:11434",
)
okt = Okt()

# --- Semantic side: Zilliz vector store ---
print("  - Loading Zilliz vector store...")
zilliz_connection_args = {"uri": zilliz_uri, "token": zilliz_token}
zilliz_vectorstore = Zilliz(
    embedding_function=embeddings,
    collection_name="audit_cases_gemma_v1",  # name of the collection uploaded to Zilliz
    connection_args=zilliz_connection_args,
)
semantic_search_kwargs = {"k": 5}
semantic_retriever = zilliz_vectorstore.as_retriever(search_kwargs=semantic_search_kwargs)
print("    - Zilliz retriever ready.")

# --- Keyword side: BM25 over the keyword-focused documents ---
print("  - Building BM25 index...")
bm25_retriever = BM25Retriever.from_documents(
    documents=keyword_documents,
    preprocess_func=okt.morphs,  # tokenize with Okt
)
bm25_retriever.k = 5
print("    - BM25 retriever ready.")

# --- Hybrid: both retrievers, weighted equally ---
hybrid_members = [semantic_retriever, bm25_retriever]
hybrid_weights = [0.5, 0.5]
ensemble_retriever = EnsembleRetriever(
    retrievers=hybrid_members,
    weights=hybrid_weights,
)
print("Ensemble retriever ready!")

Initializing models and retrievers...


  embeddings = OllamaEmbeddings(model="nomic-embed-text", base_url="http://localhost:11434")


  - Loading Zilliz vector store...
    - Zilliz retriever ready.
  - Building BM25 index...
    - BM25 retriever ready.
Ensemble retriever ready!


## 4. RAG Chain Construction (LCEL)
Now we define the full RAG chain using LangChain Expression Language (LCEL).

In [5]:
# Prompt template (Korean). It tells the model to act as an audit expert, to
# answer only from what the supplied audit cases explicitly state, to reply
# '정보 없음' when the context is insufficient, and never to infer or imply.
# {context} and {question} are the two placeholders filled in by the chain.
prompt_template = """
당신은 감사 전문가입니다. 제공되는 '관련 감사 사례'에 명시적으로 언급된 내용만을 근거로 하여 사용자의 '질문'에 대해 답변해 주세요.
주어진 내용에 근거가 부족하면 '정보 없음'으로 답하세요. 절대로 내용을 추론하거나 암시해서는 안 됩니다.

[관련 감사 사례]
{context}

[질문]
{question}

[답변]
"""
prompt = ChatPromptTemplate.from_template(prompt_template)

# LLM configuration (Ollama - gemma3:latest), pointed at the local Ollama
# endpoint with temperature set to 0.
llm = Ollama(model="gemma3:latest", base_url="http://localhost:11434", temperature=0)

# Helper function to format retrieved documents
def format_docs(docs):
    """Render retrieved documents as one context string for the prompt.

    Each document becomes a '### 감사사례 (제목: <title>)' heading line followed
    by its page content on the next line; documents are separated by a blank
    line. A document without a 'title' metadata entry is labelled 'N/A'.

    Args:
        docs: Iterable of objects exposing ``metadata`` (a mapping) and
            ``page_content`` (text).

    Returns:
        The concatenated context string; empty when ``docs`` is empty.
    """
    # The newline after the heading keeps the title line from running
    # straight into the first line of the document body.
    return "\n\n".join(
        f"### 감사사례 (제목: {doc.metadata.get('title', 'N/A')})\n{doc.page_content}"
        for doc in docs
    )

# RAG chain (LCEL): the incoming question goes to the hybrid retriever, whose
# hits are rendered by format_docs into "context", and is also passed through
# unchanged as "question"; the result feeds the prompt, the LLM and finally a
# string output parser.
rag_inputs = {
    "context": ensemble_retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

print("RAG chain constructed successfully with Ollama (gemma3:latest).")

RAG chain constructed successfully with Ollama (gemma3:latest).


  llm = Ollama(model="gemma3:latest", base_url="http://localhost:11434", temperature=0)


## 5. Test Queries
**이 셀의 `test_query` 변수만 변경하고 이 셀만 반복적으로 실행하여 다양한 질문을 테스트할 수 있습니다.**

In [6]:
# Edit only this question and re-run the cell to try a different query.
test_query = "부실시공에 따라 재시공을 하도록 한 감사건도 있나?"

banner = f"--- Running RAG chain for query: '{test_query}' ---"
print(banner)

# Stream the answer: print each piece as soon as the chain yields it, without
# inserting anything between pieces.
for piece in rag_chain.stream(test_query):
    print(piece, end="", flush=True)

print("\n--- Execution Complete ---")

--- Running RAG chain for query: '부실시공에 따라 재시공을 하도록 한 감사건도 있나?' ---
네, 부실시공에 따라 재시공을 하도록 한 감사건이 있습니다.

감사원 감기간 중 실시한 전문가 자문 결과, 최대 균열 폭 1.4 ㎜의 균열이 지하차도 박스 및 옹벽 구간 전반에 걸쳐 발생했으며, 박스 구간의 포장 불량률이 재포장 기준 (10%) 을 초과 (14.3%, 옹벽 구간 8.4%) 한 것으로 나타났습니다. 이로 인해 재포장이 필요하게 되었습니다.

--- Execution Complete ---
