# Prerequisites
본 notebook(`ipynb`)은 `Python 3.12` 환경에서 작성하였습니다. Package dependency 를 해결하기 위해 아래 cell 을 실행해주세요.

## Install Python packages

In [None]:
%pip -q install -U graphrag azure-ai-documentintelligence langchain langchain-community langchain-openai

## Load environment variables from a .env file
secret 노출을 피하고 notebook 들 간에 일관된 환경변수를 설정하기 위해 `dotenv` 를 이용합니다.

In [None]:
import os
from dotenv import load_dotenv

# Read key/value pairs from a local .env file into the process environment.
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)

# Azure OpenAI settings (used for embeddings).
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
# Azure AI Search settings (used for the vector store).
AZURE_AI_SEARCH_ENDPOINT = os.getenv("AZURE_AI_SEARCH_ENDPOINT")
AZURE_AI_SEARCH_ADMIN_KEY = os.getenv("AZURE_AI_SEARCH_ADMIN_KEY")
# Azure AI Document Intelligence settings (used to read the PDF).
AZURE_DOCUMENTINTELLIGENCE_ENDPOINT = os.getenv("AZURE_DOCUMENTINTELLIGENCE_ENDPOINT")
AZURE_DOCUMENTINTELLIGENCE_API_KEY = os.getenv("AZURE_DOCUMENTINTELLIGENCE_API_KEY")

# os.getenv() returns None for an unset variable, which would otherwise only
# show up later as an opaque error inside one of the Azure clients. Report the
# missing names right away, but only warn so cells that do not need them still run.
_missing_env = [
    name
    for name in (
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_API_KEY",
        "AZURE_OPENAI_EMBEDDING_DEPLOYMENT",
        "AZURE_OPENAI_API_VERSION",
        "AZURE_AI_SEARCH_ENDPOINT",
        "AZURE_AI_SEARCH_ADMIN_KEY",
        "AZURE_DOCUMENTINTELLIGENCE_ENDPOINT",
        "AZURE_DOCUMENTINTELLIGENCE_API_KEY",
    )
    if not os.getenv(name)
]
if _missing_env:
    import warnings

    warnings.warn(
        "Missing environment variables (check your .env file): " + ", ".join(_missing_env)
    )

# Load Dataset

In [None]:
from langchain_community.document_loaders.doc_intelligence import AzureAIDocumentIntelligenceLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Read the constitution PDF through Azure AI Document Intelligence using the "prebuilt-read" model.
loader = AzureAIDocumentIntelligenceLoader(
    file_path="./resources/대한민국 헌법.pdf",
    api_model="prebuilt-read",
    api_endpoint=AZURE_DOCUMENTINTELLIGENCE_ENDPOINT,
    api_key=AZURE_DOCUMENTINTELLIGENCE_API_KEY,
)
docs = loader.load()

# Split the loaded documents into chunks (chunk_size=1000, no overlap between chunks).
splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
chunks = splitter.split_documents(docs)

# Knowledge Graph

In [None]:
# Run the commands below in a terminal (not in this notebook) before indexing:
# the first creates the GraphRAG project under ./graphrag, the second copies the
# prepared settings.yaml over the generated one.
# NOTE(review): recent GraphRAG releases spell project setup as `graphrag init --root <dir>`;
# confirm the exact syntax against the installed version.
# graphrag init --root ./graphrag
# cp settings.yaml ./graphrag/settings.yaml

In [None]:
from pathlib import Path

# Write every chunk's text, one after another and newline-terminated, into the
# GraphRAG input directory (created first if it does not exist yet).
input_dir = Path("graphrag/input")
input_dir.mkdir(parents=True, exist_ok=True)
with (input_dir / "chunks.txt").open("w", encoding="utf-8") as out:
    out.writelines(piece.page_content + "\n" for piece in chunks)

In [None]:
import pandas as pd
from pathlib import Path
import graphrag.api as api
from graphrag.config.load_config import load_config
from graphrag.index.typing.pipeline_run_result import PipelineRunResult

root_dir = Path("./graphrag")

# 1) settings.yaml + .env 로부터 GraphRAG 설정 로드
config = load_config(root_dir)

# 2) 인덱싱 파이프라인 실행
index_result: list[PipelineRunResult] = await api.build_index(config=config)

# 3) 워크플로우별 성공/실패 출력
for workflow_result in index_result:
    status = "success" if not workflow_result.errors else f"error\n{workflow_result.errors}"
    print(f"Workflow: {workflow_result.workflow}\tStatus: {status}")

# 4) 인덱싱 결과인 parquet 파일 로드 (q&a에서 사용)
entities = pd.read_parquet(root_dir / "output" / "entities.parquet")
communities = pd.read_parquet(root_dir / "output" / "communities.parquet")
community_reports = pd.read_parquet(root_dir / "output" / "community_reports.parquet")

print("entities.head():")
print(entities.head())

In [None]:
import pandas as pd
from pathlib import Path

import graphrag.api as api
from graphrag.config.load_config import load_config

PROJECT_DIR = Path("./graphrag")
config = load_config(PROJECT_DIR)

# 인덱싱 후 생성된 parquet 로드
entities = pd.read_parquet(PROJECT_DIR / "output" / "entities.parquet")
communities = pd.read_parquet(PROJECT_DIR / "output" / "communities.parquet")
community_reports = pd.read_parquet(PROJECT_DIR / "output" / "community_reports.parquet")

question = "대통령은 누구이고, 어떤 책임과 의무를 가져 ?"

# ---- Global Search ----
response, context = await api.global_search(
    config=config,
    entities=entities,
    communities=communities,
    community_reports=community_reports,
    community_level=2,                # 커뮤니티 계층 (보통 1~3)
    dynamic_community_selection=False, # true 로 하면 질문에 맞게 커뮤니티 자동 선택
    response_type="Multiple Paragraphs",
    query=question,
)
print("=== Answer ===")
print(response)
print("\n=== Context (debug) ===")
print(context)


# Knowledge Store

In [None]:
# Metadata values are stored as fields of the indexed document, so drop the
# metadata keys that would clash with the document's own fields.
# "content" is the clashing key; "content_vector" and "id" are removed as a precaution.
for document in chunks:
    for reserved_key in ("content", "content_vector", "id"):
        document.metadata.pop(reserved_key, None)

In [None]:
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores.azuresearch import AzureSearch

index_name = "constitution"

# Azure OpenAI embedding client, configured entirely from the environment variables loaded earlier.
emb = AzureOpenAIEmbeddings(
    azure_deployment=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    openai_api_version=AZURE_OPENAI_API_VERSION,
)

# When no index with this name exists yet, the schema is inferred and the index is created automatically.
vs = AzureSearch(
    index_name=index_name,
    embedding_function=emb.embed_query,
    azure_search_endpoint=AZURE_AI_SEARCH_ENDPOINT,
    azure_search_key=AZURE_AI_SEARCH_ADMIN_KEY,
)
# Upload the chunks; left as a bare expression so the cell still displays the returned value.
vs.add_documents(documents=chunks)

In [None]:
# Fetch the 3 chunks most similar to the question from the vector store
docs = vs.similarity_search(query=question, k=3, search_type="similarity")

# Show each hit under a 1-based header
for rank, hit in enumerate(docs, start=1):
    print(f"\n=== Document {rank} ===", hit.page_content, sep="\n")