In [None]:
"""
FAISS VectorDB 테스트 스크립트
로컬에 저장된 index.faiss, index.pkl을 로드하여 쿼리 검색 테스트
"""
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from pathlib import Path


class PromptedBGE(HuggingFaceEmbeddings):
    """Embedding wrapper that prefixes every input with a retrieval prompt."""

    def embed_documents(self, texts):
        """Embed ``texts`` after prepending the document prompt to each one."""
        prompted_docs = []
        for t in texts:
            prompted_docs.append(f"Represent this document for retrieval: {t}")
        return super().embed_documents(prompted_docs)

    def embed_query(self, text):
        """Embed ``text`` after prepending the query prompt."""
        prompted_query = f"Represent this query for retrieval: {text}"
        return super().embed_query(prompted_query)


def load_vectorstore(vectordb_path="./vectorDB"):
    """Load a FAISS index saved on disk and return the vector store.

    Args:
        vectordb_path: Directory expected to contain ``index.faiss`` and
            ``index.pkl``.

    Returns:
        The store returned by ``FAISS.load_local``, using ``PromptedBGE``
        (model ``BAAI/bge-base-en``) as its embedding function.

    Raises:
        FileNotFoundError: If either index file is missing from
            ``vectordb_path``.
    """
    print(f"Loading FAISS index from {vectordb_path}...")

    # Fail fast with a clear message before the embedding model is
    # initialised, instead of surfacing a low-level loader error afterwards.
    index_dir = Path(vectordb_path)
    missing = [
        name
        for name in ("index.faiss", "index.pkl")
        if not (index_dir / name).is_file()
    ]
    if missing:
        raise FileNotFoundError(
            f"FAISS index files not found in {index_dir}: {', '.join(missing)}"
        )

    # NOTE(review): presumably the index was built with this same model and
    # the same prompts; verify against the indexing script.
    embeddings = PromptedBGE(model_name="BAAI/bge-base-en")

    # SECURITY: this flag permits pickle deserialization of the saved store,
    # which can execute arbitrary code. Only load index directories that you
    # created yourself or otherwise trust.
    vectorstore = FAISS.load_local(
        vectordb_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )

    print(f"   Total vectors: {vectorstore.index.ntotal}")

    return vectorstore


def test_queries(vectorstore, queries, k=5, preview_chars=300):
    """Run a similarity search for each query and print the hits.

    Args:
        vectorstore: Object exposing ``similarity_search(query, k=...)``
            that returns documents with ``metadata`` and ``page_content``.
        queries: Iterable of query strings.
        k: Number of results requested per query.
        preview_chars: Maximum number of content characters printed for
            each result.

    Returns:
        None. Results are written to stdout.
    """
    banner = "=" * 80

    for i, query in enumerate(queries, 1):
        print(f"\n{banner}")
        print(f"Query {i}: {query}")
        print(banner)

        # Top-k similarity search for this query.
        results = vectorstore.similarity_search(query, k=k)

        for j, doc in enumerate(results, 1):
            content = doc.page_content
            preview = content[:preview_chars]
            # Append the ellipsis only when text was actually cut off, so a
            # short document is not shown as if it had been truncated.
            if len(content) > preview_chars:
                preview += "..."

            print(f"\n[Result {j}]")
            print(f"Title: {doc.metadata.get('title', 'N/A')}")
            print(f"URL: {doc.metadata.get('url', 'N/A')}")
            print(f"Content preview: {preview}")
            print("-" * 80)


# The name starts with "test_", so pytest would try to collect this helper
# as a test case and fail on its arguments; opt out explicitly.
test_queries.__test__ = False


def test_with_scores(vectorstore, query, k=5, preview_chars=200):
    """Search for one query and print each hit together with its score.

    Args:
        vectorstore: Object exposing
            ``similarity_search_with_score(query, k=...)`` that returns
            ``(document, score)`` pairs.
        query: Query string.
        k: Number of results requested.
        preview_chars: Maximum number of content characters printed for
            each result.

    Returns:
        None. Results are written to stdout.
    """
    banner = "=" * 80
    print(f"\n{banner}")
    print(f"Query with scores: {query}")
    print(banner)

    # NOTE(review): whether a lower or higher score means "more similar"
    # depends on the store's metric; confirm before interpreting the values.
    results = vectorstore.similarity_search_with_score(query, k=k)

    for j, (doc, score) in enumerate(results, 1):
        content = doc.page_content
        preview = content[:preview_chars]
        # Append the ellipsis only when text was actually cut off, so a
        # short document is not shown as if it had been truncated.
        if len(content) > preview_chars:
            preview += "..."

        print(f"\n[Result {j}] Score: {score:.4f}")
        print(f"Title: {doc.metadata.get('title', 'N/A')}")
        print(f"URL: {doc.metadata.get('url', 'N/A')}")
        print(f"Content preview: {preview}")
        print("-" * 80)


# The name starts with "test_", so pytest would try to collect this helper
# as a test case and fail on its arguments; opt out explicitly.
test_with_scores.__test__ = False


if __name__ == "__main__":
    # Directory holding the saved index files.
    VECTORDB_PATH = "./vectorDB"

    # Load the index once and reuse it for every search below.
    vectorstore = load_vectorstore(VECTORDB_PATH)

    # Sample questions for the plain top-k search.
    test_queries_list = [
        "What is artificial intelligence?",
        "Who is Albert Einstein?",
        "How does photosynthesis work?",
        "What is the capital of France?",
        "Explain quantum mechanics",
    ]

    # Plain similarity search, five hits per query.
    test_queries(vectorstore, queries=test_queries_list, k=5)

    # Single query, printed together with its scores.
    test_with_scores(vectorstore, query="What is machine learning?", k=3)

    # Closing banner: a blank line followed by two separator rules.
    closing_rule = "=" * 80
    print(f"\n{closing_rule}")
    print(closing_rule)


Loading FAISS index from ./vectorDB...
   Total vectors: 1120486

Query 1: What is artificial intelligence?

[Result 1]
Title: Artificial intelligence (disambiguation)
URL: https://en.wikipedia.org/wiki?curid=22264262
Content preview: Artificial intelligence is the intelligence exhibited by machines and software.
Artificial intelligence may also refer to:...
--------------------------------------------------------------------------------

[Result 2]
Title: Glossary of artificial intelligence
URL: https://en.wikipedia.org/wiki?curid=50336055
Content preview: This glossary of artificial intelligence is a list of definitions of terms and concepts relevant to the study of artificial intelligence (AI), its subdisciplines, and related fields. Related glossaries include Glossary of computer science, Glossary of robotics, and Glossary of machine vision.
C.
 
 ...
--------------------------------------------------------------------------------

[Result 3]
Title: Query language
URL: https://en.w