In [1]:

# imports
import os
import sys
import types
import json

# figure size/format
fig_width, fig_height = 7, 5
fig_format = 'retina'
fig_dpi = 96
interactivity = ''
is_shiny = False
is_dashboard = False
plotly_connected = True

# matplotlib defaults / format
# Best effort: any failure (matplotlib or IPython missing, unsupported
# format) is ignored so the rest of the setup still runs.
try:
  import matplotlib.pyplot as plt
  plt.rcParams['figure.figsize'] = (fig_width, fig_height)
  for dpi_key in ('figure.dpi', 'savefig.dpi'):
    plt.rcParams[dpi_key] = fig_dpi
  from IPython.display import set_matplotlib_formats
  set_matplotlib_formats(fig_format)
except Exception:
  pass

# plotly: pick the renderer and give every registered template the same
# margins (30 on top, 0 elsewhere). Also best effort.
try:
  import plotly.io as pio
  pio.renderers.default = "notebook_connected" if plotly_connected else "notebook"
  for template in pio.templates.keys():
    pio.templates[template].layout.margin = {"t": 30, "r": 0, "b": 0, "l": 0}
except Exception:
  pass

# disable itables paging for dashboards
# Everything below only runs when is_dashboard is true. Both the itables and
# the altair sections are best effort: any exception is swallowed.
if is_dashboard:
  try:
    from itables import options
    options.dom = 'fiBrtlp'
    options.maxBytes = 1024 * 1024
    options.language = dict(info = "Showing _TOTAL_ entries")
    options.classes = "display nowrap compact"
    options.paging = False
    options.searching = True
    options.ordering = True
    options.info = True
    options.lengthChange = False
    options.autoWidth = False
    options.responsive = True
    options.keys = True
    options.buttons = []
  except Exception:
    pass
  
  try:
    import altair as alt
    # By default, dashboards will have container sized
    # vega visualizations which allows them to flow reasonably
    theme_sentinel = '_quarto-dashboard-internal'
    def make_theme(name):
        """Return a theme callable wrapping the theme registered as `name`.

        The wrapped theme is looked up once, here, from altair's private
        `_plugins` registry; the returned callable fills in defaults on top
        of whatever that theme produces.
        """
        nonTheme = alt.themes._plugins[name]    
        def patch_theme(*args, **kwargs):
            """Call the wrapped theme and add defaults for keys it leaves unset.

            Arguments are accepted but not forwarded. Only keys that are
            absent are written, so values set by the wrapped theme win.
            """
            existingTheme = nonTheme()
            if 'height' not in existingTheme:
              existingTheme['height'] = 'container'
            if 'width' not in existingTheme:
              existingTheme['width'] = 'container'

            if 'config' not in existingTheme:
              existingTheme['config'] = dict()
            
            # Configure the default font sizes
            title_font_size = 15
            header_font_size = 13
            axis_font_size = 12
            legend_font_size = 12
            mark_font_size = 12
            # With tooltip False the "Mark tooltips" step below never fires.
            tooltip = False

            config = existingTheme['config']

            # The Axis
            if 'axis' not in config:
              config['axis'] = dict()
            axis = config['axis']
            if 'labelFontSize' not in axis:
              axis['labelFontSize'] = axis_font_size
            if 'titleFontSize' not in axis:
              axis['titleFontSize'] = axis_font_size  

            # The legend
            if 'legend' not in config:
              config['legend'] = dict()
            legend = config['legend']
            if 'labelFontSize' not in legend:
              legend['labelFontSize'] = legend_font_size
            if 'titleFontSize' not in legend:
              legend['titleFontSize'] = legend_font_size  

            # The header
            if 'header' not in config:
              config['header'] = dict()
            header = config['header']
            if 'labelFontSize' not in header:
              header['labelFontSize'] = header_font_size
            if 'titleFontSize' not in header:
              header['titleFontSize'] = header_font_size    

            # Title
            if 'title' not in config:
              config['title'] = dict()
            title = config['title']
            if 'fontSize' not in title:
              title['fontSize'] = title_font_size

            # Marks
            if 'mark' not in config:
              config['mark'] = dict()
            mark = config['mark']
            if 'fontSize' not in mark:
              mark['fontSize'] = mark_font_size

            # Mark tooltips
            if tooltip and 'tooltip' not in mark:
              mark['tooltip'] = dict(content="encoding")

            return existingTheme
            
        return patch_theme

    # We can only do this once per session
    # (the sentinel theme name doubles as the "already patched" marker)
    if theme_sentinel not in alt.themes.names():
      # Re-register every known theme under its own name, wrapped by make_theme.
      for name in alt.themes.names():
        alt.themes.register(name, make_theme(name))
      
      # register a sentinel theme so we only do this once
      alt.themes.register(theme_sentinel, make_theme('default'))
      alt.themes.enable('default')

  except Exception:
    pass

# enable pandas latex repr when targeting pdfs
# pandas is imported whenever it is installed (not only for pdf output), so
# it is already in sys.modules when the dependency scan below runs.
try:
  import pandas as pd
  if fig_format == 'pdf':
    pd.set_option('display.latex.repr', True)
except Exception:
  pass

# interactivity
# Only applied when `interactivity` is a non-empty string.
if interactivity:
  from IPython.core.interactiveshell import InteractiveShell
  InteractiveShell.ast_node_interactivity = interactivity

# NOTE: the kernel_deps code is repeated in the cleanup.py file
# (we can't easily share this code b/c of the way it is run).
# If you edit this code also edit the same code in cleanup.py!

# output kernel dependencies
# Maps the source path of every loaded module that exists on disk to its
# modification time, then prints the mapping as JSON on stdout.
kernel_deps = dict()
for module in list(sys.modules.values()):
  # Some modules play games with sys.modules (e.g. email/__init__.py
  # in the standard library), and occasionally this can cause strange
  # failures in getattr.  Just ignore anything that's not an ordinary
  # module.
  if not isinstance(module, types.ModuleType):
    continue
  path = getattr(module, "__file__", None)
  # Modules without a file path are skipped.
  if not path:
    continue
  # Dropping the last character turns "x.pyc" / "x.pyo" into "x.py".
  if path.endswith(".pyc") or path.endswith(".pyo"):
    path = path[:-1]
  if not os.path.exists(path):
    continue
  kernel_deps[path] = os.stat(path).st_mtime
print(json.dumps(kernel_deps))

# set run_path if requested
if r'C:\Users\kmkim\Desktop\projects\blog\docs\blog\posts\Agent\09-VectorStore':
  os.chdir(r'C:\Users\kmkim\Desktop\projects\blog\docs\blog\posts\Agent\09-VectorStore')

# reset state
%reset

# shiny
# The shiny check is the literal False rather than a variable: this code sits
# after the %reset, and no extra name should be left in the global scope.
if False:
  try:
    import htmltools as _htmltools
    import ast as _ast

    _htmltools.html_dependency_render_mode = "json"

    # Decorator that gets added to every function definition: displays the
    # object's HTML representation when it has one, then returns the object.
    def _display_if_has_repr_html(x):
      try:
        # IPython 7.14 preferred import
        from IPython.display import display, HTML
      except:
        from IPython.core.display import display, HTML

      if not hasattr(x, '_repr_html_'):
        return x
      display(HTML(x._repr_html_()))
      return x

    # Ideally the ast_transformers.append call at the end of this block would
    # be undone whenever an error occurs. It is left in place for now: it can
    # only matter if the user switches from shiny to not-shiny mode (and even
    # then it likely won't).
    import builtins
    builtins._display_if_has_repr_html = _display_if_has_repr_html

    class _FunctionDefReprHtml(_ast.NodeTransformer):
      # Shared by both visitors: put the display decorator first in the list.
      def _prepend_display_decorator(self, node):
        decorator = _ast.Name(id="_display_if_has_repr_html", ctx=_ast.Load())
        node.decorator_list.insert(0, decorator)
        return node

      def visit_FunctionDef(self, node):
        return self._prepend_display_decorator(node)

      def visit_AsyncFunctionDef(self, node):
        return self._prepend_display_decorator(node)

    ip = get_ipython()
    ip.ast_transformers.append(_FunctionDefReprHtml())

  except:
    pass

def ojs_define(**kwargs):
  """Publish the keyword arguments as an ``ojs-define`` script element.

  Every keyword becomes one ``{"name": ..., "value": ...}`` entry under the
  ``contents`` key of a JSON payload. The payload is wrapped in
  ``<script type="ojs-define">`` and shown through IPython's ``display`` with
  the metadata ``{"ojs_define": True}``.

  Values that are pandas Series or DataFrames (subclasses included) are
  converted to a dict mapping each column label to its list of values. All
  other values are passed to ``json.dumps`` unchanged, so they must be JSON
  serialisable. When pandas is not installed no conversion is attempted.
  """
  import json
  try:
    # IPython 7.14 preferred import
    from IPython.display import display, HTML
  except ImportError:
    # Only a failed import should trigger the legacy location; anything else
    # is a real error and is allowed to surface.
    from IPython.core.display import display, HTML

  # do some minor magic for convenience when handling pandas
  # dataframes
  def convert(value):
    try:
      import pandas as pd
    except ModuleNotFoundError: # don't do the magic when pandas is not available
      return value
    # isinstance (rather than an exact type match) so that Series/DataFrame
    # subclasses are converted too instead of failing later in json.dumps.
    if isinstance(value, pd.Series):
      value = pd.DataFrame(value)
    if isinstance(value, pd.DataFrame):
      # Transposed + orient='split': "index" holds the column labels and
      # "data" holds one list of values per column.
      split = json.loads(value.T.to_json(orient='split'))
      return dict(zip(split["index"], split["data"]))
    return value

  payload = dict(contents=[dict(name=key, value=convert(value)) for (key, value) in kwargs.items()])
  display(HTML('<script type="ojs-define">' + json.dumps(payload) + '</script>'), metadata=dict(ojs_define = True))
globals()["ojs_define"] = ojs_define


In [2]:
# Azure AI Search SDK 설치
# !pip install azure-search-documents
# !pip install azure-identity
# !pip install langchain-community
# !pip install langchain-openai

In [3]:
from dotenv import load_dotenv
import os

load_dotenv()

# Service endpoints and keys, read from the environment.
# os.getenv returns None for a variable that is not set.
AZURE_SEARCH_ENDPOINT = os.getenv("AZURE_SEARCH_ENDPOINT")
AZURE_SEARCH_API_KEY = os.getenv("AZURE_SEARCH_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")

# Report unset (or empty) variables right here, where the cause is obvious,
# instead of letting a None value fail later inside a client constructor.
# This only warns: a key can legitimately be absent, e.g. when a token
# credential is used instead of an API key.
_missing_env = [
    name
    for name in (
        "AZURE_SEARCH_ENDPOINT",
        "AZURE_SEARCH_API_KEY",
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_API_KEY",
    )
    if not os.getenv(name)
]
if _missing_env:
    print(f"경고: 설정되지 않은 환경 변수: {', '.join(_missing_env)}")

In [4]:
from langchain_openai import AzureOpenAIEmbeddings

# Configure the Azure OpenAI embedding model.
# Endpoint and key come from the AZURE_OPENAI_* values read from the
# environment earlier in the notebook.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment="text-embedding-ada-002",
    openai_api_version="2024-02-01",
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY
)

# Smoke test: embed one sentence and print the length of the returned vector.
test_embedding = embeddings.embed_query("테스트 문장")
print(f"임베딩 차원: {len(test_embedding)}")

In [5]:
from langchain_community.vectorstores.azuresearch import AzureSearch

# Initialise the Azure Search vector store with API-key authentication.
# It is bound to the index "langchain-vector-demo" and uses the embedding
# model's embed_query method as its embedding function.
vector_store = AzureSearch(
    azure_search_endpoint=AZURE_SEARCH_ENDPOINT,
    azure_search_key=AZURE_SEARCH_API_KEY,
    index_name="langchain-vector-demo",
    embedding_function=embeddings.embed_query
)

print("Azure AI Search VectorStore 초기화 완료")

In [6]:
from langchain_core.documents import Document

# Build the sample documents.
# Each one carries "source", "category" and "year" metadata; later cells
# filter and print by "category".
documents = [
    Document(
        page_content="Azure AI Search는 Microsoft Azure의 관리형 검색 서비스다.",
        metadata={"source": "azure-docs", "category": "search", "year": 2024}
    ),
    Document(
        page_content="벡터 검색과 전문 검색을 결합한 하이브리드 검색을 지원한다.",
        metadata={"source": "azure-docs", "category": "features", "year": 2024}
    ),
    Document(
        page_content="Azure OpenAI Service와 통합하여 RAG 시스템을 구축할 수 있다.",
        metadata={"source": "azure-docs", "category": "ai", "year": 2024}
    ),
    Document(
        page_content="엔터프라이즈급 보안과 규정 준수 기능을 제공한다.",
        metadata={"source": "azure-docs", "category": "security", "year": 2024}
    ),
    Document(
        page_content="자동 스케일링과 고가용성을 지원하는 클라우드 네이티브 서비스다.",
        metadata={"source": "azure-docs", "category": "infrastructure", "year": 2024}
    )
]

# Add the documents to the index and report the returned IDs.
# NOTE(review): no explicit IDs are passed, so re-running this cell presumably
# indexes the same five documents again — confirm.
ids = vector_store.add_documents(documents)
print(f"{len(documents)}개 문서가 인덱스에 추가되었다.")
print(f"문서 IDs: {ids}")

In [7]:
# Basic similarity search: ask for the 3 closest documents to the query.
query = "Azure에서 RAG 시스템을 만드는 방법은?"
results = vector_store.similarity_search(query, k=3)

# Print each hit, numbered from 1, with its content and metadata.
print(f"\n질문: {query}\n")
for i, doc in enumerate(results, 1):
    print(f"[결과 {i}]")
    print(f"내용: {doc.page_content}")
    print(f"메타데이터: {doc.metadata}\n")

In [8]:
# Similarity search that also returns a score per hit.
# Reuses `query` as it was last assigned in an earlier cell.
results_with_score = vector_store.similarity_search_with_score(query, k=3)

# Each result is unpacked as a (document, score) pair; 'N/A' is printed when
# a document has no "category" metadata.
print(f"질문: {query}\n")
for doc, score in results_with_score:
    print(f"점수: {score:.4f}")
    print(f"내용: {doc.page_content}")
    print(f"카테고리: {doc.metadata.get('category', 'N/A')}\n")

In [9]:
from azure.search.documents.models import VectorizedQuery

# Hybrid search (vector + keyword) through vector_store.hybrid_search.
# NOTE(review): VectorizedQuery is imported above but not used in this cell.
query = "Azure 보안 기능"
results = vector_store.hybrid_search(query=query, k=5)

# One numbered entry per hit: the content, then its category on an indented
# line, followed by a blank line.
print("하이브리드 검색 결과:\n")
for i, doc in enumerate(results, 1):
    print(f"[{i}] {doc.page_content}\n    카테고리: {doc.metadata.get('category')}\n")

In [10]:
# Compare two weightings of the same hybrid query.
# The original notes describe alpha as: 1.0 = vector search only,
# 0.0 = keyword search only, 0.5 = even split (default).
# NOTE(review): `alpha` is simply forwarded to vector_store.hybrid_search as
# an extra keyword argument; confirm that this vector store honours it.

query = "검색 서비스"


def _run_hybrid(heading, alpha):
    """Print `heading`, run the hybrid search at `alpha`, list the hits.

    Each hit is shown as the first 50 characters of its content followed by
    "...". Returns the list of hits.
    """
    print(heading)
    hits = vector_store.hybrid_search(query, k=3, alpha=alpha)
    for hit in hits:
        print(f"- {hit.page_content[:50]}...")
    return hits


# Vector-leaning run
results_vector = _run_hybrid("=== 벡터 검색 위주 (alpha=0.8) ===", 0.8)

# Keyword-leaning run
results_keyword = _run_hybrid("\n=== 키워드 검색 위주 (alpha=0.2) ===", 0.2)

In [11]:
# Restrict the search to documents of specific categories.
# The filter string keeps documents whose category is 'features' or 'ai'.
# NOTE(review): assumes `category` is a filterable field of the index that
# vector_store is bound to — confirm against the index schema.
query = "Azure 기능"
results = vector_store.similarity_search(
    query=query,
    k=5,
    filters="category eq 'features' or category eq 'ai'"
)

# doc.metadata['category'] is indexed directly, so a hit without that key
# raises KeyError here.
print("필터링된 검색 결과:")
for doc in results:
    print(f"- {doc.page_content}")
    print(f"  카테고리: {doc.metadata['category']}\n")

In [12]:
# Compound filter: both conditions must hold (category 'search', year 2024).
# NOTE(review): assumes `category` and `year` are filterable fields of the
# index — confirm against the index schema.
compound_filter = "category eq 'search' and year eq 2024"
results = vector_store.similarity_search(query="Azure", k=10, filters=compound_filter)

# Report the condition that was applied and how many documents matched.
print("검색 조건: category='search' AND year=2024")
print(f"결과 수: {len(results)}")

In [13]:
from langchain_openai import AzureChatOpenAI
from langchain.chains import RetrievalQA

# Configure the Azure OpenAI chat model (deployment "gpt-4", temperature 0).
llm = AzureChatOpenAI(
    azure_deployment="gpt-4",
    openai_api_version="2024-02-01",
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    temperature=0
)

# Build the retriever: top 3 documents per question.
# NOTE(review): the original comment here said "uses hybrid search", but the
# code passes search_type="similarity" — confirm which one is intended.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)

# Build the RAG chain; return_source_documents=True makes the retrieved
# documents part of the chain output (read as "source_documents" later).
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

print("RAG 시스템 준비 완료")

In [14]:
# Ask one question through the RAG chain.
question = "Azure AI Search의 주요 특징은 무엇인가?"
result = qa_chain.invoke({"query": question})

# The answer is read from result['result'] and the retrieved documents from
# result["source_documents"].
print(f"질문: {question}\n")
print(f"답변:\n{result['result']}\n")
print("참조 문서:")
for i, doc in enumerate(result["source_documents"], 1):
    print(f"[{i}] {doc.page_content}")
    print(f"    출처: {doc.metadata.get('source')}, 카테고리: {doc.metadata.get('category')}\n")

In [15]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

# Conversation memory, stored under "chat_history".
# output_key="answer" names which chain output the memory records; the chain
# below also returns source documents.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer"
)

# Conversational RAG chain, reusing the llm and retriever defined earlier.
conversational_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True
)

# Questions asked one after another through the same chain and memory.
questions = [
    "Azure AI Search가 무엇인가?",
    "하이브리드 검색이란?",
    "Azure와 어떻게 통합되나?"
]

# Note: the loop rebinds the notebook-level names `question` and `result`.
for question in questions:
    result = conversational_chain.invoke({"question": question})
    print(f"Q: {question}")
    print(f"A: {result['answer']}\n")

In [16]:
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    HnswParameters,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch
)

# Field definitions for the index.
fields = [
    SearchField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True
    ),
    SearchField(
        name="content",
        type=SearchFieldDataType.String,
        searchable=True,
        analyzer_name="ko.microsoft"  # Korean language analyzer
    ),
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,  # marked searchable so it can be used in vector queries
        vector_search_dimensions=1536,  # Ada-002 embedding size
        vector_search_profile_name="myHnswProfile"  # must match the profile below
    ),
    SearchField(
        name="category",
        type=SearchFieldDataType.String,
        filterable=True,
        facetable=True
    ),
    SearchField(
        name="year",
        type=SearchFieldDataType.Int32,
        filterable=True,
        sortable=True
    )
]

# Vector search configuration (HNSW algorithm).
# The tuning values go through the SDK's HnswParameters model, which uses
# snake_case names, rather than a raw dict with REST-style camelCase keys.
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw",
            parameters=HnswParameters(
                m=4,  # number of links per node
                ef_construction=400,  # search depth while building the graph
                ef_search=500,  # search depth at query time
                metric="cosine"  # similarity metric
            )
        )
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw"
        )
    ]
)

# Semantic search configuration.
# prioritized_fields takes a SemanticPrioritizedFields container; SemanticField
# only names a single field and has no title/content/keywords arguments.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=None,
        content_fields=[SemanticField(field_name="content")],
        keywords_fields=[SemanticField(field_name="category")]
    )
)

semantic_search = SemanticSearch(configurations=[semantic_config])

In [17]:
from azure.search.documents.indexes import SearchIndexClient
from azure.core.credentials import AzureKeyCredential

# Index management client, authenticated with the search API key.
index_client = SearchIndexClient(
    endpoint=AZURE_SEARCH_ENDPOINT,
    credential=AzureKeyCredential(AZURE_SEARCH_API_KEY)
)

# Index definition built from the fields, vector search and semantic search
# settings of the previous cell.
index = SearchIndex(
    name="advanced-index",
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search
)

# create_or_update_index is used, so the printed "created" message also
# appears when an existing index was updated.
result = index_client.create_or_update_index(index)
print(f"인덱스 '{result.name}' 생성 완료")

In [18]:
# List every index and print its name.
indexes = index_client.list_indexes()
print("현재 인덱스 목록:")
for idx in indexes:
    print(f"- {idx.name}")

# Fetch one index by name and print its name and number of fields.
index_info = index_client.get_index("advanced-index")
print(f"\n인덱스 정보:")
print(f"이름: {index_info.name}")
print(f"필드 수: {len(index_info.fields)}")

# Deleting the index (left disabled).
# index_client.delete_index("advanced-index")
# print("인덱스 삭제 완료")

In [19]:
from azure.search.documents.models import QueryType

# Run a semantic query: query_type and semantic_configuration_name are passed
# to similarity_search as extra keyword arguments.
# NOTE(review): "my-semantic-config" is defined in this notebook for the index
# "advanced-index", while vector_store was created on "langchain-vector-demo"
# — confirm the queried index actually has this configuration.
results = vector_store.similarity_search(
    query="클라우드 기반 검색 솔루션의 보안 기능",
    k=5,
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name="my-semantic-config"
)

# Numbered list of hits with their category (None when the key is absent).
print("시맨틱 검색 결과:")
for i, doc in enumerate(results, 1):
    print(f"\n[{i}] {doc.page_content}")
    print(f"카테고리: {doc.metadata.get('category')}")

In [20]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

# Search client bound to the index that receives the uploads.
search_client = SearchClient(
    endpoint=AZURE_SEARCH_ENDPOINT,
    index_name="langchain-vector-demo",
    credential=AzureKeyCredential(AZURE_SEARCH_API_KEY)
)

# Documents to upload. No earlier cell defines this list, so the cell used to
# stop with NameError on a fresh run. It starts as an empty placeholder: fill
# it with dicts that match the target index schema before running.
large_document_list = []

# Bulk upload in fixed-size batches.
batch_size = 100

for start in range(0, len(large_document_list), batch_size):
    documents_batch = large_document_list[start:start + batch_size]
    result = search_client.upload_documents(documents=documents_batch)

    # upload_documents returns one result per document; collect the keys of
    # the documents the service did not accept instead of assuming success.
    failed_keys = [item.key for item in result if not item.succeeded]
    print(f"{len(documents_batch) - len(failed_keys)}개 문서 업로드 완료")
    if failed_keys:
        print(f"{len(failed_keys)}개 문서 업로드 실패: {failed_keys}")

In [21]:
# Automatic indexing through an indexer definition.
from azure.search.documents.indexes.models import (
    SearchIndexer,
    IndexingSchedule,
    FieldMapping
)

# Indexer that connects a data source (e.g. Blob Storage) to a target index.
# NOTE(review): "my-datasource" and "my-index" are not created anywhere in
# this notebook — presumably placeholders; confirm before enabling the calls
# at the end of the cell.
indexer = SearchIndexer(
    name="my-indexer",
    data_source_name="my-datasource",
    target_index_name="my-index",
    schedule=IndexingSchedule(interval="PT2H"),  # every 2 hours
    field_mappings=[
        FieldMapping(source_field_name="content", target_field_name="content")
    ]
)

# Creating and running the indexer (left disabled).
# NOTE(review): `indexer_client` is not defined in this notebook.
# indexer_client.create_or_update_indexer(indexer)
# indexer_client.run_indexer("my-indexer")

In [22]:
from azure.identity import DefaultAzureCredential

# Authenticate with a token credential instead of an API key (recommended).
credential = DefaultAzureCredential()

# AzureSearch has no `azure_search_credential` argument. `azure_search_key`
# is a required argument, so it is passed explicitly as None (no API key),
# and the token credential goes in through `azure_credential`.
# Note: this rebinds the notebook-level `vector_store` to "secure-index", so
# later cells query that index.
vector_store = AzureSearch(
    azure_search_endpoint=AZURE_SEARCH_ENDPOINT,
    azure_search_key=None,
    index_name="secure-index",
    embedding_function=embeddings.embed_query,
    azure_credential=credential
)

print("Managed Identity로 인증 완료")

In [23]:
def analyze_search_quality(test_queries, expected_results, k=10, precision_k=5, store=None):
    """Compute average precision@k, recall and MRR over a set of test queries.

    Args:
        test_queries: Query strings to run.
        expected_results: For each query, the collection of relevant document
            IDs. Queries and expectations are paired positionally; if the two
            sequences differ in length the extra items are ignored.
        k: Number of results requested per query (default 10).
        precision_k: Cutoff for precision: relevant IDs among the first
            `precision_k` results divided by `precision_k` (default 5).
        store: Object with a `similarity_search(query, k=...)` method whose
            results expose `metadata.get("id")`. Defaults to the
            notebook-level `vector_store`.

    Returns:
        Dict with "avg_precision", "avg_recall" and "avg_mrr". All three are
        0.0 when there are no queries to evaluate.

    Raises:
        ValueError: If `k` or `precision_k` is smaller than 1.
    """
    if k < 1 or precision_k < 1:
        raise ValueError("k and precision_k must be at least 1")

    precisions, recalls, reciprocal_ranks = [], [], []

    for query, expected in zip(test_queries, expected_results):
        # The global is looked up only when no store was passed in.
        active_store = vector_store if store is None else store
        # Duplicates in `expected` must not inflate the recall denominator.
        relevant = set(expected)

        results = active_store.similarity_search(query, k=k)
        result_ids = [doc.metadata.get("id") for doc in results]

        # Precision@precision_k: the denominator is the cutoff itself, even
        # when fewer results came back.
        precisions.append(len(set(result_ids[:precision_k]) & relevant) / precision_k)

        # Recall over everything that was retrieved; 0 when nothing is
        # expected for this query.
        recalls.append(len(set(result_ids) & relevant) / len(relevant) if relevant else 0)

        # Reciprocal rank of the first relevant hit (ranks start at 1), or 0
        # when no relevant document was retrieved.
        reciprocal_ranks.append(
            next((1 / rank for rank, doc_id in enumerate(result_ids, 1) if doc_id in relevant), 0)
        )

    def _mean(values):
        # An empty evaluation set averages to 0.0 instead of dividing by zero.
        return sum(values) / len(values) if values else 0.0

    return {
        "avg_precision": _mean(precisions),
        "avg_recall": _mean(recalls),
        "avg_mrr": _mean(reciprocal_ranks)
    }

# 사용 예시
# test_queries = ["Azure 검색", "RAG 시스템", "보안 기능"]
# expected = [["doc1", "doc2"], ["doc3", "doc4"], ["doc5"]]
# quality_metrics = analyze_search_quality(test_queries, expected)