In [1]:

# imports
import os
import sys
import types
import json

# figure size/format and the rendering flags read by the setup code that follows
fig_width, fig_height = 7, 5
fig_format = 'retina'
fig_dpi = 96
interactivity = ''
is_shiny = False
is_dashboard = False
plotly_connected = True

# matplotlib defaults / format
# (best effort: any failure, e.g. matplotlib or IPython missing, is ignored)
try:
  import matplotlib.pyplot as plt
  plt.rcParams.update({
    'figure.figsize': (fig_width, fig_height),
    'figure.dpi': fig_dpi,
    'savefig.dpi': fig_dpi,
  })
  from IPython.display import set_matplotlib_formats
  set_matplotlib_formats(fig_format)
except Exception:
  pass

# plotly: choose the notebook renderer and give every template the same margins
# (best effort: any failure, e.g. plotly missing, is ignored)
try:
  import plotly.io as pio
  pio.renderers.default = "notebook_connected" if plotly_connected else "notebook"
  for template in pio.templates.keys():
    pio.templates[template].layout.margin = dict(t=30, r=0, b=0, l=0)
except Exception:
  pass

# dashboard-only defaults for itables and altair
if is_dashboard:
  # itables: paging off, searching/ordering/info on (best effort)
  try:
    from itables import options
    options.dom = 'fiBrtlp'
    options.maxBytes = 1024 * 1024
    options.language = dict(info = "Showing _TOTAL_ entries")
    options.classes = "display nowrap compact"
    options.paging = False
    options.searching = True
    options.ordering = True
    options.info = True
    options.lengthChange = False
    options.autoWidth = False
    options.responsive = True
    options.keys = True
    options.buttons = []
  except Exception:
    pass

  # altair: wrap every registered theme so unset sizes and font sizes get
  # dashboard defaults (best effort)
  try:
    import altair as alt
    # By default, dashboards will have container sized
    # vega visualizations which allows them to flow reasonably
    theme_sentinel = '_quarto-dashboard-internal'

    def make_theme(name):
        """Return a theme function that fills gaps left by the theme `name`."""
        base_theme = alt.themes._plugins[name]

        def patch_theme(*args, **kwargs):
            """Build the base theme, then set only the keys it did not set."""
            theme = base_theme()
            theme.setdefault('height', 'container')
            theme.setdefault('width', 'container')
            config = theme.setdefault('config', dict())

            # Configure the default font sizes
            title_font_size = 15
            header_font_size = 13
            axis_font_size = 12
            legend_font_size = 12
            mark_font_size = 12
            tooltip = False

            # (config section, defaults applied when the key is absent),
            # visited in this order: axis, legend, header, title, mark
            section_defaults = (
                ('axis', (('labelFontSize', axis_font_size),
                          ('titleFontSize', axis_font_size))),
                ('legend', (('labelFontSize', legend_font_size),
                            ('titleFontSize', legend_font_size))),
                ('header', (('labelFontSize', header_font_size),
                            ('titleFontSize', header_font_size))),
                ('title', (('fontSize', title_font_size),)),
                ('mark', (('fontSize', mark_font_size),)),
            )
            for section_name, defaults in section_defaults:
                section = config.setdefault(section_name, dict())
                for key, value in defaults:
                    section.setdefault(key, value)

            # Mark tooltips (off: `tooltip` is False)
            mark = config['mark']
            if tooltip and 'tooltip' not in mark:
                mark['tooltip'] = dict(content="encoding")

            return theme

        return patch_theme

    # We can only do this once per session; the sentinel theme records that
    # the wrapping has already happened.
    if theme_sentinel not in alt.themes.names():
      for name in alt.themes.names():
        alt.themes.register(name, make_theme(name))

      alt.themes.register(theme_sentinel, make_theme('default'))
      alt.themes.enable('default')

  except Exception:
    pass

# enable pandas latex repr when targeting pdfs
# pandas is imported whatever the figure format is; the option is only set
# when fig_format is 'pdf'. Any failure (including pandas being absent) is
# ignored.
try:
  import pandas as pd
  if fig_format == 'pdf':
    pd.set_option('display.latex.repr', True)
except Exception:
  pass

# interactivity
# Only when `interactivity` is a non-empty string: assign it to the
# class-level InteractiveShell.ast_node_interactivity attribute.
if interactivity:
  from IPython.core.interactiveshell import InteractiveShell
  InteractiveShell.ast_node_interactivity = interactivity

# NOTE: the kernel_deps code is repeated in the cleanup.py file
# (we can't easily share this code b/c of the way it is run).
# If you edit this code also edit the same code in cleanup.py!

# output kernel dependencies: source path -> mtime for every loaded module
# whose file exists on disk, printed as one JSON object
kernel_deps = {}
for loaded in list(sys.modules.values()):
  # Some modules play games with sys.modules (e.g. email/__init__.py
  # in the standard library), and occasionally this can cause strange
  # failures in getattr.  Only ordinary modules are inspected.
  if isinstance(loaded, types.ModuleType):
    source_path = getattr(loaded, "__file__", None)
    if source_path:
      # map a compiled-file path (.pyc/.pyo) onto the matching .py path
      if source_path.endswith((".pyc", ".pyo")):
        source_path = source_path[:-1]
      if os.path.exists(source_path):
        kernel_deps[source_path] = os.stat(source_path).st_mtime
print(json.dumps(kernel_deps))

# set run_path if requested
# The condition is a non-empty string literal, so it is always true: the
# working directory is always changed to this hard-coded absolute path.
if r'C:\Users\kmkim\Desktop\projects\blog\docs\blog\posts\RAG\13-Cloud-RAG':
  os.chdir(r'C:\Users\kmkim\Desktop\projects\blog\docs\blog\posts\RAG\13-Cloud-RAG')

# reset state
# `%reset` is an IPython magic, not plain Python. The code after it avoids
# relying on any variable defined before this point.
%reset

# shiny
# Checking for shiny by using False directly because we're after the %reset. We don't want
# to set a variable that stays in global scope.
# With the literal False below, nothing in this block runs.
if False:
  try:
    import htmltools as _htmltools
    import ast as _ast

    _htmltools.html_dependency_render_mode = "json"

    # This decorator will be added to all function definitions
    def _display_if_has_repr_html(x):
      """Display x as HTML when it has a _repr_html_ method; return x unchanged."""
      try:
        # IPython 7.14 preferred import
        from IPython.display import display, HTML
      except:
        from IPython.core.display import display, HTML

      if hasattr(x, '_repr_html_'):
        display(HTML(x._repr_html_()))
      return x

    # ideally we would undo the call to ast_transformers.append
    # at the end of this block whenever an error occurs, we do
    # this for now as it will only be a problem if the user
    # switches from shiny to not-shiny mode (and even then likely
    # won't matter)
    # The decorator is stored on `builtins` so that the bare name inserted by
    # the transformer below can be resolved.
    import builtins
    builtins._display_if_has_repr_html = _display_if_has_repr_html

    class _FunctionDefReprHtml(_ast.NodeTransformer):
      """Insert `_display_if_has_repr_html` at index 0 of each function's decorator list."""

      def visit_FunctionDef(self, node):
        # Child nodes are not visited (generic_visit is not called).
        node.decorator_list.insert(
          0,
          _ast.Name(id="_display_if_has_repr_html", ctx=_ast.Load())
        )
        return node

      def visit_AsyncFunctionDef(self, node):
        # Same treatment for `async def` definitions.
        node.decorator_list.insert(
          0,
          _ast.Name(id="_display_if_has_repr_html", ctx=_ast.Load())
        )
        return node

    # Register the transformer on the running IPython shell.
    ip = get_ipython()
    ip.ast_transformers.append(_FunctionDefReprHtml())

  except:
    # best effort: every failure in the block above is ignored
    pass

def ojs_define(**kwargs):
  """Publish Python values for Observable JS cells.

  Each keyword argument becomes one ``{"name": ..., "value": ...}`` entry.
  The entries are serialized as JSON, wrapped in a
  ``<script type="ojs-define">`` element and displayed with
  ``ojs_define=True`` metadata.

  pandas Series and DataFrame values (subclasses included) are converted to a
  ``{column: [values, ...]}`` dict first. Every other value is handed to
  ``json.dumps`` unchanged, so it must be JSON-serializable.

  Raises:
    TypeError: from ``json.dumps`` when a value cannot be serialized.
  """
  import json
  try:
    # IPython 7.14 preferred import
    from IPython.display import display, HTML
  except ImportError:
    from IPython.core.display import display, HTML

  # do some minor magic for convenience when handling pandas
  # dataframes
  def convert(v):
    try:
      import pandas as pd
    except ModuleNotFoundError: # don't do the magic when pandas is not available
      return v
    if isinstance(v, pd.Series):
      v = pd.DataFrame(v)
    if isinstance(v, pd.DataFrame):
      j = json.loads(v.T.to_json(orient='split'))
      return dict(zip(j["index"], j["data"]))
    return v

  payload = dict(contents=[dict(name=key, value=convert(value)) for (key, value) in kwargs.items()])
  # A "</" inside a JSON string (e.g. "</script>") would close the script
  # element early; "<\/" is an equivalent JSON escape that cannot.
  body = json.dumps(payload).replace("</", "<\\/")
  display(HTML('<script type="ojs-define">' + body + '</script>'), metadata=dict(ojs_define = True))
globals()["ojs_define"] = ojs_define


In [2]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch
)
from azure.core.credentials import AzureKeyCredential
from dotenv import load_dotenv
import os

load_dotenv()

# Client for index management
index_client = SearchIndexClient(
    endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"),
    credential=AzureKeyCredential(os.getenv("AZURE_SEARCH_API_KEY")),
)

# Index schema: one SearchField per column
id_field = SearchField(
    name="id",
    type=SearchFieldDataType.String,
    key=True,  # primary key
    filterable=True,
)
content_field = SearchField(
    name="content",
    type=SearchFieldDataType.String,
    searchable=True,  # full-text searchable
    filterable=False,
)
content_vector_field = SearchField(
    name="content_vector",
    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    vector_search_dimensions=1536,  # text-embedding-3-small
    vector_search_profile_name="vector-profile",
)
metadata_field = SearchField(
    name="metadata",
    type=SearchFieldDataType.String,
    searchable=False,
    filterable=True,
)
source_field = SearchField(
    name="source",
    type=SearchFieldDataType.String,
    searchable=False,
    filterable=True,
    facetable=True,
)
fields = [id_field, content_field, content_vector_field, metadata_field, source_field]

# Vector search settings: one HNSW algorithm and the profile that points at it
hnsw_algorithm = HnswAlgorithmConfiguration(
    name="hnsw-config",
    parameters={
        "m": 4,  # number of graph connections
        "efConstruction": 400,  # index build quality
        "efSearch": 500,  # search accuracy
        "metric": "cosine"  # similarity metric
    },
)
vector_profile = VectorSearchProfile(
    name="vector-profile",
    algorithm_configuration_name="hnsw-config",
)
vector_search = VectorSearch(algorithms=[hnsw_algorithm], profiles=[vector_profile])

# Semantic search settings (optional)
semantic_fields = SemanticPrioritizedFields(
    title_field=None,
    content_fields=[SemanticField(field_name="content")],
)
semantic_config = SemanticConfiguration(
    name="semantic-config",
    prioritized_fields=semantic_fields,
)
semantic_search = SemanticSearch(configurations=[semantic_config])

# Index definition
index = SearchIndex(
    name=os.getenv("AZURE_SEARCH_INDEX_NAME"),
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)

# Create the index, or update it when it already exists
result = index_client.create_or_update_index(index)
print(f"인덱스 생성 완료: {result.name}")

In [3]:
from azure.search.documents import SearchClient
from langchain_openai import AzureOpenAIEmbeddings

# Embeddings client
embedding_settings = dict(
    azure_deployment=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT"),
    openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
)
embeddings = AzureOpenAIEmbeddings(**embedding_settings)

# Search client
search_client = SearchClient(
    endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"),
    index_name=os.getenv("AZURE_SEARCH_INDEX_NAME"),
    credential=AzureKeyCredential(os.getenv("AZURE_SEARCH_API_KEY")),
)

# Sample documents as (id, content, source, metadata) rows.
# NOTE(review): `metadata` is a plain string here; LangChain's AzureSearch
# store may expect JSON in that field when it reads results back. Confirm
# that the later `vector_store.similarity_search` calls handle these rows.
documents = [
    {"id": doc_id, "content": content, "source": source, "metadata": metadata}
    for doc_id, content, source, metadata in (
        ("1", "Azure AI Search는 Microsoft의 관리형 검색 서비스다.", "doc1.txt", "Azure"),
        ("2", "RAG는 검색 증강 생성 기술이다.", "doc2.txt", "RAG"),
        ("3", "임베딩은 텍스트를 벡터로 변환한다.", "doc3.txt", "Embeddings"),
    )
]

# Attach an embedding of the content to each document
for doc in documents:
    doc.update(content_vector=embeddings.embed_query(doc["content"]))

# Upload all documents in a single batch
result = search_client.upload_documents(documents=documents)
print(f"업로드 완료: {len(result)}개 문서")

# Report the per-document outcome
for item in result:
    print(f"문서 ID {item.key}: {item.succeeded}")

In [4]:
from langchain_community.vectorstores.azuresearch import AzureSearch

# Initialize the Azure Search vector store
vector_store_settings = dict(
    azure_search_endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"),
    azure_search_key=os.getenv("AZURE_SEARCH_API_KEY"),
    index_name=os.getenv("AZURE_SEARCH_INDEX_NAME"),
    embedding_function=embeddings.embed_query,
)
vector_store = AzureSearch(**vector_store_settings)

# Add documents (embedded automatically)
from langchain_core.documents import Document

docs = [
    Document(page_content=text, metadata=meta)
    for text, meta in (
        ("LangChain은 LLM 애플리케이션 프레임워크다.",
         {"source": "langchain.txt", "category": "framework"}),
        ("Azure OpenAI는 엔터프라이즈급 LLM 서비스다.",
         {"source": "azure_openai.txt", "category": "llm"}),
    )
]

# The store creates the embeddings and uploads the documents
vector_store.add_documents(docs)
print(f"{len(docs)}개 문서 추가 완료")

In [5]:
# Similarity search: top 3 documents for the query
query = "LLM 프레임워크는 무엇인가?"
results = vector_store.similarity_search(query, k=3)

print(f"검색 결과 ({len(results)}개):\n")
for i, doc in enumerate(results, start=1):
    print(
        f"{i}. {doc.page_content}",
        f"   메타데이터: {doc.metadata}\n",
        sep="\n",
    )

In [6]:
# Search that also returns a relevance score per document
results_with_scores = vector_store.similarity_search_with_relevance_scores(
    query="Azure 검색 서비스",
    k=3,
)

print("유사도 점수:\n")
for doc, score in results_with_scores:
    source = doc.metadata.get('source', 'N/A')
    print(
        f"점수: {score:.4f}",
        f"내용: {doc.page_content}",
        f"출처: {source}\n",
        sep="\n",
    )

In [7]:
from azure.search.documents.models import VectorizedQuery

# Search client
search_client_settings = dict(
    endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"),
    index_name=os.getenv("AZURE_SEARCH_INDEX_NAME"),
    credential=AzureKeyCredential(os.getenv("AZURE_SEARCH_API_KEY")),
)
search_client = SearchClient(**search_client_settings)

# Query text and its embedding
query_text = "Azure 검색 서비스"
query_vector = embeddings.embed_query(query_text)

# Vector part of the query
vector_query = VectorizedQuery(
    vector=query_vector,
    k_nearest_neighbors=50,  # number of vector search results
    fields="content_vector",
)

# Hybrid search: keyword text and vector query in one request
results = search_client.search(
    search_text=query_text,  # keyword search
    vector_queries=[vector_query],  # vector search
    select=["id", "content", "source", "metadata"],
    top=5,
)

print("하이브리드 검색 결과:\n")
for result in results:
    print(
        f"ID: {result['id']}",
        f"점수: {result['@search.score']:.4f}",
        f"내용: {result['content']}",
        f"출처: {result['source']}\n",
        sep="\n",
    )

In [8]:
from azure.search.documents.models import QueryType, QueryCaptionType, QueryAnswerType

# Semantic search (reuses the vector query built in the hybrid-search cell)
semantic_search_options = dict(
    search_text="Azure에서 제공하는 검색 서비스는 무엇인가?",
    vector_queries=[vector_query],
    query_type=QueryType.SEMANTIC,  # enable semantic search
    semantic_configuration_name="semantic-config",
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=3,
)
results = search_client.search(**semantic_search_options)

print("시맨틱 검색 결과:\n")
for result in results:
    print(f"내용: {result['content']}")

    # Caption (highlights the relevant part), printed only when one is present
    captions = result['@search.captions'] if '@search.captions' in result else None
    if captions:
        print(f"캡션: {captions[0].text}")

    print(f"점수: {result['@search.score']:.4f}\n")

In [9]:
# Metadata filtering
metadata_filter = "metadata eq 'Azure'"  # OData filter syntax
results = vector_store.similarity_search(
    query="Azure 서비스",
    k=5,
    filters=metadata_filter,
)

print(f"필터링된 결과 ({len(results)}개):")
for doc in results:
    print(
        f"- {doc.page_content}",
        f"  메타데이터: {doc.metadata}\n",
        sep="\n",
    )

In [10]:
# AND/OR conditions: both clauses must match
filter_expression = " and ".join(["metadata eq 'Azure'", "source eq 'doc1.txt'"])

results = search_client.search(
    search_text="검색",
    filter=filter_expression,
    top=5,
)

for result in results:
    print("-", result['content'])

In [11]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# Retriever: similarity search returning 3 documents
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# LLM
llm_settings = dict(
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
    openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    temperature=0,
)
llm = AzureChatOpenAI(**llm_settings)

# Prompt with {context} and {question} placeholders
rag_template = """다음 컨텍스트를 참고하여 질문에 답변하세요.

컨텍스트:
{context}

질문: {question}

답변:"""
prompt = ChatPromptTemplate.from_template(rag_template)

# RAG 체인
def format_docs(docs):
    """Return the page_content of every document, separated by blank lines."""
    contents = []
    for document in docs:
        contents.append(document.page_content)
    return "\n\n".join(contents)

# Inputs for the prompt: retrieved documents as text, and the question as given
chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

# Run the chain
answer = rag_chain.invoke("Azure AI Search는 무엇인가?")
print(f"답변: {answer}")

In [12]:
def upload_documents_batch(documents, batch_size=1000, *, embedder=None, client=None):
    """Upload a large list of documents in fixed-size batches.

    Documents that lack a "content_vector" entry are embedded from their
    "content" text, with one embedding request per batch. The vector is
    stored on the document dict itself, so the caller's dicts are modified
    in place.

    NOTE(review): Azure AI Search limits both the number of documents and the
    payload size of one indexing request. With large embedding vectors the
    default batch_size may exceed the size limit; confirm against the
    service limits and pass a smaller batch_size if needed.

    Args:
        documents: Sliceable sequence of dicts. Each needs a "content" key
            unless it already has "content_vector".
        batch_size: Maximum number of documents per upload request. Must be
            positive.
        embedder: Object with ``embed_documents(list_of_texts)``. Defaults to
            the notebook-level ``embeddings``.
        client: Object with ``upload_documents(documents=...)``. Defaults to
            the notebook-level ``search_client``.

    Returns:
        Number of documents whose upload result reported success.

    Raises:
        ValueError: If batch_size is zero or negative.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    # Resolve the notebook globals lazily so explicit arguments never need them.
    if embedder is None:
        embedder = embeddings
    if client is None:
        client = search_client

    total_uploaded = 0

    for i in range(0, len(documents), batch_size):
        batch = documents[i:i+batch_size]

        # Embed only the documents that do not have a vector yet, in one call.
        pending = [doc for doc in batch if "content_vector" not in doc]
        if pending:
            vectors = embedder.embed_documents([doc["content"] for doc in pending])
            for doc, vector in zip(pending, vectors):
                doc["content_vector"] = vector

        # Upload the batch and count the successful items.
        result = client.upload_documents(documents=batch)

        uploaded = sum(1 for item in result if item.succeeded)
        total_uploaded += uploaded

        print(f"배치 {i//batch_size + 1}: {uploaded}/{len(batch)} 업로드")

    return total_uploaded

# 사용 예시
# large_docs = [{"id": str(i), "content": f"문서 {i}"} for i in range(5000)]
# total = upload_documents_batch(large_docs, batch_size=1000)
# print(f"총 {total}개 문서 업로드 완료")

In [13]:
# Delete the index
# NOTE(review): later cells (document deletion, document count, search timing)
# still use the same index name. Confirm the intended run order.
index_name_to_delete = os.getenv("AZURE_SEARCH_INDEX_NAME")
index_client.delete_index(index_name_to_delete)
print("인덱스 삭제 완료")

In [14]:
# List every index in the search service
indexes = index_client.list_indexes()

print("인덱스 목록:")
for index in indexes:
    print("-", index.name)

In [15]:
# Delete specific documents, identified by their key field
keys_to_delete = [{"id": doc_id} for doc_id in ("1", "2")]
search_client.delete_documents(documents=keys_to_delete)
print("문서 삭제 완료")

In [16]:
# Index statistics: number of documents in the index
stats = search_client.get_document_count()
print("총 문서 수:", stats)

In [17]:
import time

# Measure search latency.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval is not affected by system clock adjustments (time.time() is).
start_time = time.perf_counter()

results = vector_store.similarity_search("테스트 쿼리", k=10)

duration = time.perf_counter() - start_time
print(f"검색 시간: {duration:.3f}초")
print(f"검색 결과: {len(results)}개")