# 데이터 검색

이 노트북에서는 Azure AI Search의 다양한 검색 방법(키워드, 벡터, 하이브리드)을 수행합니다.

## 0. 필요한 라이브러리 설치

In [None]:
##############################################
# 0. Install libraries (run once, on first use)
##############################################
# Upgrade pip itself before installing anything else.
%pip install --quiet --upgrade pip
# Packages used by this notebook (dotenv, azure-identity, azure-search-documents, openai), plus requests.
%pip install --quiet python-dotenv requests azure-identity azure-search-documents openai
# Upgrade azure-search-documents again, this time allowing pre-release versions (--pre).
%pip install --quiet azure-search-documents --pre --upgrade

## 1. 환경 변수 설정

In [None]:
import os
from dotenv import load_dotenv

##############################################
# 1. Load and validate environment variables
##############################################
# Load settings via python-dotenv so the os.getenv() calls below can see them.
load_dotenv()

# Azure AI Search
AZURE_SEARCH_SERVICE_ENDPOINT = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
AZURE_SEARCH_INDEX_NAME = os.getenv("AZURE_SEARCH_INDEX_NAME")

# Azure OpenAI (deployment name and API version fall back to defaults when unset)
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-3-large")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-02-01")

# Fail fast: the settings without a default are required by the client cells
# below, and a missing value would otherwise only show up there as an
# unrelated-looking error.
_missing_settings = [
    name
    for name, value in (
        ("AZURE_SEARCH_SERVICE_ENDPOINT", AZURE_SEARCH_SERVICE_ENDPOINT),
        ("AZURE_SEARCH_INDEX_NAME", AZURE_SEARCH_INDEX_NAME),
        ("AZURE_OPENAI_ENDPOINT", AZURE_OPENAI_ENDPOINT),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "필수 환경 변수가 설정되지 않았습니다: " + ", ".join(_missing_settings)
    )

# Uncomment to inspect the loaded values.
# print(f"Search Endpoint: {AZURE_SEARCH_SERVICE_ENDPOINT}")
# print(f"Search Index: {AZURE_SEARCH_INDEX_NAME}")
# print(f"OpenAI Endpoint: {AZURE_OPENAI_ENDPOINT}")
# print(f"Embedding Model: {AZURE_OPENAI_EMBEDDING_DEPLOYMENT}")

## 2. 클라이언트 초기화

In [None]:
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from azure.search.documents import SearchClient
from openai import AzureOpenAI
from IPython.display import display, Image, HTML
import os, shutil

##############################################
# 2-0. Add the Azure CLI directory to PATH for Azure login
#      (only needed when the `az` executable cannot be found)
##############################################
# AZ_DIR = "/opt/homebrew/bin" # change as needed
# os.environ["PATH"] = f"{AZ_DIR}:" + os.environ["PATH"]
# print("az which:", shutil.which("az"))

##############################################
# 2-1. Azure authentication and client creation
##############################################
# A single credential object is shared by the Search client and the
# OpenAI token provider created below.
credential = DefaultAzureCredential()
# Token provider built from that credential for the Cognitive Services scope;
# it is handed to AzureOpenAI as `azure_ad_token_provider`.
token_provider = get_bearer_token_provider(
    credential,
    "https://cognitiveservices.azure.com/.default"
)

# Query client bound to the index named by AZURE_SEARCH_INDEX_NAME.
search_client = SearchClient(
    endpoint=AZURE_SEARCH_SERVICE_ENDPOINT,
    index_name=AZURE_SEARCH_INDEX_NAME,
    credential=credential
)

# Azure OpenAI client, used by get_embedding() to create query embeddings.
openai_client = AzureOpenAI(
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_ad_token_provider=token_provider
)

##############################################
# 2-2. 임베딩 생성 함수
##############################################
def get_embedding(text):
    """Return the embedding of ``text`` from the configured Azure OpenAI deployment.

    Sends ``text`` to ``openai_client.embeddings.create`` using the deployment
    named by ``AZURE_OPENAI_EMBEDDING_DEPLOYMENT`` and returns the
    ``embedding`` attribute of the first item in the response data.
    """
    reply = openai_client.embeddings.create(
        model=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
        input=text,
    )
    first_item = reply.data[0]
    return first_item.embedding


# Signal that every client in this cell was constructed without raising.
print("클라이언트 초기화 완료")

## 3. Analyzer 테스트

In [None]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import AnalyzeTextOptions

##############################################
# 3. Analyzer test
##############################################

# Index-management client; analyze_text is called on it, not on the query client.
index_client = SearchIndexClient(
    credential=credential,
    endpoint=AZURE_SEARCH_SERVICE_ENDPOINT,
)

# Sample query
query = "AI 기반 정밀 온도 제어 시스템애 대해서 알려줘"

# Run the same text through each analyzer (ko.microsoft, then ko.lucene)
# and print the tokens each one produces, so they can be compared.
for analyzer_name in ("ko.microsoft", "ko.lucene"):
    analysis = index_client.analyze_text(
        AZURE_SEARCH_INDEX_NAME,
        AnalyzeTextOptions(text=query, analyzer_name=analyzer_name),
    )
    print(f"[{analyzer_name}]", [item.token for item in analysis.tokens])

## 4. 키워드 검색 (Full-text Search)

In [None]:
##############################################
# 4. Keyword search
##############################################
search_query = "AI 기반 정밀 온도 제어 시스템애 대해서 알려줘"

# Text-only query restricted to the title, chunk and header fields;
# at most 3 hits are requested.
results = search_client.search(
    search_text=search_query,
    search_fields=["title", "chunk", "header_1", "header_2", "header_3"],
    query_type="full",
    top=3,
)

# Banner: the query followed by a framed section title.
print(f"검색어: '{search_query}'")
print("\n" + "=" * 60)
print("[키워드 검색 결과]")
print("=" * 60 + "\n")

# One framed entry per hit, numbered from 1.
for rank, hit in enumerate(results, start=1):
    print(f"결과 #{rank}")
    print("─" * 60)
    print(f"파일명: {hit['title']}")
    for level in (1, 2, 3):
        print(f"Header {level}: {hit[f'header_{level}']}")
    print(f"\n내용:\n{hit['chunk']}")
    print("=" * 60 + "\n")

## 5. 벡터 검색 (Vector Search)

In [None]:
from azure.search.documents.models import VectorizedQuery

##############################################
# 5. Vector search
##############################################
search_query = "AI 기반 정밀 온도 제어 시스템애 대해서 알려줘"

# Embed the query text on the client side via get_embedding().
query_vector = get_embedding(search_query)

# Vector query against the "text_vector" field of the index.
vector_query = VectorizedQuery(
    fields="text_vector",
    k=3,
    vector=query_vector,
)

# search_text=None: no keyword part, only the vector query is sent.
results = search_client.search(
    vector_queries=[vector_query],
    search_text=None,
)

# Banner: the query followed by a framed section title.
print(f"검색어: '{search_query}'")
print("\n" + "=" * 60)
print("[벡터 검색 결과]")
print("=" * 60 + "\n")

# One framed entry per hit, numbered from 1, with its search score.
for rank, hit in enumerate(results, start=1):
    similarity = hit['@search.score']
    print(f"결과 #{rank}")
    print("─" * 60)
    print(f"유사도 점수: {similarity:.4f}")
    print(f"파일명: {hit['title']}")
    for level in (1, 2, 3):
        print(f"Header {level}: {hit[f'header_{level}']}")
    print(f"\n내용:\n{hit['chunk']}")
    print("=" * 60 + "\n")

In [None]:
from azure.search.documents.models import VectorizedQuery
from azure.search.documents.models import VectorizableTextQuery

##############################################
# 5. Vector search - integrated vectorization
#    (the query text is vectorized internally, then queried)
##############################################
search_query = "AI 기반 정밀 온도 제어 시스템애 대해서 알려줘"

# Only the raw text is passed; it is vectorized internally, so this cell
# deliberately makes no get_embedding() call of its own.
vectorizable_text_query = VectorizableTextQuery(
    text=search_query,        # text only - no precomputed vector
    fields="text_vector",     # name of the vector field in the index
    k=3
)

# search_text=None: no keyword part, only the vector query is sent.
results = search_client.search(
    search_text=None,
    vector_queries=[vectorizable_text_query],
)

print(f"검색어: '{search_query}'")
print(f"\n{'='*60}")
print("[벡터 검색 결과]")
print(f"{'='*60}\n")

# One framed entry per hit, numbered from 1, with its search score.
for idx, result in enumerate(results, 1):
    score = result['@search.score']
    print(f"결과 #{idx}")
    print(f"{'─'*60}")
    print(f"유사도 점수: {score:.4f}")
    print(f"파일명: {result['title']}")
    print(f"Header 1: {result['header_1']}")
    print(f"Header 2: {result['header_2']}")
    print(f"Header 3: {result['header_3']}")
    print(f"\n내용:\n{result['chunk']}")
    print(f"{'='*60}\n")

## 6. 하이브리드 검색 (Hybrid Search)

In [None]:
##############################################
# 6. Hybrid search (keyword + vector)
##############################################
search_query = "AI 기반 정밀 온도 제어 시스템애 대해서 알려줘"

# Embed the query text on the client side via get_embedding().
query_vector = get_embedding(search_query)

# NOTE(review): this cell passes `k_nearest_neighbors=` while the vector-search
# cells pass `k=` - confirm which keyword the installed azure-search-documents
# version actually honors and align the cells.
vector_query = VectorizedQuery(
    vector=query_vector,
    k_nearest_neighbors=3,
    fields="text_vector"
)

# Both the text and the vector query are sent in one request; at most 3
# hits are requested.
results = search_client.search(
    search_text=search_query,
    vector_queries=[vector_query],
    top=3,
)

print(f"검색어: '{search_query}'")
print(f"\n{'='*60}")
print("[하이브리드 검색 결과]")
print(f"{'='*60}\n")

# One framed entry per hit, numbered from 1, with its search score.
for idx, result in enumerate(results, 1):
    score = result['@search.score']
    print(f"결과 #{idx}")
    print(f"{'─'*60}")
    print(f"통합 점수: {score:.4f}")
    print(f"파일명: {result['title']}")
    print(f"Header 1: {result['header_1']}")
    print(f"Header 2: {result['header_2']}")
    print(f"Header 3: {result['header_3']}")
    print(f"\n내용:\n{result['chunk']}")

    # The vector field may be absent from the returned document; report
    # that instead of raising KeyError in the middle of the listing.
    embedding = result.get('text_vector')
    if embedding:
        print(f"   설명 벡터 (처음 5개 값): {embedding[:5]}...")
    else:
        print("   설명 벡터: 반환되지 않음 (text_vector 필드가 결과에 포함되지 않았습니다)")

    print(f"{'='*60}\n")