# 데이터 검색

이 노트북에서는 Azure AI Search의 다양한 검색 방법(키워드, 벡터, 하이브리드, 하이브리드 + Semantic Ranker)을 수행하고, 한국어 Analyzer의 토큰화 결과도 함께 비교합니다.

## 0. 필요한 라이브러리 설치

In [None]:
##############################################
# 0. Install libraries (run once, on first use)
##############################################
# Installs the packages imported by the later cells: python-dotenv (dotenv),
# azure-identity, azure-search-documents and openai. --quiet reduces pip output.
%pip install --quiet python-dotenv azure-identity azure-search-documents openai

## 1. 환경 변수 설정

In [None]:
import os
from dotenv import load_dotenv

##############################################
# 1. Load and verify environment variables
##############################################
# Load settings via python-dotenv before reading them with os.getenv.
load_dotenv()

# Azure AI Search (no defaults: both values must come from the environment)
AZURE_SEARCH_SERVICE_ENDPOINT = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
AZURE_SEARCH_INDEX_NAME = os.getenv("AZURE_SEARCH_INDEX_NAME")

# Azure OpenAI (the deployment name and API version fall back to defaults)
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-3-large")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-02-01")

print(f"Search Endpoint: {AZURE_SEARCH_SERVICE_ENDPOINT}")
print(f"Search Index: {AZURE_SEARCH_INDEX_NAME}")
print(f"OpenAI Endpoint: {AZURE_OPENAI_ENDPOINT}")
print(f"Embedding Model: {AZURE_OPENAI_EMBEDDING_DEPLOYMENT}")

# Settings without a default would otherwise only show up as "None" above and
# fail later inside the client calls, so name them explicitly here.
# This is a warning only; the cell still completes so the values can be inspected.
_required_settings = {
    "AZURE_SEARCH_SERVICE_ENDPOINT": AZURE_SEARCH_SERVICE_ENDPOINT,
    "AZURE_SEARCH_INDEX_NAME": AZURE_SEARCH_INDEX_NAME,
    "AZURE_OPENAI_ENDPOINT": AZURE_OPENAI_ENDPOINT,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    print(f"⚠️  Missing required environment variables: {', '.join(_missing_settings)}")

## 2. 클라이언트 초기화

In [None]:
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from azure.search.documents import SearchClient
from openai import AzureOpenAI
from IPython.display import display, Image, HTML

##############################################
# 2-1. Azure authentication and client creation
##############################################
# A single credential object is shared by the Search client and the
# token provider handed to the Azure OpenAI client.
credential = DefaultAzureCredential()

# Search client bound to the index named in the environment settings.
search_client = SearchClient(
    credential=credential,
    index_name=AZURE_SEARCH_INDEX_NAME,
    endpoint=AZURE_SEARCH_SERVICE_ENDPOINT,
)

# Bearer-token provider for the Cognitive Services scope, used by the OpenAI client.
token_provider = get_bearer_token_provider(
    credential, "https://cognitiveservices.azure.com/.default"
)

openai_client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_ad_token_provider=token_provider,
    api_version=AZURE_OPENAI_API_VERSION,
)

##############################################
# 2-2. Embedding helper
##############################################
def get_embedding(text):
    """Return the embedding for ``text``.

    Sends ``text`` as the input of an embeddings request made through
    ``openai_client`` with ``AZURE_OPENAI_EMBEDDING_DEPLOYMENT`` as the model,
    and returns the ``embedding`` attribute of the first item in the
    response data.
    """
    embedding_response = openai_client.embeddings.create(
        model=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
        input=text,
    )
    first_item = embedding_response.data[0]
    return first_item.embedding

print("클라이언트 초기화 완료")

## 3. Analyzer 테스트

In [None]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import AnalyzeTextOptions

##############################################
# 3. Analyzer test
##############################################

# Index client; its analyze_text call is used below to tokenize the sample query.
index_client = SearchIndexClient(
    credential=credential,
    endpoint=AZURE_SEARCH_SERVICE_ENDPOINT,
)

# Sample query that both analyzers tokenize
query = "튼튼하게 만들어진 백팩으로, 컴퓨터를 넣을 수 있어야함."

# ko.microsoft: analyze the query, then print the produced tokens
opts_ms = AnalyzeTextOptions(analyzer_name="ko.microsoft", text=query)
result_ms = index_client.analyze_text(AZURE_SEARCH_INDEX_NAME, opts_ms)
tokens_ms = [token_info.token for token_info in result_ms.tokens]
print("[ko.microsoft]", tokens_ms)

# ko.lucene: same query, second analyzer, for side-by-side comparison
opts_lu = AnalyzeTextOptions(analyzer_name="ko.lucene", text=query)
result_lu = index_client.analyze_text(AZURE_SEARCH_INDEX_NAME, opts_lu)
tokens_lu = [token_info.token for token_info in result_lu.tokens]
print("[ko.lucene]", tokens_lu)

## 4. 키워드 검색 (Full-text Search)

In [None]:
##############################################
# 4. Keyword search
##############################################
from html import escape

search_query = "튼튼하고 편안한 가방으로, 컴퓨터를 넣을 수 있어야함."

# Full-text query restricted to the name and description fields; only the
# fields listed in `select` are returned for each of the top 5 hits.
results = search_client.search(
    search_text=search_query,
    top=5,
    query_type="full",
    search_fields=["name", "description"],
    select=["id", "name", "brand", "price", "description", "imageUrl"]
)

print(f"검색어: '{search_query}'")
print(f"\n{'='*60}")
print("[키워드 검색 결과]")
print(f"{'='*60}\n")

for idx, result in enumerate(results, 1):
    print(f"{idx}. {result['name']} ({result['brand']})")
    print(f"   설명: {result['description']}")
    # imageUrl is index data, so escape it before placing it in an HTML
    # attribute, and skip the image when the document has no URL.
    image_url = result.get("imageUrl")
    if image_url:
        display(HTML(f'<img src="{escape(str(image_url), quote=True)}" width="200" />'))
    print()

## 5. 벡터 검색 (Vector Search)

In [None]:
from azure.search.documents.models import VectorizedQuery

##############################################
# 5. Vector search
##############################################
from html import escape

search_query = "튼튼하고 편안한 가방으로, 컴퓨터를 넣을 수 있어야함."

# Convert the query text into a vector
query_vector = get_embedding(search_query)

# Nearest-neighbour query (k=5) against the descriptionVector field
vector_query = VectorizedQuery(
    vector=query_vector,
    k_nearest_neighbors=5,
    fields="descriptionVector"
)

# search_text=None: no keyword part, only the vector query is sent.
# descriptionVector is selected so its first values can be printed below.
results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    select=["id", "name", "brand", "price", "description", "imageUrl", "descriptionVector"]
)

print(f"검색어: '{search_query}'")
print(f"\n{'='*60}")
print("[벡터 검색 결과]")
print(f"{'='*60}\n")

for idx, result in enumerate(results, 1):
    score = result['@search.score']
    print(f"{idx}. {result['name']} ({result['brand']})")
    print(f"   유사도 점수: {score:.4f}")
    print(f"   설명: {result['description']}")
    print(f"   설명 벡터 (처음 5개 값): {result['descriptionVector'][:5]}...")
    # imageUrl is index data, so escape it before placing it in an HTML
    # attribute, and skip the image when the document has no URL.
    image_url = result.get("imageUrl")
    if image_url:
        display(HTML(f'<img src="{escape(str(image_url), quote=True)}" width="200" />'))
    print()

## 6. 하이브리드 검색 (Hybrid Search)

In [None]:
##############################################
# 6. Hybrid search (keyword + vector)
##############################################
from html import escape

search_query = "튼튼하고 편안한 가방으로, 컴퓨터를 넣을 수 있어야함."

# Convert the query text into a vector
query_vector = get_embedding(search_query)

# Nearest-neighbour query (k=5) against the descriptionVector field
vector_query = VectorizedQuery(
    vector=query_vector,
    k_nearest_neighbors=5,
    fields="descriptionVector"
)

# Hybrid request: the same query is sent both as search text and as a vector.
# descriptionVector is selected so its first values can be printed below.
results = search_client.search(
    search_text=search_query,
    vector_queries=[vector_query],
    top=5,
    select=["id", "name", "brand", "price", "description", "imageUrl", "descriptionVector"]
)

print(f"검색어: '{search_query}'")
print(f"\n{'='*60}")
print("[하이브리드 검색 결과]")
print(f"{'='*60}\n")

for idx, result in enumerate(results, 1):
    score = result['@search.score']
    print(f"{idx}. {result['name']} ({result['brand']})")
    print(f"   통합 점수: {score:.4f}")
    print(f"   설명: {result['description']}")
    print(f"   설명 벡터 (처음 5개 값): {result['descriptionVector'][:5]}...")
    # imageUrl is index data, so escape it before placing it in an HTML
    # attribute, and skip the image when the document has no URL.
    image_url = result.get("imageUrl")
    if image_url:
        display(HTML(f'<img src="{escape(str(image_url), quote=True)}" width="200" />'))
    print()

## 7. 하이브리드 검색 (Hybrid Search) + Semantic Rank

### 7.1. Semantic Rank 추가

In [None]:
##############################################
# 7. Add a semantic ranker configuration
##############################################


# Add a semantic configuration to the index named by AZURE_SEARCH_INDEX_NAME
# and display the semantic configurations of the updated index
from azure.search.documents.indexes.models import (
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch
)

# Name used both to create the configuration and to detect an existing one.
SEMANTIC_CONFIG_NAME = "product-semantic-config"

# Get the existing index
existing_index = index_client.get_index(AZURE_SEARCH_INDEX_NAME)

# Create a new semantic configuration: "name" as title, "description" as content
new_semantic_config = SemanticConfiguration(
    name=SEMANTIC_CONFIG_NAME,
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="name"),
        content_fields=[SemanticField(field_name="description")]
    )
)

# Add the configuration unless one with the same name is already defined.
if existing_index.semantic_search is None:
    existing_index.semantic_search = SemanticSearch(configurations=[new_semantic_config])
    config_added = True
else:
    # Treat a missing (None) configuration list as empty instead of iterating over None.
    current_configs = existing_index.semantic_search.configurations or []
    config_added = all(config.name != SEMANTIC_CONFIG_NAME for config in current_configs)
    if config_added:
        existing_index.semantic_search.configurations = [*current_configs, new_semantic_config]

# Update the index only when the definition actually changed
if config_added:
    result = index_client.create_or_update_index(existing_index)

# Get the updated index and display detailed information
updated_index = index_client.get_index(AZURE_SEARCH_INDEX_NAME)

print("Semantic configurations:")
print("-" * 40)
if updated_index.semantic_search and updated_index.semantic_search.configurations:
    for config in updated_index.semantic_search.configurations:
        print(f"  Configuration: {config.name}")
        if config.prioritized_fields.title_field:
            print(f"    Title field: {config.prioritized_fields.title_field.field_name}")
        if config.prioritized_fields.keywords_fields:
            keywords = [kf.field_name for kf in config.prioritized_fields.keywords_fields]
            print(f"    Keywords fields: {', '.join(keywords)}")
        if config.prioritized_fields.content_fields:
            content = [cf.field_name for cf in config.prioritized_fields.content_fields]
            print(f"    Content fields: {', '.join(content)}")
        print()
else:
    print("  No semantic configurations found")

# Report what this run did rather than always claiming an addition.
if config_added:
    print("✅ Semantic configuration successfully added!")
else:
    print(f"ℹ️  Semantic configuration '{SEMANTIC_CONFIG_NAME}' already exists; index left unchanged.")


### 7.2. Run Query

In [None]:
from azure.search.documents.models import VectorizedQuery

##############################################
# 7.2. Hybrid search + Semantic Rank query
##############################################
from html import escape

search_query = "튼튼하고 편안한 가방으로, 컴퓨터를 넣을 수 있어야함."

# Convert the query text into a vector
query_vector = get_embedding(search_query)

# Same vector query shape as sections 5 and 6; no explicit `kind` is passed
# because VectorizedQuery already identifies itself as a vector query.
vector_query = VectorizedQuery(
    vector=query_vector,
    k_nearest_neighbors=5,
    fields="descriptionVector"
)

# Hybrid request with query_type="semantic", using the
# "product-semantic-config" semantic configuration of the index.
results = search_client.search(
    search_text=search_query,
    vector_queries=[vector_query],
    query_type="semantic",
    semantic_configuration_name="product-semantic-config",
    top=5,
    select=["id", "name", "brand", "price", "description", "imageUrl", "descriptionVector"],
)

print(f"검색어: '{search_query}'")
print(f"\n{'='*60}")
print("[하이브리드 + Semantic Ranker 검색 결과]")
print(f"{'='*60}")

for idx, result in enumerate(results, 1):
    # Either score may be absent from a result; fall back to "N/A" for display.
    score = result.get("@search.score", "N/A")
    reranker_score = result.get("@search.reranker_score", "N/A")
    print(f"{idx}. {result['name']} ({result['brand']})")
    print(f"   검색 점수: {score}")
    print(f"   Semantic Re-ranker Score: {reranker_score}")
    print(f"   설명: {result['description']}")
    print(f"   벡터 (처음 5개 값): {result['descriptionVector'][:5]}...")
    # imageUrl is index data, so escape it before placing it in an HTML
    # attribute, and skip the image when the document has no URL.
    image_url = result.get("imageUrl")
    if image_url:
        display(HTML(f'<img src="{escape(str(image_url), quote=True)}" width="200" />'))


### 7.3. Semantic Configuration 제거

In [None]:
##############################################
# 7.3. Remove the semantic configuration
##############################################

# Safety switch: the removal below only runs when this is set to True.
DELETE_SEMANTIC_CONFIG = False

if DELETE_SEMANTIC_CONFIG:
    # Fetch the current index definition
    existing_index = index_client.get_index(AZURE_SEARCH_INDEX_NAME)
    semantic_settings = existing_index.semantic_search

    if not (semantic_settings and semantic_settings.configurations):
        print("\nℹ️  제거할 Semantic Configuration이 없습니다.")
    else:
        print("\n⚠️  'product-semantic-config' 제거 중...")

        # Keep every configuration except the one named product-semantic-config
        original_count = len(semantic_settings.configurations)
        remaining_configs = [
            entry
            for entry in semantic_settings.configurations
            if entry.name != "product-semantic-config"
        ]
        # The list is assigned back even when empty (it is not reset to None).
        semantic_settings.configurations = remaining_configs
        removed_count = original_count - len(remaining_configs)

        try:
            # Push the modified definition back to the service
            index_client.create_or_update_index(existing_index)
        except Exception as e:
            # Report the failure, then re-raise so the error stays visible in the cell
            print(f"❌ 제거 실패: {e}")
            raise
        else:
            print(f"✅ {removed_count}개의 Configuration 제거 완료")