In [1]:

# imports
import os
import sys
import types
import json

# figure size/format
fig_width = 7
fig_height = 5
fig_format = 'retina'
fig_dpi = 96
interactivity = ''
is_shiny = False
is_dashboard = False
plotly_connected = True

# matplotlib defaults / format
# Best effort: any failure (matplotlib or IPython missing, unknown format)
# is ignored so that the rest of the setup still runs.
try:
  import matplotlib.pyplot as plt
  plt.rcParams['figure.figsize'] = (fig_width, fig_height)
  plt.rcParams['figure.dpi'] = fig_dpi
  plt.rcParams['savefig.dpi'] = fig_dpi
  # set_matplotlib_formats now lives in matplotlib_inline; the
  # IPython.display alias is deprecated, so prefer the new location and
  # only fall back to the old one when matplotlib_inline is unavailable
  try:
    from matplotlib_inline.backend_inline import set_matplotlib_formats
  except ImportError:
    from IPython.display import set_matplotlib_formats
  set_matplotlib_formats(fig_format)
except Exception:
  pass

# plotly: pick the notebook renderer and give every template a tight margin
# (best effort -- any failure, including plotly not being installed, is ignored)
try:
  import plotly.io as pio
  pio.renderers.default = "notebook_connected" if plotly_connected else "notebook"
  for template in pio.templates.keys():
    pio.templates[template].layout.margin = {"t": 30, "r": 0, "b": 0, "l": 0}
except Exception:
  pass

# disable itables paging for dashboards
if is_dashboard:
  try:
    from itables import options
    # applied in order; a failure stops at the failing option and is ignored
    for _option_name, _option_value in (
      ('dom', 'fiBrtlp'),
      ('maxBytes', 1024 * 1024),
      ('language', {'info': "Showing _TOTAL_ entries"}),
      ('classes', "display nowrap compact"),
      ('paging', False),
      ('searching', True),
      ('ordering', True),
      ('info', True),
      ('lengthChange', False),
      ('autoWidth', False),
      ('responsive', True),
      ('keys', True),
      ('buttons', []),
    ):
      setattr(options, _option_name, _option_value)
  except Exception:
    pass

  try:
    import altair as alt
    # By default, dashboards will have container sized
    # vega visualizations which allows them to flow reasonably
    theme_sentinel = '_quarto-dashboard-internal'
    def make_theme(name):
        # the theme callable currently registered under `name`
        wrapped_theme = alt.themes._plugins[name]
        def patch_theme(*args, **kwargs):
            spec = wrapped_theme()

            # size to the container unless the theme already chose a size
            spec.setdefault('height', 'container')
            spec.setdefault('width', 'container')
            config = spec.setdefault('config', dict())

            # Configure the default font sizes
            title_font_size = 15
            header_font_size = 13
            axis_font_size = 12
            legend_font_size = 12
            mark_font_size = 12
            tooltip = False

            # Axis, legend and header each take a label/title font size;
            # values already present in the theme are left untouched
            for section_name, font_size in (
                ('axis', axis_font_size),
                ('legend', legend_font_size),
                ('header', header_font_size),
            ):
                section = config.setdefault(section_name, dict())
                section.setdefault('labelFontSize', font_size)
                section.setdefault('titleFontSize', font_size)

            # Title
            title = config.setdefault('title', dict())
            title.setdefault('fontSize', title_font_size)

            # Marks
            mark = config.setdefault('mark', dict())
            mark.setdefault('fontSize', mark_font_size)

            # Mark tooltips (only when the tooltip flag above is switched on)
            if tooltip and 'tooltip' not in mark:
              mark['tooltip'] = dict(content="encoding")

            return spec

        return patch_theme

    # We can only do this once per session
    if theme_sentinel not in alt.themes.names():
      for name in alt.themes.names():
        alt.themes.register(name, make_theme(name))

      # register a sentinel theme so we only do this once
      alt.themes.register(theme_sentinel, make_theme('default'))
      alt.themes.enable('default')

  except Exception:
    pass

# enable pandas latex repr when targeting pdfs
# pandas is imported whether or not the option ends up being set; any
# failure (including pandas not being installed) is silently ignored.
try:
  import pandas as pd
  if fig_format == 'pdf':
    pd.set_option('display.latex.repr', True)
except Exception:
  pass

# interactivity
# Only applied when `interactivity` is a non-empty value; the value is
# assigned to InteractiveShell.ast_node_interactivity at class level.
if interactivity:
  from IPython.core.interactiveshell import InteractiveShell
  InteractiveShell.ast_node_interactivity = interactivity

# NOTE: the kernel_deps code is repeated in the cleanup.py file
# (we can't easily share this code b/c of the way it is run).
# If you edit this code also edit the same code in cleanup.py!

# output kernel dependencies
# Maps the source path of every currently imported module to that file's
# modification time, then prints the mapping as a single JSON line.
kernel_deps = dict()
# iterate over a snapshot (list) of the loaded modules
for module in list(sys.modules.values()):
  # Some modules play games with sys.modules (e.g. email/__init__.py
  # in the standard library), and occasionally this can cause strange
  # failures in getattr.  Just ignore anything that's not an ordinary
  # module.
  if not isinstance(module, types.ModuleType):
    continue
  # modules without a (non-empty) __file__ are skipped
  path = getattr(module, "__file__", None)
  if not path:
    continue
  # drop the trailing "c"/"o" so a .pyc/.pyo path becomes the .py path
  if path.endswith(".pyc") or path.endswith(".pyo"):
    path = path[:-1]
  # skip paths that do not exist on disk
  if not os.path.exists(path):
    continue
  kernel_deps[path] = os.stat(path).st_mtime
print(json.dumps(kernel_deps))

# set run_path if requested
# The condition is a non-empty raw string literal, so it is always truthy
# here and the working directory is always changed to that path.
# NOTE(review): the path is an absolute, machine-specific Windows location;
# presumably substituted in when this setup cell was generated -- confirm.
if r'C:\Users\kmkim\Desktop\projects\blog\docs\blog\posts\Agent\13-Cloud-RAG':
  os.chdir(r'C:\Users\kmkim\Desktop\projects\blog\docs\blog\posts\Agent\13-Cloud-RAG')

# reset state
# IPython line magic (not plain Python); the code below is written on the
# assumption that names defined above are gone after this line.
%reset

# shiny
# Checking for shiny by using False directly because we're after the %reset. We don't want
# to set a variable that stays in global scope.
if False:
  try:
    import htmltools as _htmltools
    import ast as _ast

    _htmltools.html_dependency_render_mode = "json"

    # This decorator will be added to all function definitions: it displays
    # the HTML repr of its argument when one exists, then returns the
    # argument unchanged
    def _display_if_has_repr_html(x):
      try:
        # IPython 7.14 preferred import
        from IPython.display import display, HTML
      except:
        from IPython.core.display import display, HTML

      if not hasattr(x, '_repr_html_'):
        return x
      display(HTML(x._repr_html_()))
      return x

    # ideally we would undo the call to ast_transformers.append
    # at the end of this block whenever an error occurs, we do
    # this for now as it will only be a problem if the user
    # switches from shiny to not-shiny mode (and even then likely
    # won't matter)
    import builtins
    builtins._display_if_has_repr_html = _display_if_has_repr_html

    class _FunctionDefReprHtml(_ast.NodeTransformer):
      # Sync and async definitions get the same treatment: the decorator
      # name is inserted at the front of the decorator list.
      def _prepend_display_decorator(self, node):
        decorator = _ast.Name(id="_display_if_has_repr_html", ctx=_ast.Load())
        node.decorator_list.insert(0, decorator)
        return node

      visit_FunctionDef = _prepend_display_decorator
      visit_AsyncFunctionDef = _prepend_display_decorator

    ip = get_ipython()
    ip.ast_transformers.append(_FunctionDefReprHtml())

  except:
    pass

def ojs_define(**kwargs):
  """Publish Python values for Observable JS cells.

  Every keyword argument becomes one ``{"name": ..., "value": ...}`` entry.
  The entries are serialized as JSON inside a ``<script type="ojs-define">``
  element, which is displayed with ``ojs_define=True`` metadata.

  pandas Series and DataFrames (including subclasses) are converted to a
  ``{column: [values...]}`` dict first; all other values are passed to
  ``json.dumps`` as they are, so they must be JSON-serializable.
  """
  import json
  try:
    # IPython 7.14 preferred import
    from IPython.display import display, HTML
  except ImportError:
    from IPython.core.display import display, HTML

  # pandas is optional: without it values are passed through untouched.
  # Imported once here rather than once per converted value.
  try:
    import pandas as pd
  except ImportError:
    pd = None

  # do some minor magic for convenience when handling pandas
  # dataframes
  def convert(v):
    if pd is None:
      return v
    # isinstance (not an exact type match) so subclasses are converted too
    if isinstance(v, pd.Series):
      v = pd.DataFrame(v)
    if isinstance(v, pd.DataFrame):
      # after the transpose, "index" holds the column names and "data"
      # holds one list of values per column
      j = json.loads(v.T.to_json(orient='split'))
      return dict(zip(j["index"], j["data"]))
    return v

  payload = dict(contents=[dict(name=key, value=convert(value)) for (key, value) in kwargs.items()])
  # A "</" inside a string value (e.g. "</script>") would close the script
  # element early. "<\/" is an equivalent JSON escape, so the parsed value
  # is unchanged.
  body = json.dumps(payload).replace("</", "<\\/")
  display(HTML('<script type="ojs-define">' + body + '</script>'), metadata=dict(ojs_define = True))
globals()["ojs_define"] = ojs_define


In [2]:
from openai import AzureOpenAI
from dotenv import load_dotenv
import os

# Load settings from a .env file into the environment before they are read below.
load_dotenv()

# Create the Azure OpenAI client.
# os.getenv returns None for a variable that is not set, so all three
# AZURE_OPENAI_* variables are expected to be defined.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

# Create an embedding for one text; `model` is read from
# AZURE_OPENAI_EMBEDDING_DEPLOYMENT.
text = "Azure OpenAI는 Microsoft의 관리형 OpenAI 서비스다."
response = client.embeddings.create(
    input=text,
    model=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")
)

# Extract the embedding vector of the first (only) result
embedding = response.data[0].embedding

# Show the vector length and its first 10 values
print(f"임베딩 차원: {len(embedding)}")
print(f"첫 10개 값: {embedding[:10]}")

In [3]:
# Embed several texts in a single request
texts = [
    "Azure는 Microsoft의 클라우드 플랫폼이다.",
    "RAG는 검색 증강 생성 기술이다.",
    "임베딩은 텍스트를 벡터로 변환한다."
]

response = client.embeddings.create(
    input=texts,
    model=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")
)

# Collect the embedding of every returned item.
# NOTE(review): `embeddings` is a plain list here; the next cell rebinds the
# same name to an AzureOpenAIEmbeddings object.
embeddings = [data.embedding for data in response.data]

# Show how many embeddings came back and the length of the first one
print(f"생성된 임베딩 수: {len(embeddings)}")
print(f"각 임베딩 차원: {len(embeddings[0])}")

In [4]:
from langchain_openai import AzureOpenAIEmbeddings

# Create the LangChain embeddings object, configured from the same
# AZURE_OPENAI_* environment variables as the raw client.
# Later cells call embed_query / embed_documents on this `embeddings` object.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT"),
    openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY")
)

# Embed a single text
text = "LangChain은 LLM 애플리케이션 프레임워크다."
embedding = embeddings.embed_query(text)

# Show the vector length and its first 5 values
print(f"임베딩 차원: {len(embedding)}")
print(f"첫 5개 값: {embedding[:5]}")

In [5]:
from langchain_core.documents import Document

# Build a small list of documents, each with a "source" metadata entry
documents = [
    Document(page_content="Azure AI Search는 벡터 데이터베이스다.", metadata={"source": "doc1"}),
    Document(page_content="Document Intelligence는 OCR 서비스다.", metadata={"source": "doc2"}),
    Document(page_content="Blob Storage는 파일 저장소다.", metadata={"source": "doc3"})
]

# Embed only the document text (metadata is not passed along)
texts = [doc.page_content for doc in documents]
doc_embeddings = embeddings.embed_documents(texts)

# Show how many documents were embedded and the length of the first vector
print(f"임베딩된 문서 수: {len(doc_embeddings)}")
print(f"첫 번째 문서 임베딩 차원: {len(doc_embeddings[0])}")

In [6]:
# RecursiveCharacterTextSplitter is provided by the langchain_text_splitters
# package; langchain.text_splitter is the legacy import path and is kept only
# as a fallback for environments without the split-out package.
try:
    from langchain_text_splitters import RecursiveCharacterTextSplitter
except ImportError:
    from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader

# 1. Load the document (path is relative to the working directory)
loader = TextLoader("sample_document.txt", encoding="utf-8")
documents = loader.load()

# 2. Split the text: chunks of up to 1000 characters (length_function=len)
#    with 200 characters of overlap; separators are tried in the listed order
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    separators=["\n\n", "\n", " ", ""]
)
chunks = text_splitter.split_documents(documents)

print(f"생성된 청크 수: {len(chunks)}")

# 3. Embed the text of every chunk
chunk_texts = [chunk.page_content for chunk in chunks]
chunk_embeddings = embeddings.embed_documents(chunk_texts)

print(f"임베딩 완료: {len(chunk_embeddings)}개 청크")

In [7]:
# Store each chunk's text, embedding and metadata together.
# A comprehension keeps the loop variables out of the notebook namespace, so
# the `embedding` defined by an earlier cell is no longer overwritten with the
# last chunk's vector.
embedded_chunks = [
    {
        "text": chunk.page_content,
        "embedding": chunk_embedding,
        "metadata": {
            **chunk.metadata,
            "chunk_size": len(chunk.page_content),
            # NOTE(review): hard-coded label, not read from the configured
            # deployment -- confirm it matches the model actually deployed
            "embedding_model": "text-embedding-3-small"
        }
    }
    for chunk, chunk_embedding in zip(chunks, chunk_embeddings)
]

# Preview the first entry; guarded because indexing [0] raises IndexError
# when the splitter produced no chunks
if embedded_chunks:
    print(f"첫 번째 청크 메타데이터: {embedded_chunks[0]['metadata']}")

In [8]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two equal-length vectors.

    Args:
        vec1: Sequence or array of numbers.
        vec2: Sequence or array of numbers with the same length as ``vec1``.

    Returns:
        The dot product divided by the product of the two L2 norms, or
        ``0.0`` when either vector has zero norm (the similarity is undefined
        there; dividing would give nan and a RuntimeWarning).

    Raises:
        ValueError: If the vectors have different lengths (raised by ``np.dot``).
    """
    a = np.asarray(vec1, dtype=float)
    b = np.asarray(vec2, dtype=float)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

# Example: compare one query against two document sentences
query = "Azure의 AI 서비스는 무엇인가?"
doc1 = "Azure AI Search는 검색 서비스다."
doc2 = "Blob Storage는 파일 저장소다."

# Create the embeddings (embed_query is used for the documents as well)
query_embedding = embeddings.embed_query(query)
doc1_embedding = embeddings.embed_query(doc1)
doc2_embedding = embeddings.embed_query(doc2)

# Compute the similarity of the query to each document
similarity1 = cosine_similarity(query_embedding, doc1_embedding)
similarity2 = cosine_similarity(query_embedding, doc2_embedding)

# Print both scores with four decimals
print(f"질의 vs 문서1 유사도: {similarity1:.4f}")
print(f"질의 vs 문서2 유사도: {similarity2:.4f}")

In [9]:
def find_most_similar(query_embedding, doc_embeddings, top_k=3):
    """Rank documents by cosine similarity to the query.

    Returns a list of ``(index, similarity)`` tuples, highest similarity
    first, cut to the first ``top_k`` entries. ``index`` is the position of
    the document in ``doc_embeddings``.
    """
    scored = [
        (position, cosine_similarity(query_embedding, candidate))
        for position, candidate in enumerate(doc_embeddings)
    ]
    # highest similarity first
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

# Usage example
query = "RAG 시스템 구축 방법"
query_emb = embeddings.embed_query(query)

# Find the 3 chunks most similar to the query
top_docs = find_most_similar(query_emb, chunk_embeddings, top_k=3)

# Print each hit's chunk index, score and the first 100 characters of its text
print("가장 유사한 문서:")
for idx, similarity in top_docs:
    print(f"청크 {idx}: {similarity:.4f}")
    print(f"내용: {chunks[idx].page_content[:100]}...\n")

In [10]:
# Reduce to 512 dimensions (default 1536)
# With the AzureOpenAI client, `model` is the deployment name, so use the
# configured deployment like the earlier cells do. The previous literal is
# kept as the fallback when the variable is not set.
response = client.embeddings.create(
    input="Azure OpenAI 임베딩 테스트",
    model=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-3-small"),
    dimensions=512  # 1536 → 512
)

embedding_512 = response.data[0].embedding
print(f"축소된 차원: {len(embedding_512)}")

In [11]:
# Compare full dimensions (1536) with reduced dimensions (512)
text1 = "Azure는 클라우드 플랫폼이다."
text2 = "Microsoft의 클라우드 서비스다."
text3 = "Python은 프로그래밍 언어다."

# With the AzureOpenAI client, `model` is the deployment name, so use the
# configured deployment like the earlier cells do. The previous literal is
# kept as the fallback when the variable is not set.
embedding_deployment = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-3-small")

# 1536 dimensions (no `dimensions` argument)
resp_full = client.embeddings.create(
    input=[text1, text2, text3],
    model=embedding_deployment
)
emb_full = [d.embedding for d in resp_full.data]

# 512 dimensions
resp_reduced = client.embeddings.create(
    input=[text1, text2, text3],
    model=embedding_deployment,
    dimensions=512
)
emb_reduced = [d.embedding for d in resp_reduced.data]

# Compare the text1-text2 similarity at both sizes
sim_full_12 = cosine_similarity(emb_full[0], emb_full[1])
sim_reduced_12 = cosine_similarity(emb_reduced[0], emb_reduced[1])

print(f"1536 차원: text1-text2 유사도 = {sim_full_12:.4f}")
print(f"512 차원: text1-text2 유사도 = {sim_reduced_12:.4f}")
print(f"차이: {abs(sim_full_12 - sim_reduced_12):.4f}")

In [12]:
def embed_documents_batch(texts, batch_size=100):
    """Embed a large collection of texts in fixed-size batches.

    Args:
        texts: Iterable of strings. It is materialized into a list, so
            generators and tuples are accepted as well as lists.
        batch_size: Maximum number of texts per ``embed_documents`` call;
            must be at least 1.

    Returns:
        A list with one embedding per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative value
            would otherwise produce an empty range and silently return ``[]``.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    texts = list(texts)
    # ceiling division; 0 when there is nothing to embed
    total_batches = (len(texts) + batch_size - 1) // batch_size
    all_embeddings = []

    for batch_number, start in enumerate(range(0, len(texts), batch_size), start=1):
        batch = texts[start:start + batch_size]
        print(f"배치 {batch_number}/{total_batches} 처리 중...")

        # Embed this batch and append its vectors in order
        all_embeddings.extend(embeddings.embed_documents(batch))

    return all_embeddings

# Usage example (1000 documents)
# large_texts = [f"문서 {i} 내용..." for i in range(1000)]
# embeddings_result = embed_documents_batch(large_texts, batch_size=100)

In [13]:
from concurrent.futures import ThreadPoolExecutor
import time

def embed_chunk(texts_chunk):
    """Embed one slice of texts; the unit of work handed to a worker thread."""
    vectors = embeddings.embed_documents(texts_chunk)
    return vectors

def embed_parallel(texts, num_workers=4, chunk_size=25):
    """Embed texts concurrently with a thread pool.

    The input is cut into consecutive slices of ``chunk_size`` texts, each
    slice is embedded by ``embed_chunk`` on one of ``num_workers`` threads,
    and the per-slice results are concatenated in input order. The number of
    embeddings and the elapsed time are printed before returning.
    """
    # Cut the input into consecutive slices
    slice_starts = range(0, len(texts), chunk_size)
    pieces = [texts[begin:begin + chunk_size] for begin in slice_starts]

    # Run the slices on the pool; map() yields results in submission order
    started_at = time.time()
    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        per_piece_vectors = list(pool.map(embed_chunk, pieces))

    # Concatenate the per-slice results
    all_embeddings = []
    for piece_vectors in per_piece_vectors:
        all_embeddings.extend(piece_vectors)

    duration = time.time() - started_at
    print(f"병렬 처리 완료: {len(all_embeddings)}개 임베딩, {duration:.2f}초")

    return all_embeddings

# Usage example
# texts_sample = [f"샘플 텍스트 {i}" for i in range(100)]
# parallel_embeddings = embed_parallel(texts_sample, num_workers=4, chunk_size=25)

In [14]:
import hashlib
import json

class EmbeddingCache:
    """File-backed cache that maps a text's MD5 hash to its embedding.

    The whole cache is kept in memory as a dict and written to ``cache_file``
    as JSON after every miss.

    The key is derived from the text alone, so embeddings produced by
    different models or functions share entries for the same text; use a
    separate ``cache_file`` per embedding model.
    """

    def __init__(self, cache_file=".embedding_cache.json"):
        """Remember the cache path and load whatever it already contains."""
        self.cache_file = cache_file
        self.cache = self._load_cache()

    def _load_cache(self):
        """Load the cache file.

        Returns an empty dict when the file is missing, is not valid JSON /
        UTF-8 (e.g. a truncated write), or does not contain a JSON object.
        The cache can always be rebuilt, so a damaged file should not stop
        the notebook.
        """
        try:
            with open(self.cache_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except FileNotFoundError:
            return {}
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors
            return {}
        return loaded if isinstance(loaded, dict) else {}

    def _save_cache(self):
        """Write the cache to disk.

        The data goes to a sibling ``.tmp`` file first and is then moved over
        the real file, so an interrupted write cannot leave a half-written
        cache behind.
        """
        import os

        tmp_file = f"{self.cache_file}.tmp"
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(self.cache, f)
        os.replace(tmp_file, self.cache_file)

    def _get_hash(self, text):
        """Return the hex MD5 digest of the UTF-8 encoded text (cache key only)."""
        return hashlib.md5(text.encode("utf-8")).hexdigest()

    def get_embedding(self, text, embeddings_func):
        """Return the cached embedding for ``text``, creating it on a miss.

        Args:
            text: The string to embed.
            embeddings_func: Callable taking the text and returning its
                embedding; only called on a cache miss. The result must be
                JSON-serializable because it is written to the cache file.

        Returns:
            The cached or newly computed embedding.
        """
        text_hash = self._get_hash(text)

        # Cache lookup
        if text_hash in self.cache:
            print(f"캐시 히트: {text[:50]}...")
            return self.cache[text_hash]

        # Miss: compute the embedding
        print(f"캐시 미스: {text[:50]}...")
        embedding = embeddings_func(text)

        # Store it and persist the cache
        self.cache[text_hash] = embedding
        self._save_cache()

        return embedding

# Usage example (uses the default cache file in the working directory)
cache = EmbeddingCache()

text = "Azure OpenAI 서비스"
emb1 = cache.get_embedding(text, embeddings.embed_query)  # cache miss when the cache file has no entry for this text yet
emb2 = cache.get_embedding(text, embeddings.embed_query)  # cache hit (returned without calling embed_query)

In [15]:
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore

# Local file store used as the cache backend
store = LocalFileStore("./.embedding_cache/")

# Embeddings object wrapped with the cache
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=embeddings,
    document_embedding_cache=store,
    namespace="azure-openai-embeddings"
)

# Usage (caching happens inside embed_documents)
texts = ["Azure AI", "OpenAI Service", "Azure AI"]  # "Azure AI" appears twice
embeddings_result = cached_embedder.embed_documents(texts)
# The second "Azure AI" is meant to be served from the cache.
# NOTE(review): confirm whether a duplicate inside the same call is actually
# deduplicated on a cold cache, or whether hits only occur on later calls.

In [16]:
import tiktoken

def count_tokens(text, model="cl100k_base"):
    """Count the tokens in ``text``.

    Args:
        text: The string to tokenize.
        model: Either a tiktoken encoding name (the default, "cl100k_base")
            or a model name. The value is tried as an encoding name first and
            as a model name second, so the parameter accepts what its name
            suggests as well as what it accepted before.

    Returns:
        The number of tokens produced by the selected encoding.

    Raises:
        ValueError: If ``model`` is neither a known encoding name nor a known
            model name.
    """
    try:
        encoding = tiktoken.get_encoding(model)
    except ValueError as unknown_encoding:
        # Not an encoding name -- try it as a model name. Unknown names keep
        # raising the original ValueError.
        try:
            encoding = tiktoken.encoding_for_model(model)
        except KeyError:
            raise unknown_encoding from None
    return len(encoding.encode(text))

# Example: count the tokens of one sentence with the default encoding
text = "Azure OpenAI Embeddings는 텍스트를 벡터로 변환하는 서비스다."
token_count = count_tokens(text)

# Print the text and its token count
print(f"텍스트: {text}")
print(f"토큰 수: {token_count}")

In [17]:
def estimate_cost(token_count, model="text-embedding-3-small"):
    """Estimate the embedding cost in US dollars.

    The rates are kept per one million tokens and the token count is scaled
    to match. The previous table held per-1,000-token figures but multiplied
    them by the raw token count, which overstated every estimate by 1000x.

    Args:
        token_count: Total number of tokens to embed.
        model: Name of the embedding model to price.

    Returns:
        The estimated cost in USD. Models missing from the rate table are
        priced at 0, so a zero result may mean "unknown model".
    """
    usd_per_million_tokens = {
        "text-embedding-ada-002": 0.10,  # $0.10 / 1M tokens
        "text-embedding-3-small": 0.02,  # $0.02 / 1M tokens
        "text-embedding-3-large": 0.13   # $0.13 / 1M tokens
    }

    rate = usd_per_million_tokens.get(model, 0)
    cost = token_count * rate / 1_000_000

    return cost

# Example: 10,000 documents of 500 tokens each
total_tokens = 10000 * 500
cost_small = estimate_cost(total_tokens, "text-embedding-3-small")
cost_large = estimate_cost(total_tokens, "text-embedding-3-large")

# Print the token total and the estimated cost for both models
print(f"총 토큰: {total_tokens:,}")
print(f"text-embedding-3-small 비용: ${cost_small:.2f}")
print(f"text-embedding-3-large 비용: ${cost_large:.2f}")

In [18]:
from langchain_community.vectorstores.azuresearch import AzureSearch

# Initialize the Azure Search vector store.
# Endpoint and key come from the environment; the index name is fixed, and
# queries/documents are embedded with embeddings.embed_query.
vector_store = AzureSearch(
    azure_search_endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"),
    azure_search_key=os.getenv("AZURE_SEARCH_API_KEY"),
    index_name="rag-embeddings",
    embedding_function=embeddings.embed_query
)

# Documents to add (embedded automatically by the store)
from langchain_core.documents import Document

# NOTE(review): rebinds `documents`, which earlier cells also assign
documents = [
    Document(page_content="Azure는 Microsoft의 클라우드 플랫폼이다.", metadata={"source": "doc1"}),
    Document(page_content="RAG는 검색 증강 생성 기술이다.", metadata={"source": "doc2"}),
    Document(page_content="임베딩은 텍스트를 벡터로 변환한다.", metadata={"source": "doc3"})
]

# Add the documents (embeddings are created and uploaded by the store)
vector_store.add_documents(documents)
print(f"{len(documents)}개 문서가 Azure AI Search에 추가되었다.")

In [19]:
def check_embedding_quality(embeddings_list):
    """Summarize a list of embedding vectors.

    Returns ``None`` for an empty list. Otherwise returns a dict with the
    number of vectors (``count``), the length of the first vector
    (``dimensions``), the mean and standard deviation of the per-vector L2
    norms (``norm_mean``, ``norm_std``), and the mean and standard deviation
    over all individual values (``value_mean``, ``value_std``).
    """
    if not embeddings_list:
        return None

    # L2 norm of each vector, one vector at a time
    vector_norms = []
    for vector in embeddings_list:
        vector_norms.append(np.linalg.norm(vector))

    # All values in one array for the value-level statistics
    all_values = np.array(embeddings_list)

    return dict(
        count=len(embeddings_list),
        dimensions=len(embeddings_list[0]),
        norm_mean=np.mean(vector_norms),
        norm_std=np.std(vector_norms),
        value_mean=np.mean(all_values),
        value_std=np.std(all_values),
    )

# Usage example: embed four short strings and summarize the vectors
sample_texts = ["Azure", "OpenAI", "RAG", "Embeddings"]
sample_embeddings = embeddings.embed_documents(sample_texts)

quality = check_embedding_quality(sample_embeddings)
# Print one "- key: value" line per metric
print("임베딩 품질 메트릭:")
for key, value in quality.items():
    print(f"- {key}: {value}")