In [1]:

# imports
import os
import sys
import types
import json

# figure size/format
# Settings consumed by the setup code below: the fig_* values feed the
# matplotlib block, `plotly_connected` picks the plotly renderer,
# `is_dashboard` gates the dashboard-only configuration and `interactivity`
# (empty here, so that step is skipped) feeds the IPython shell setting.
fig_width = 7
fig_height = 5
fig_format = 'retina'
fig_dpi = 96
interactivity = ''
is_shiny = False
is_dashboard = False
plotly_connected = True

# matplotlib defaults / format
# Best effort: any failure (for example matplotlib not being installed) is
# swallowed so the rest of the setup still runs.
try:
  import matplotlib.pyplot as plt
  plt.rcParams['figure.figsize'] = (fig_width, fig_height)
  plt.rcParams['figure.dpi'] = fig_dpi
  plt.rcParams['savefig.dpi'] = fig_dpi
  from IPython.display import set_matplotlib_formats
  set_matplotlib_formats(fig_format)
except Exception:
  pass

# plotly use connected mode
# Best effort as well: errors are ignored when plotly is unavailable.
try:
  import plotly.io as pio
  if plotly_connected:
    pio.renderers.default = "notebook_connected"
  else:
    pio.renderers.default = "notebook"
  # Give every registered template the same margins (top 30, others 0).
  for template in pio.templates.keys():
    pio.templates[template].layout.margin = dict(t=30,r=0,b=0,l=0)
except Exception:
  pass

# disable itables paging for dashboards
# Only runs when `is_dashboard` is true. The whole group of option
# assignments is best effort: any exception (such as itables not being
# installed) is swallowed.
if is_dashboard:
  try:
    from itables import options
    options.dom = 'fiBrtlp'
    options.maxBytes = 1024 * 1024
    options.language = dict(info = "Showing _TOTAL_ entries")
    options.classes = "display nowrap compact"
    options.paging = False
    options.searching = True
    options.ordering = True
    options.info = True
    options.lengthChange = False
    options.autoWidth = False
    options.responsive = True
    options.keys = True
    options.buttons = []
  except Exception:
    pass
  
  try:
    import altair as alt
    # By default, dashboards will have container sized
    # vega visualizations which allows them to flow reasonably
    theme_sentinel = '_quarto-dashboard-internal'
    def make_theme(name):
        """Wrap the registered altair theme `name` with dashboard defaults.

        The theme callable is looked up immediately; the returned function
        calls it and fills in container sizing and default font sizes, but
        only for keys the theme did not set itself.
        """
        wrapped_theme = alt.themes._plugins[name]

        # (config section, keys to default, font size), in application order
        font_defaults = (
            ('axis', ('labelFontSize', 'titleFontSize'), 12),
            ('legend', ('labelFontSize', 'titleFontSize'), 12),
            ('header', ('labelFontSize', 'titleFontSize'), 13),
            ('title', ('fontSize',), 15),
            ('mark', ('fontSize',), 12),
        )
        # Default mark tooltips are switched off
        enable_tooltip = False

        def patch_theme(*args, **kwargs):
            theme = wrapped_theme()

            # Size to the enclosing container unless the theme says otherwise
            for dimension in ('height', 'width'):
                theme.setdefault(dimension, 'container')

            config = theme.setdefault('config', dict())
            for section, keys, size in font_defaults:
                section_config = config.setdefault(section, dict())
                for key in keys:
                    section_config.setdefault(key, size)

            # Mark tooltips
            if enable_tooltip:
                config['mark'].setdefault('tooltip', dict(content="encoding"))

            return theme

        return patch_theme

    # We can only do this once per session
    # The sentinel theme name doubles as the "already patched" marker: when
    # it is present the themes were wrapped earlier and are left alone.
    if theme_sentinel not in alt.themes.names():
      # Re-register every known theme under its own name, wrapped by make_theme
      for name in alt.themes.names():
        alt.themes.register(name, make_theme(name))

      # register a sentinel theme so we only do this once
      alt.themes.register(theme_sentinel, make_theme('default'))
      alt.themes.enable('default')

  except Exception:
    pass

# enable pandas latex repr when targeting pdfs
# Best effort: failures (for example pandas not being installed) are ignored.
# With fig_format set to 'retina' above, the option is not changed.
try:
  import pandas as pd
  if fig_format == 'pdf':
    pd.set_option('display.latex.repr', True)
except Exception:
  pass

# interactivity
# Skipped when `interactivity` is empty; otherwise the value is applied as
# the class-level InteractiveShell.ast_node_interactivity setting.
if interactivity:
  from IPython.core.interactiveshell import InteractiveShell
  InteractiveShell.ast_node_interactivity = interactivity

# NOTE: the kernel_deps code is repeated in the cleanup.py file
# (we can't easily share this code b/c of the way it is run).
# If you edit this code also edit the same code in cleanup.py!

# output kernel dependencies
# Builds a {source file path: modification time} map for every imported
# module that has a file on disk and prints it as a single JSON object.
kernel_deps = dict()
# Iterate over a snapshot so changes to sys.modules during the loop are harmless
for module in list(sys.modules.values()):
  # Some modules play games with sys.modules (e.g. email/__init__.py
  # in the standard library), and occasionally this can cause strange
  # failures in getattr.  Just ignore anything that's not an ordinary
  # module.
  if not isinstance(module, types.ModuleType):
    continue
  path = getattr(module, "__file__", None)
  # Modules without a file path are skipped
  if not path:
    continue
  # Map a compiled file name (.pyc/.pyo) to its .py name by dropping the last character
  if path.endswith(".pyc") or path.endswith(".pyo"):
    path = path[:-1]
  if not os.path.exists(path):
    continue
  kernel_deps[path] = os.stat(path).st_mtime
print(json.dumps(kernel_deps))

# set run_path if requested
# The condition is a non-empty string literal, so it is always true and the
# working directory is always changed.
# NOTE(review): the path is an absolute, machine-specific Windows path; the
# notebook will fail here on any machine where that directory does not exist.
if r'C:\Users\kmkim\Desktop\projects\blog\docs\blog\posts\RAG\13-Cloud-RAG':
  os.chdir(r'C:\Users\kmkim\Desktop\projects\blog\docs\blog\posts\RAG\13-Cloud-RAG')

# reset state
%reset

# shiny
# Checking for shiny by using False directly because we're after the %reset. We don't want
# to set a variable that stays in global scope.
# The literal False disables this whole block: nothing below executes.
if False:
  try:
    import htmltools as _htmltools
    import ast as _ast

    _htmltools.html_dependency_render_mode = "json"

    # This decorator will be added to all function definitions
    # It displays the object's HTML representation when it has a
    # `_repr_html_` method and always returns the object unchanged.
    def _display_if_has_repr_html(x):
      try:
        # IPython 7.14 preferred import
        from IPython.display import display, HTML
      except:
        from IPython.core.display import display, HTML

      if hasattr(x, '_repr_html_'):
        display(HTML(x._repr_html_()))
      return x

    # Ideally we would undo the call to ast_transformers.append
    # at the end of this block whenever an error occurs. We leave
    # it in place for now, as it will only be a problem if the user
    # switches from shiny to not-shiny mode (and even then it likely
    # won't matter).
    # The decorator is published on `builtins` so the bare name inserted
    # by the AST transformer below resolves from any namespace.
    import builtins
    builtins._display_if_has_repr_html = _display_if_has_repr_html

    # AST transformer that puts `_display_if_has_repr_html` first in the
    # decorator list of every sync and async function definition it visits.
    # The visit methods return the node without calling generic_visit.
    class _FunctionDefReprHtml(_ast.NodeTransformer):
      def visit_FunctionDef(self, node):
        node.decorator_list.insert(
          0,
          _ast.Name(id="_display_if_has_repr_html", ctx=_ast.Load())
        )
        return node

      def visit_AsyncFunctionDef(self, node):
        node.decorator_list.insert(
          0,
          _ast.Name(id="_display_if_has_repr_html", ctx=_ast.Load())
        )
        return node

    # Install the transformer on the running IPython shell
    ip = get_ipython()
    ip.ast_transformers.append(_FunctionDefReprHtml())

  except:
    pass

def ojs_define(**kwargs):
  """Publish Python values to the page inside an ``ojs-define`` script tag.

  Each keyword argument becomes one ``{"name": ..., "value": ...}`` entry in
  the JSON payload. pandas Series/DataFrame values are converted to a
  ``{column: [values, ...]}`` dict; everything else is passed to
  ``json.dumps`` unchanged.

  Args:
    **kwargs: name/value pairs to expose. Values must be JSON serializable
      after conversion.

  Raises:
    TypeError: from ``json.dumps`` when a value cannot be serialized.
  """
  import json
  try:
    # IPython 7.14 preferred import
    from IPython.display import display, HTML
  except ImportError:
    # Only a failed import should trigger the legacy location; a bare
    # except would also swallow KeyboardInterrupt/SystemExit.
    from IPython.core.display import display, HTML

  # do some minor magic for convenience when handling pandas
  # dataframes
  def convert(v):
    try:
      import pandas as pd
    except ModuleNotFoundError: # don't do the magic when pandas is not available
      return v
    # Exact type checks on purpose: subclasses are passed through untouched
    if type(v) is pd.Series:
      v = pd.DataFrame(v)
    if type(v) is pd.DataFrame:
      # Transposing first makes the 'split' index hold the column names and
      # each data row hold that column's values.
      j = json.loads(v.T.to_json(orient='split'))
      return dict(zip(j["index"], j["data"]))
    return v

  payload = dict(
    contents=[dict(name=key, value=convert(value)) for key, value in kwargs.items()]
  )
  display(HTML('<script type="ojs-define">' + json.dumps(payload) + '</script>'), metadata=dict(ojs_define = True))
globals()["ojs_define"] = ojs_define


In [2]:
from langchain_openai import AzureChatOpenAI
from dotenv import load_dotenv
import os

# Load environment variables (python-dotenv) before the AZURE_OPENAI_* lookups below
load_dotenv()

# Chat model configured entirely from environment variables.
# NOTE(review): os.getenv returns None for a missing variable, so a missing
# setting only surfaces when the client is built or first called.
llm = AzureChatOpenAI(
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
    openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    temperature=0,  # deterministic output
    max_tokens=1000  # maximum output length
)

# Test
response = llm.invoke("Azure OpenAI란 무엇인가?")
print(response.content)

In [3]:
from langchain_core.messages import HumanMessage, SystemMessage

# A system message followed by a human message, passed to the model as a list
messages = [
    SystemMessage(content="당신은 Azure 전문가입니다."),
    HumanMessage(content="Azure AI Search를 설명해주세요.")
]

# Rebinds `response`, replacing the result of the previous cell
response = llm.invoke(messages)
print(response.content)

In [4]:
from langchain_core.prompts import ChatPromptTemplate

# Minimal question-answering prompt with two template variables:
# {context} and {question}.
basic_prompt = ChatPromptTemplate.from_template(
    """다음 컨텍스트를 참고하여 질문에 답변하세요.

컨텍스트:
{context}

질문: {question}

답변:"""
)

In [5]:
# Prompt with explicit guidelines: use only the context, reply
# "잘 모르겠습니다" when unsure, answer in Korean, and explain technical
# terms simply. Template variables: {context} and {question}.
korean_prompt = ChatPromptTemplate.from_template(
    """당신은 친절한 AI 어시스턴트입니다.
주어진 컨텍스트를 바탕으로 사용자의 질문에 정확하고 상세하게 답변하세요.

## 지침:
1. 컨텍스트에 있는 정보만 사용하세요
2. 확실하지 않으면 "잘 모르겠습니다"라고 답하세요
3. 답변은 한국어로 작성하세요
4. 전문 용어는 쉽게 설명하세요

## 컨텍스트:
{context}

## 질문:
{question}

## 답변:"""
)

In [6]:
# Prompt that asks for a fixed answer layout: a one-sentence summary, three
# bullet points of detail, and a source quoted from the context.
# Template variables: {context} and {question}.
structured_prompt = ChatPromptTemplate.from_template(
    """다음 컨텍스트를 참고하여 질문에 답변하세요.

컨텍스트:
{context}

질문: {question}

답변은 다음 형식으로 작성하세요:

**요약:** (한 문장 요약)

**상세 설명:**
- 핵심 포인트 1
- 핵심 포인트 2
- 핵심 포인트 3

**출처:** (컨텍스트에서 인용)

답변:"""
)

In [7]:
# Few-shot prompt: two worked question/answer examples are embedded in the
# template ahead of the real {context} and {question}.
few_shot_prompt = ChatPromptTemplate.from_template(
    """다음 예시를 참고하여 질문에 답변하세요.

예시 1:
질문: Azure Blob Storage란?
답변: Azure Blob Storage는 Microsoft의 클라우드 객체 스토리지 서비스입니다. 
대량의 비구조화 데이터를 저장할 수 있으며, Hot/Cool/Archive 티어를 제공합니다.

예시 2:
질문: Document Intelligence의 용도는?
답변: Document Intelligence는 OCR 및 문서 분석 서비스입니다.
PDF, 이미지에서 텍스트, 표, 레이아웃을 추출하여 RAG 시스템의 입력으로 사용합니다.

이제 실제 질문에 답변하세요:

컨텍스트:
{context}

질문: {question}

답변:"""
)

In [8]:
def compress_context(docs, max_tokens=3000, model="gpt-4", encoding=None):
    """Join document texts into one context string that fits a token budget.

    Documents are taken in order. Each one that fits in the remaining budget
    is kept whole; the first one that does not fit is cut down to the
    remaining budget, suffixed with "...", and ends the scan. When no budget
    remains, nothing is appended for that document.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.
        max_tokens: Token budget for the kept text. The "..." marker and the
            blank-line separators are not counted against it.
        model: Model name used to pick the tiktoken encoding when
            ``encoding`` is not supplied.
        encoding: Optional tokenizer with ``encode(str) -> list`` and
            ``decode(list) -> str``. Lets callers reuse an encoding they
            already built.

    Returns:
        The kept texts joined by blank lines, or "" when nothing fits.
    """
    if encoding is None:
        # Imported lazily so a caller-supplied tokenizer needs no tiktoken
        import tiktoken

        encoding = tiktoken.encoding_for_model(model)

    compressed = []
    total_tokens = 0

    for doc in docs:
        tokens = encoding.encode(doc.page_content)
        doc_tokens = len(tokens)

        if total_tokens + doc_tokens <= max_tokens:
            compressed.append(doc.page_content)
            total_tokens += doc_tokens
            continue

        # Cut the first document that overflows to the space that is left.
        remaining = max_tokens - total_tokens
        if remaining > 0:
            # A token boundary can fall inside a multi-byte character (common
            # with Korean text); the partial bytes decode to U+FFFD, so any
            # trailing replacement characters are dropped.
            truncated = encoding.decode(tokens[:remaining]).rstrip("\ufffd")
            compressed.append(truncated + "...")
        # With remaining <= 0 there is no room: emitting a bare "..." chunk
        # (or slicing with a negative index) would only add noise.
        break

    return "\n\n".join(compressed)

# 사용
# context = compress_context(retrieved_docs, max_tokens=3000)

In [9]:
from langchain.document_transformers import LongContextReorder

def reorder_context(docs):
    """Place the important documents at both ends of the list.

    Delegates to ``LongContextReorder.transform_documents`` and returns its
    result unchanged.
    """
    return LongContextReorder().transform_documents(docs)

# 사용: 첫 번째와 마지막 문서가 가장 중요
# reordered_docs = reorder_context(retrieved_docs)

In [10]:
# Chain-of-thought style prompt: the template spells out three reasoning
# steps (identify the core of the question, find relevant information in the
# context, combine it into an answer). Template variables: {context} and
# {question}.
cot_prompt = ChatPromptTemplate.from_template(
    """다음 컨텍스트를 참고하여 질문에 단계적으로 답변하세요.

컨텍스트:
{context}

질문: {question}

단계별 추론:
1. 먼저 질문의 핵심을 파악합니다
2. 컨텍스트에서 관련 정보를 찾습니다
3. 정보를 종합하여 답변을 구성합니다

답변:"""
)

In [11]:
# Self-ask prompt: the model is told to write and answer sub-questions first
# and then give a combined final answer. The bracketed slots are literal
# placeholders for the model to fill in; only {context} and {question} are
# template variables.
self_ask_prompt = ChatPromptTemplate.from_template(
    """질문에 답하기 위해 필요한 하위 질문을 먼저 생성하고 답변하세요.

컨텍스트:
{context}

질문: {question}

하위 질문 1: [질문]
답변 1: [답변]

하위 질문 2: [질문]
답변 2: [답변]

최종 답변: [종합 답변]"""
)

In [12]:
# Citation prompt: asks the model to mark its source after the answer text
# in the form "[출처: ...]". Template variables: {context} and {question}.
citation_prompt = ChatPromptTemplate.from_template(
    """컨텍스트를 참고하여 답변하고, 인용 출처를 표시하세요.

컨텍스트:
{context}

질문: {question}

답변 형식:
답변 내용 [출처: 문서명 또는 페이지]

답변:"""
)

In [13]:
# Enable streaming
# Same environment-driven connection settings as `llm`, with streaming=True
# and no max_tokens limit.
streaming_llm = AzureChatOpenAI(
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
    openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    streaming=True,
    temperature=0
)

# Run with streaming
# Chunks are printed as they arrive; end="" keeps them on one line and
# flush=True writes each piece immediately.
for chunk in streaming_llm.stream("Azure AI Search란?"):
    print(chunk.content, end="", flush=True)

In [14]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Streaming RAG chain
# NOTE(review): `retriever` and `format_docs` are not defined anywhere in the
# visible notebook, so this cell raises NameError on a fresh kernel.
# Presumably they come from an earlier post in the series; confirm, or define
# them before this cell.
streaming_chain = (
    # The input question is passed through unchanged as "question"; the
    # "context" entry comes from the retriever piped into format_docs.
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | korean_prompt
    | streaming_llm
    | StrOutputParser()
)

# Run
print("답변: ", end="")
for chunk in streaming_chain.stream("Azure AI Search의 장점은?"):
    print(chunk, end="", flush=True)
print()

In [15]:
# Same connection settings as `llm`, plus a response_format of
# {"type": "json_object"} passed through model_kwargs.
json_llm = AzureChatOpenAI(
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
    openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    temperature=0,
    model_kwargs={"response_format": {"type": "json_object"}}
)

# The doubled braces keep the JSON example literal instead of being parsed as
# template variables; only {context} and {question} are substituted.
json_prompt = ChatPromptTemplate.from_template(
    """다음 컨텍스트를 참고하여 질문에 JSON 형식으로 답변하세요.

컨텍스트:
{context}

질문: {question}

JSON 형식:
{{
  "answer": "답변 내용",
  "confidence": "high/medium/low",
  "sources": ["출처1", "출처2"]
}}

JSON 응답:"""
)

# Usage
response = json_llm.invoke(
    json_prompt.invoke({
        "context": "Azure AI Search는 벡터 검색을 지원합니다.",
        "question": "Azure AI Search의 주요 기능은?"
    })
)

import json
# json.loads raises if the reply is not valid JSON, and the lookups below
# raise KeyError if the model omits "answer" or "confidence".
result = json.loads(response.content)
print(f"답변: {result['answer']}")
print(f"신뢰도: {result['confidence']}")

In [16]:
from langchain_core.tools import tool

# NOTE(review): the docstrings below are left verbatim on purpose. The @tool
# decorator presumably uses a function's docstring as the tool description
# sent to the model, which would make them runtime text; confirm before
# editing or translating them.

# Stub search tool: returns a fixed message that embeds the query.
@tool
def search_documents(query: str) -> str:
    """문서를 검색하는 도구"""
    # In a real setup this would use a retriever
    return f"'{query}'에 대한 검색 결과입니다."

# Stub metadata tool: echoes the id with a hard-coded title and date.
@tool
def get_metadata(doc_id: str) -> dict:
    """문서 메타데이터를 가져오는 도구"""
    return {"id": doc_id, "title": "샘플 문서", "date": "2025-01-01"}

# Tool list bound to the model in the next cell
tools = [search_documents, get_metadata]

In [17]:
# Bind the tools to the model
llm_with_tools = llm.bind_tools(tools)

# Run
response = llm_with_tools.invoke("Azure AI Search 문서를 검색해줘")

# Check for tool calls
# Nothing is printed when the model made no tool calls; the tools themselves
# are not executed here, only the requested name and arguments are shown.
if response.tool_calls:
    for tool_call in response.tool_calls:
        print(f"도구: {tool_call['name']}")
        print(f"인자: {tool_call['args']}")

In [18]:
import tiktoken

def count_tokens(text, model="gpt-4"):
    """Return the number of tokens in `text` under the tiktoken encoding for `model`."""
    token_ids = tiktoken.encoding_for_model(model).encode(text)
    return len(token_ids)

# Check the prompt's token count
# Renders korean_prompt with sample values and flattens it to one string
prompt_text = korean_prompt.invoke({
    "context": "샘플 컨텍스트",
    "question": "샘플 질문"
}).to_string()

# Uses count_tokens' default model ("gpt-4")
tokens = count_tokens(prompt_text)
print(f"프롬프트 토큰: {tokens}")

In [19]:
def estimate_cost(input_tokens, output_tokens, model="gpt-4o"):
    """Estimate the cost in USD of a request from its token counts.

    Rates are expressed per one million tokens. A model name that is not in
    the rate table is priced as "gpt-4o".
    """
    # USD per 1M tokens
    rate_table = {
        "gpt-4o": {"input": 5.0, "output": 15.0},
        "gpt-4-turbo": {"input": 10.0, "output": 30.0},
        "gpt-3.5-turbo": {"input": 0.5, "output": 1.5},
    }
    rates = rate_table[model] if model in rate_table else rate_table["gpt-4o"]

    prompt_cost = (input_tokens / 1_000_000) * rates["input"]
    completion_cost = (output_tokens / 1_000_000) * rates["output"]
    return prompt_cost + completion_cost

# Example: 1,000 questions (3,000 input and 500 output tokens each)
total_cost = estimate_cost(3000 * 1000, 500 * 1000, "gpt-4o")
print(f"예상 비용: ${total_cost:.2f}")

In [20]:
# Import the handler from langchain_core, the package the rest of this
# notebook already imports from, rather than the legacy `langchain.callbacks`
# path.
from langchain_core.callbacks import StdOutCallbackHandler

# Callback handler
callback = StdOutCallbackHandler()

# Pass the callback when invoking the LLM
response = llm.invoke(
    "Azure AI Search란?",
    config={"callbacks": [callback]}
)

# Check token usage
# .get() prints None instead of raising when the metadata has no 'token_usage' entry
print(f"토큰 사용량: {response.response_metadata.get('token_usage')}")

In [21]:
from tenacity import retry, stop_after_attempt, wait_exponential

@retry(
    wait=wait_exponential(multiplier=1, min=4, max=10),
    stop=stop_after_attempt(3),
)
def invoke_llm_with_retry(messages):
    """Call the module-level `llm` with retries.

    The retry policy stops after three attempts and waits between attempts
    with exponential backoff (multiplier 1, minimum 4, maximum 10).
    """
    result = llm.invoke(messages)
    return result

# Usage
# Any exception that escapes the retry wrapper is reported rather than raised
try:
    response = invoke_llm_with_retry("테스트 질문")
    print(response.content)
except Exception as e:
    print(f"LLM 호출 실패: {e}")

In [22]:
import time

def invoke_with_rate_limit(query, requests_per_minute=60):
    """Invoke the module-level `llm`, then pause to respect a request rate.

    After the call returns, the function sleeps for ``60 / requests_per_minute``
    seconds, so back-to-back calls are spaced at least that far apart.

    Args:
        query: Input passed to ``llm.invoke`` unchanged.
        requests_per_minute: Target request rate; must be positive.

    Returns:
        Whatever ``llm.invoke(query)`` returns.

    Raises:
        ValueError: If ``requests_per_minute`` is zero or negative. The check
            runs before the model is called, so an invalid rate never costs
            a request.
    """
    if requests_per_minute <= 0:
        raise ValueError(
            f"requests_per_minute must be positive, got {requests_per_minute!r}"
        )
    sleep_time = 60 / requests_per_minute

    response = llm.invoke(query)
    time.sleep(sleep_time)

    return response