In [1]:

# imports
import os
import sys
import types
import json

# figure size/format
# Settings consumed by the rest of this setup cell:
#   fig_width / fig_height / fig_dpi -> matplotlib rcParams below
#   fig_format        -> matplotlib inline output format; also compared to 'pdf' further down
#   interactivity     -> if non-empty, assigned to InteractiveShell.ast_node_interactivity further down
#   is_dashboard      -> gates the itables/altair dashboard configuration further down
#   plotly_connected  -> chooses between the two plotly notebook renderers below
#   is_shiny          -> not read anywhere in this cell (the shiny block tests a literal instead)
fig_width = 7
fig_height = 5
fig_format = 'retina'
fig_dpi = 96
interactivity = ''
is_shiny = False
is_dashboard = False
plotly_connected = True

# matplotlib defaults / format
# Best effort: any failure (e.g. matplotlib not installed) is swallowed so the
# setup cell never aborts the notebook.
try:
  import matplotlib.pyplot as plt
  plt.rcParams['figure.figsize'] = (fig_width, fig_height)
  plt.rcParams['figure.dpi'] = fig_dpi
  plt.rcParams['savefig.dpi'] = fig_dpi
  # NOTE(review): IPython documents this import location as deprecated in favour
  # of matplotlib_inline.backend_inline.set_matplotlib_formats - confirm before changing.
  from IPython.display import set_matplotlib_formats
  set_matplotlib_formats(fig_format)
except Exception:
  pass

# plotly use connected mode
# Also best effort; additionally overrides the layout margin of every
# registered plotly template (30 on top, 0 on the other sides).
try:
  import plotly.io as pio
  if plotly_connected:
    pio.renderers.default = "notebook_connected"
  else:
    pio.renderers.default = "notebook"
  for template in pio.templates.keys():
    pio.templates[template].layout.margin = dict(t=30,r=0,b=0,l=0)
except Exception:
  pass

# disable itables paging for dashboards
# Everything in this branch runs only when is_dashboard is truthy. Both inner
# try blocks are best effort: a missing package or any other error is ignored.
if is_dashboard:
  try:
    from itables import options
    options.dom = 'fiBrtlp'
    options.maxBytes = 1024 * 1024
    options.language = dict(info = "Showing _TOTAL_ entries")
    options.classes = "display nowrap compact"
    options.paging = False
    options.searching = True
    options.ordering = True
    options.info = True
    options.lengthChange = False
    options.autoWidth = False
    options.responsive = True
    options.keys = True
    options.buttons = []
  except Exception:
    pass
  
  try:
    import altair as alt
    # By default, dashboards will have container sized
    # vega visualizations which allows them to flow reasonably
    theme_sentinel = '_quarto-dashboard-internal'
    # make_theme(name) returns a replacement theme callable for the theme
    # registered under `name`. The replacement calls the original theme and
    # fills in defaults only for keys the original did not set.
    def make_theme(name):
        # Relies on the private `_plugins` mapping of alt.themes to fetch the
        # original theme callable.
        nonTheme = alt.themes._plugins[name]    
        # Positional/keyword arguments are accepted but not used.
        def patch_theme(*args, **kwargs):
            existingTheme = nonTheme()
            if 'height' not in existingTheme:
              existingTheme['height'] = 'container'
            if 'width' not in existingTheme:
              existingTheme['width'] = 'container'

            if 'config' not in existingTheme:
              existingTheme['config'] = dict()
            
            # Configure the default font sizes
            title_font_size = 15
            header_font_size = 13
            axis_font_size = 12
            legend_font_size = 12
            mark_font_size = 12
            # With tooltip fixed to False the "Mark tooltips" step below never runs.
            tooltip = False

            config = existingTheme['config']

            # The Axis
            if 'axis' not in config:
              config['axis'] = dict()
            axis = config['axis']
            if 'labelFontSize' not in axis:
              axis['labelFontSize'] = axis_font_size
            if 'titleFontSize' not in axis:
              axis['titleFontSize'] = axis_font_size  

            # The legend
            if 'legend' not in config:
              config['legend'] = dict()
            legend = config['legend']
            if 'labelFontSize' not in legend:
              legend['labelFontSize'] = legend_font_size
            if 'titleFontSize' not in legend:
              legend['titleFontSize'] = legend_font_size  

            # The header
            if 'header' not in config:
              config['header'] = dict()
            header = config['header']
            if 'labelFontSize' not in header:
              header['labelFontSize'] = header_font_size
            if 'titleFontSize' not in header:
              header['titleFontSize'] = header_font_size    

            # Title
            if 'title' not in config:
              config['title'] = dict()
            title = config['title']
            if 'fontSize' not in title:
              title['fontSize'] = title_font_size

            # Marks
            if 'mark' not in config:
              config['mark'] = dict()
            mark = config['mark']
            if 'fontSize' not in mark:
              mark['fontSize'] = mark_font_size

            # Mark tooltips
            if tooltip and 'tooltip' not in mark:
              mark['tooltip'] = dict(content="encoding")

            return existingTheme
            
        return patch_theme

    # We can only do this once per session
    # The sentinel theme name doubles as the "already patched" marker: if it is
    # registered, every theme has been wrapped before and wrapping is skipped.
    if theme_sentinel not in alt.themes.names():
      for name in alt.themes.names():
        alt.themes.register(name, make_theme(name))
      
      # register a sentinel theme so we only do this once
      alt.themes.register(theme_sentinel, make_theme('default'))
      alt.themes.enable('default')

  except Exception:
    pass

# enable pandas latex repr when targeting pdfs
# Best effort: skipped silently when pandas is missing or the option is rejected.
try:
  import pandas as pd
  if fig_format == 'pdf':
    pd.set_option('display.latex.repr', True)
except Exception:
  pass

# interactivity
# Only applied when `interactivity` is a non-empty string; the value is
# assigned on the InteractiveShell class itself, not on a shell instance.
if interactivity:
  from IPython.core.interactiveshell import InteractiveShell
  InteractiveShell.ast_node_interactivity = interactivity

# NOTE: the kernel_deps code is repeated in the cleanup.py file
# (we can't easily share this code b/c of the way it is run).
# If you edit this code also edit the same code in cleanup.py!

# output kernel dependencies
# Builds a mapping {source file path: modification time} for every module
# currently loaded in the kernel and prints it as a single JSON line.
kernel_deps = dict()
# Iterate over a snapshot (list) of the loaded modules.
for module in list(sys.modules.values()):
  # Some modules play games with sys.modules (e.g. email/__init__.py
  # in the standard library), and occasionally this can cause strange
  # failures in getattr.  Just ignore anything that's not an ordinary
  # module.
  if not isinstance(module, types.ModuleType):
    continue
  # Modules without a __file__ (or with an empty one) are skipped.
  path = getattr(module, "__file__", None)
  if not path:
    continue
  # Drop the trailing "c"/"o" so a compiled-file path becomes the ".py" path.
  if path.endswith(".pyc") or path.endswith(".pyo"):
    path = path[:-1]
  # Skip paths that do not exist on disk.
  if not os.path.exists(path):
    continue
  kernel_deps[path] = os.stat(path).st_mtime
print(json.dumps(kernel_deps))

# set run_path if requested
# The condition is a non-empty string literal, so it is always true here and the
# working directory is always changed. Presumably the literal is substituted by
# the tool that generates this cell and an empty value would skip the chdir - TODO confirm.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run unchanged on another machine.
if r'C:\Users\kmkim\Desktop\projects\blog\docs\blog\posts\Agent\13-Cloud-RAG':
  os.chdir(r'C:\Users\kmkim\Desktop\projects\blog\docs\blog\posts\Agent\13-Cloud-RAG')

# reset state
# IPython magic (not plain Python): clears the names defined above, which is why
# the code after this point cannot rely on any variable set earlier in the cell.
%reset

# shiny
# Checking for shiny by using False directly because we're after the %reset. We don't want
# to set a variable that stays in global scope.
# With the literal False below, nothing inside this branch executes.
if False:
  try:
    import htmltools as _htmltools
    import ast as _ast

    _htmltools.html_dependency_render_mode = "json"

    # This decorator will be added to all function definitions
    # It displays the decorated object's HTML repr (when it has one) and
    # returns the object unchanged.
    def _display_if_has_repr_html(x):
      try:
        # IPython 7.14 preferred import
        from IPython.display import display, HTML
      except:
        from IPython.core.display import display, HTML

      if hasattr(x, '_repr_html_'):
        display(HTML(x._repr_html_()))
      return x

    # ideally we would undo the call to ast_transformers.append
    # at the end of this block whenever an error occurs, we do 
    # this for now as it will only be a problem if the user 
    # switches from shiny to not-shiny mode (and even then likely
    # won't matter)
    # The decorator is published on `builtins` so the name inserted by the AST
    # transformer below resolves from any user cell.
    import builtins
    builtins._display_if_has_repr_html = _display_if_has_repr_html

    # AST transformer that prepends the decorator above to every sync and
    # async function definition it visits. The visit_* methods return the node
    # without calling generic_visit, so they do not descend into the body.
    class _FunctionDefReprHtml(_ast.NodeTransformer):
      def visit_FunctionDef(self, node):
        node.decorator_list.insert(
          0,
          _ast.Name(id="_display_if_has_repr_html", ctx=_ast.Load())
        )
        return node

      def visit_AsyncFunctionDef(self, node):
        node.decorator_list.insert(
          0,
          _ast.Name(id="_display_if_has_repr_html", ctx=_ast.Load())
        )
        return node

    ip = get_ipython()
    ip.ast_transformers.append(_FunctionDefReprHtml())

  except:
    pass

def ojs_define(**kwargs):
  """Emit the keyword arguments as an ``ojs-define`` script element.

  Each keyword becomes a ``{"name": ..., "value": ...}`` entry under the
  ``contents`` key of a JSON payload. The payload is wrapped in a
  ``<script type="ojs-define">`` tag and displayed as HTML with the display
  metadata ``{"ojs_define": True}``.

  Values that are exactly a pandas ``Series`` or ``DataFrame`` are converted
  to a dict mapping each column label to its list of values; all other values
  are passed through unchanged. The conversion is skipped when pandas is not
  installed.
  """
  import json
  try:
    # IPython 7.14 preferred import
    from IPython.display import display, HTML
  except:
    from IPython.core.display import display, HTML

  def convert(value):
    # Convenience handling for pandas objects only; without pandas the value
    # is returned untouched.
    try:
      import pandas as pd
    except ModuleNotFoundError:
      return value
    if type(value) == pd.Series:
      value = pd.DataFrame(value)
    if type(value) != pd.DataFrame:
      return value
    # Transposing first makes the "split" orientation list one row per column.
    split = json.loads(value.T.to_json(orient='split'))
    return dict(zip(split["index"], split["data"]))

  payload = {
    "contents": [
      {"name": name, "value": convert(raw)} for name, raw in kwargs.items()
    ]
  }
  markup = '<script type="ojs-define">' + json.dumps(payload) + '</script>'
  display(HTML(markup), metadata={"ojs_define": True})
globals()["ojs_define"] = ojs_define


In [2]:
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from dotenv import load_dotenv
import os

load_dotenv()

# Create the client
endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")

# Fail fast with an actionable message: os.getenv returns None for unset
# variables, which would otherwise surface as an opaque error from the SDK.
_missing_settings = [
    name
    for name, value in (
        ("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT", endpoint),
        ("AZURE_DOCUMENT_INTELLIGENCE_KEY", key),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key)
)

# Analyze a local PDF file
with open("sample.pdf", "rb") as f:
    poller = document_analysis_client.begin_analyze_document(
        "prebuilt-layout", document=f
    )
    # Wait for the result while the file is still open.
    result = poller.result()

# Print a summary of the result
print(f"분석된 페이지 수: {len(result.pages)}")
print(f"추출된 단락 수: {len(result.paragraphs)}")
print(f"추출된 표 수: {len(result.tables)}")

In [3]:
# Analyze a document referenced by an Azure Blob Storage URL
# NOTE(review): "<sas-token>" looks like a placeholder that must be replaced
# with a real SAS query string before this cell can succeed - confirm.
document_url = "https://stragdocs2025.blob.core.windows.net/rag-documents/sample.pdf?<sas-token>"

poller = document_analysis_client.begin_analyze_document_from_url(
    "prebuilt-layout", document_url=document_url
)
# Rebinds the notebook-level `result` that the following cells read.
result = poller.result()

print("문서 분석 완료")

In [4]:
def extract_full_text(result):
    """Concatenate the text of every page of an analysis result.

    Args:
        result: Object exposing ``pages``; each page exposes ``lines`` and
            each line exposes ``content``.

    Returns:
        str: Lines of a page joined by a newline, pages joined by a blank line.
    """
    return "\n\n".join(
        "\n".join(line.content for line in page.lines)
        for page in result.pages
    )

# Usage example: extract the text of the most recent `result` and preview
# its first 500 characters.
text = extract_full_text(result)
print(f"추출된 텍스트 (앞 500자):\n{text[:500]}")

In [5]:
def extract_paragraphs(result):
    """Return one record per paragraph of an analysis result.

    Args:
        result: Object exposing ``paragraphs``; each paragraph exposes
            ``content``, ``role`` and ``bounding_regions``.

    Returns:
        list[dict]: Dicts with the keys ``content``, ``role`` and
        ``page_number``. ``page_number`` is taken from the first bounding
        region, or is ``None`` when the paragraph has no bounding regions.
    """
    def first_page(paragraph):
        regions = paragraph.bounding_regions
        return regions[0].page_number if regions else None

    return [
        {
            "content": item.content,
            "role": item.role,
            "page_number": first_page(item),
        }
        for item in result.paragraphs
    ]

# Usage example: extract the paragraphs of the most recent `result`.
paragraphs = extract_paragraphs(result)
print(f"총 {len(paragraphs)}개 단락 추출")

# Keep only the paragraphs whose role is exactly "title" and list them.
titles = [p for p in paragraphs if p["role"] == "title"]
print(f"\n문서 제목들:")
for title in titles:
    print(f"- {title['content']} (페이지 {title['page_number']})")

In [6]:
def extract_tables(result):
    """Return one record per table of an analysis result.

    Args:
        result: Object exposing ``tables``; each table exposes ``row_count``,
            ``column_count``, ``bounding_regions`` and ``cells``.

    Returns:
        list[dict]: Dicts with the keys ``table_id`` (1-based position),
        ``row_count``, ``column_count``, ``page_number`` and ``cells``.
        ``page_number`` is taken from the first bounding region, or is
        ``None`` when the table has no bounding regions. Each entry of
        ``cells`` holds ``row_index``, ``column_index``, ``content`` and
        ``kind``.
    """
    tables_data = []

    for table_id, table in enumerate(result.tables, start=1):
        # A table without bounding regions must not abort the whole extraction;
        # report an unknown page the same way extract_paragraphs does.
        regions = table.bounding_regions
        page_number = regions[0].page_number if regions else None

        tables_data.append({
            "table_id": table_id,
            "row_count": table.row_count,
            "column_count": table.column_count,
            "page_number": page_number,
            "cells": [
                {
                    "row_index": cell.row_index,
                    "column_index": cell.column_index,
                    "content": cell.content,
                    "kind": cell.kind,  # columnHeader, rowHeader, content, stub
                }
                for cell in table.cells
            ],
        })

    return tables_data

# Usage example: extract the tables of the most recent `result` and print
# each table's id, page and dimensions.
tables = extract_tables(result)
print(f"추출된 표 개수: {len(tables)}")

for table in tables:
    print(f"\n[표 {table['table_id']}] 페이지 {table['page_number']}")
    print(f"크기: {table['row_count']}행 × {table['column_count']}열")

In [7]:
import pandas as pd

def table_to_dataframe(table):
    """Convert a table record into a pandas DataFrame.

    Args:
        table: Dict with ``row_count``, ``column_count`` and ``cells``; every
            cell is a dict with ``row_index``, ``column_index`` and
            ``content`` (the shape produced by ``extract_tables``).

    Returns:
        pandas.DataFrame: The first table row is used as the column labels and
        the remaining rows as data. Grid positions without a cell are ``None``.
        A table with no rows yields an empty DataFrame instead of raising.
    """
    n_rows = table["row_count"]
    n_cols = table["column_count"]

    # Without at least one row there is no header row to promote.
    if n_rows == 0:
        return pd.DataFrame()

    # Pre-fill the grid so positions that have no cell stay None.
    grid = [[None] * n_cols for _ in range(n_rows)]
    for cell in table["cells"]:
        grid[cell["row_index"]][cell["column_index"]] = cell["content"]

    # The first row becomes the header; everything else is data.
    header, *body = grid
    return pd.DataFrame(body, columns=header)

# Usage example: convert the first extracted table, if there is one.
if tables:
    df = table_to_dataframe(tables[0])
    print("\n표 데이터 (DataFrame):")
    print(df)

In [8]:
from langchain_core.documents import Document
from typing import List

def split_by_layout(result, max_chunk_size: int = 1000) -> List[Document]:
    """Group the paragraphs of an analysis result into layout-aware chunks.

    Paragraphs are accumulated in order. A chunk is closed when

    * a paragraph with role ``title`` or ``sectionHeading`` arrives while
      earlier paragraphs are pending (closed chunk is tagged ``"section"``), or
    * the accumulated character count reaches ``max_chunk_size`` after a
      paragraph was added (tagged ``"paragraph"``), or
    * the input ends with paragraphs still pending (tagged ``"paragraph"``).

    Args:
        result: Object exposing ``paragraphs``; each paragraph exposes
            ``content``, ``role`` and ``bounding_regions``.
        max_chunk_size: Character count at which the pending chunk is closed.
            Separators between paragraphs are not counted.

    Returns:
        List[Document]: One document per chunk. ``page_content`` joins the
        chunk's paragraphs with a blank line; ``metadata`` holds ``page`` (the
        page of the last paragraph added to the chunk) and ``chunk_type``.
    """
    chunks = []
    pending = []        # paragraph texts of the chunk being built
    pending_chars = 0   # total length of the texts in `pending`
    last_page = 1       # page of the most recently added paragraph

    def close_chunk(chunk_type):
        # Turn the pending paragraphs into a Document and empty the buffer.
        chunks.append(
            Document(
                page_content="\n\n".join(pending),
                metadata={"page": last_page, "chunk_type": chunk_type},
            )
        )
        pending.clear()

    for paragraph in result.paragraphs:
        text = paragraph.content
        role = paragraph.role or "paragraph"
        regions = paragraph.bounding_regions
        # Paragraphs without a region inherit the page of their predecessor.
        page = regions[0].page_number if regions else last_page

        # A heading starts a new chunk, so whatever is pending is closed first
        # (still labelled with the previous paragraph's page).
        if pending and role in ("title", "sectionHeading"):
            close_chunk("section")
            pending_chars = 0

        pending.append(text)
        pending_chars += len(text)
        last_page = page

        # Close the chunk as soon as it reaches the size limit.
        if pending_chars >= max_chunk_size:
            close_chunk("paragraph")
            pending_chars = 0

    # Whatever is left over forms the final chunk.
    if pending:
        close_chunk("paragraph")

    return chunks

# Usage example: split the most recent `result` into chunks.
documents = split_by_layout(result, max_chunk_size=1000)
print(f"생성된 청크 수: {len(documents)}")
# Only preview the first chunk when at least one was produced; a result
# without paragraphs yields an empty list.
if documents:
    print(f"\n첫 번째 청크 (앞 300자):\n{documents[0].page_content[:300]}")

In [9]:
from azure.storage.blob import BlobServiceClient

# Blob Storage client, built from the connection string stored in the
# AZURE_STORAGE_CONNECTION_STRING environment variable.
blob_service_client = BlobServiceClient.from_connection_string(
    os.getenv("AZURE_STORAGE_CONNECTION_STRING")
)

def analyze_blob_document(container_name: str, blob_name: str, account_name: str = "stragdocs2025"):
    """Analyze a blob with the ``prebuilt-layout`` model via a short-lived SAS URL.

    Args:
        container_name: Name of the container that holds the blob.
        blob_name: Name of the blob inside the container.
        account_name: Storage account that owns the container. Defaults to the
            account previously hard-coded in this function.

    Returns:
        The value returned by ``poller.result()`` for the analysis.

    Notes:
        The SAS token is read-only, expires one hour after creation and is
        signed with the key in the ``AZURE_STORAGE_KEY`` environment variable.
    """
    from azure.storage.blob import generate_blob_sas, BlobSasPermissions
    from datetime import datetime, timedelta, timezone
    from urllib.parse import quote

    # Build a read-only SAS token. A timezone-aware "now" replaces
    # datetime.utcnow(), which is deprecated since Python 3.12.
    sas_token = generate_blob_sas(
        account_name=account_name,
        container_name=container_name,
        blob_name=blob_name,
        account_key=os.getenv("AZURE_STORAGE_KEY"),
        permission=BlobSasPermissions(read=True),
        expiry=datetime.now(timezone.utc) + timedelta(hours=1),
    )

    # Percent-encode the blob name so names containing spaces or non-ASCII
    # characters still form a valid URL; "/" is kept for virtual directories.
    blob_url = (
        f"https://{account_name}.blob.core.windows.net/"
        f"{container_name}/{quote(blob_name)}?{sas_token}"
    )

    # Run the analysis and wait for it to finish.
    poller = document_analysis_client.begin_analyze_document_from_url(
        "prebuilt-layout", document_url=blob_url
    )
    return poller.result()

# Usage example: analyze one blob; rebinds the notebook-level `result`.
result = analyze_blob_document("rag-documents", "sample.pdf")
print("Blob 문서 분석 완료")

In [10]:
def analyze_all_blobs(container_name: str):
    """Analyze every PDF blob in a container.

    Args:
        container_name: Name of the container to scan.

    Returns:
        list[dict]: One entry per PDF blob. Successful entries hold
        ``blob_name``, ``page_count``, ``text`` and ``status == "success"``;
        failed entries hold ``blob_name``, ``status == "error"`` and ``error``
        (the exception message). A failure on one blob does not stop the scan.
    """
    container_client = blob_service_client.get_container_client(container_name)

    results = []
    for blob in container_client.list_blobs():
        # Only PDF files are processed. The extension is compared
        # case-insensitively so "REPORT.PDF" is not silently skipped.
        if not blob.name.lower().endswith('.pdf'):
            continue

        print(f"분석 중: {blob.name}")
        try:
            result = analyze_blob_document(container_name, blob.name)
            text = extract_full_text(result)
        except Exception as e:
            # Record the failure and keep going with the remaining blobs.
            print(f"오류: {blob.name} - {str(e)}")
            results.append({
                "blob_name": blob.name,
                "status": "error",
                "error": str(e)
            })
        else:
            results.append({
                "blob_name": blob.name,
                "page_count": len(result.pages),
                "text": text,
                "status": "success"
            })

    return results

# 사용 예시
# results = analyze_all_blobs("rag-documents")
# print(f"총 {len(results)}개 문서 처리 완료")

In [11]:
# Analyze a document with the "prebuilt-read" model
with open("simple_document.pdf", "rb") as f:
    poller = document_analysis_client.begin_analyze_document(
        "prebuilt-read", document=f
    )
    # Rebinds the notebook-level `result`.
    result = poller.result()

# Use the text only (layout is ignored)
full_text = result.content
print(f"추출된 텍스트:\n{full_text}")

In [12]:
# Analyze a Korean document (passing a language hint)
with open("korean_document.pdf", "rb") as f:
    poller = document_analysis_client.begin_analyze_document(
        "prebuilt-layout",
        document=f,
        locale="ko-KR"  # Korean locale hint
    )
    # Rebinds the notebook-level `result`.
    result = poller.result()

print("한국어 문서 분석 완료")

In [13]:
import re

def clean_korean_text(text: str) -> str:
    """Tidy up OCR output for Korean text.

    Steps, in order:

    1. Collapse runs of spaces/tabs (any whitespace except newlines) into a
       single space.
    2. Drop the spaces that touch a newline.
    3. Collapse three or more consecutive newlines into one blank line.
    4. Replace the ditto mark and curly double quotes with a straight quote.
    5. Strip leading and trailing whitespace.

    Newlines are deliberately preserved in step 1; collapsing them there would
    make the blank-line normalisation in step 3 unreachable.

    Args:
        text: Raw text to clean.

    Returns:
        str: The cleaned text.
    """
    # Quote-like characters normalised to a straight double quote:
    # U+3003 (ditto mark), U+201C and U+201D (curly double quotes).
    quote_table = str.maketrans({"\u3003": '"', "\u201c": '"', "\u201d": '"'})

    # 1. Horizontal whitespace only: [^\S\n] is "whitespace that is not a newline".
    text = re.sub(r'[^\S\n]+', ' ', text)

    # 2. Remove the single space left on either side of a newline.
    text = re.sub(r' ?\n ?', '\n', text)

    # 3. Keep at most one blank line between paragraphs.
    text = re.sub(r'\n{3,}', '\n\n', text)

    # 4. Normalise special quote characters.
    text = text.translate(quote_table)

    return text.strip()

# Usage example: clean the text of the most recent `result` and preview the
# first 500 characters.
raw_text = extract_full_text(result)
cleaned_text = clean_korean_text(raw_text)
print(f"정리된 텍스트:\n{cleaned_text[:500]}")

In [14]:
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

# Create the Document Intelligence loader (reuses `endpoint` and `key`
# defined when the analysis client was created)
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint=endpoint,
    api_key=key,
    file_path="sample.pdf",
    api_model="prebuilt-layout"
)

# Load the document; rebinds the notebook-level `documents`
documents = loader.load()

print(f"로딩된 문서 수: {len(documents)}")
# NOTE(review): indexing [0] assumes the loader returned at least one document.
print(f"첫 번째 청크:\n{documents[0].page_content[:300]}")
print(f"메타데이터: {documents[0].metadata}")

In [15]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load a document through Document Intelligence
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint=endpoint,
    api_key=key,
    file_path="long_document.pdf",
    api_model="prebuilt-layout"
)
documents = loader.load()

# Split the text: chunks of 1000 with an overlap of 200, trying the
# separators in the listed order (blank line, newline, space, then none)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""]
)
splits = text_splitter.split_documents(documents)

print(f"총 {len(splits)}개 청크 생성")

In [16]:
import json
import hashlib

def get_cache_key(file_path: str) -> str:
    """Build a cache key from the MD5 digest of a file's contents.

    The file is hashed in fixed-size blocks so large documents are not loaded
    into memory in one piece; the resulting key is identical to hashing the
    whole content at once. MD5 serves only as a content fingerprint here.

    Args:
        file_path: Path of the file to fingerprint.

    Returns:
        str: ``"doc_intel_"`` followed by the hexadecimal MD5 digest.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        # iter() with a sentinel keeps reading 1 MiB blocks until read()
        # returns b"" at end of file.
        for block in iter(lambda: stream.read(1 << 20), b""):
            digest.update(block)
    return f"doc_intel_{digest.hexdigest()}"

def analyze_with_cache(file_path: str):
    """Analyze a document, reusing a cached result for identical file contents.

    The cache lives in ``.cache/<key>.json`` where ``<key>`` comes from
    ``get_cache_key``. On a cache hit the stored dict is returned without
    calling the service. A missing or unreadable (corrupt/truncated) cache
    file counts as a miss and is overwritten with a fresh result.

    Args:
        file_path: Path of the document to analyze.

    Returns:
        dict: ``content`` (full text), ``pages`` (list of dicts with
        ``page_number``, ``width``, ``height``) and ``paragraphs`` (list of
        dicts with ``content`` and ``role``).
    """
    cache_key = get_cache_key(file_path)
    cache_file = f".cache/{cache_key}.json"

    # Look for a cached result
    try:
        with open(cache_file, "r", encoding="utf-8") as f:
            cached_result = json.load(f)
    except FileNotFoundError:
        cached_result = None
    except json.JSONDecodeError:
        # A damaged cache file must not make every later call fail;
        # fall through and regenerate it.
        cached_result = None

    if cached_result is not None:
        print("캐시에서 결과 로딩")
        return cached_result

    # Run Document Intelligence
    with open(file_path, "rb") as f:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-layout", document=f
        )
        result = poller.result()

    # Reduce the result to a JSON-serialisable dict
    result_dict = {
        "content": result.content,
        "pages": [{"page_number": p.page_number, "width": p.width, "height": p.height} for p in result.pages],
        "paragraphs": [{"content": p.content, "role": p.role} for p in result.paragraphs]
    }

    # Store the cache entry. Writing to a temporary file and renaming it keeps
    # an interrupted write from leaving a half-written cache file behind.
    os.makedirs(".cache", exist_ok=True)
    temp_file = cache_file + ".tmp"
    with open(temp_file, "w", encoding="utf-8") as f:
        json.dump(result_dict, f, ensure_ascii=False, indent=2)
    os.replace(temp_file, cache_file)

    print("Document Intelligence 실행 및 캐시 저장")
    return result_dict

# 사용 예시
# cached_result = analyze_with_cache("sample.pdf")

In [17]:
def analyze_documents_batch(file_paths: List[str], batch_size: int = 5):
    """Analyze documents in batches, pausing between batches.

    Args:
        file_paths: Paths of the documents to analyze, processed in order.
        batch_size: Number of documents per batch; must be at least 1.

    Returns:
        list[dict]: One entry per document with the keys ``file`` (the path)
        and ``result`` (the value returned by ``poller.result()``).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    import time

    # A non-positive batch size would either raise an obscure range() error
    # (0) or silently process nothing (negative).
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    total = len(file_paths)
    results = []
    for start in range(0, total, batch_size):
        batch = file_paths[start:start + batch_size]

        print(f"배치 {start//batch_size + 1} 처리 중 ({len(batch)}개 파일)")
        for file_path in batch:
            with open(file_path, "rb") as f:
                poller = document_analysis_client.begin_analyze_document(
                    "prebuilt-layout", document=f
                )
                # Wait for the result while the file is still open.
                result = poller.result()
            results.append({
                "file": file_path,
                "result": result
            })

        # Throttle to stay under the API rate limit, but only when another
        # batch follows; pausing after the last batch would just delay the return.
        if start + batch_size < total:
            time.sleep(1)

    return results

# 사용 예시
# file_list = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
# results = analyze_documents_batch(file_list, batch_size=5)

In [18]:
def analyze_with_metrics(file_path: str):
    """Analyze a document and report simple timing metrics.

    Args:
        file_path: Path of the document to analyze.

    Returns:
        tuple: ``(result, metrics)`` where ``result`` is the value returned by
        ``poller.result()`` and ``metrics`` is a dict with ``file_path``,
        ``file_size_mb``, ``page_count``, ``duration_seconds`` and
        ``pages_per_second`` (``0.0`` when no time was measured).
    """
    import time

    file_size = os.path.getsize(file_path)

    # perf_counter is monotonic, so the measured duration cannot go negative
    # when the system clock is adjusted during the call.
    started = time.perf_counter()

    with open(file_path, "rb") as f:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-layout", document=f
        )
        result = poller.result()

    duration = time.perf_counter() - started
    page_count = len(result.pages)

    metrics = {
        "file_path": file_path,
        "file_size_mb": file_size / (1024 * 1024),
        "page_count": page_count,
        "duration_seconds": duration,
        # Guard the division so a zero duration cannot raise.
        "pages_per_second": page_count / duration if duration > 0 else 0.0
    }

    print(f"분석 완료:")
    print(f"- 파일 크기: {metrics['file_size_mb']:.2f} MB")
    print(f"- 페이지 수: {metrics['page_count']}")
    print(f"- 처리 시간: {metrics['duration_seconds']:.2f}초")
    print(f"- 속도: {metrics['pages_per_second']:.2f} 페이지/초")

    return result, metrics

# 사용 예시
# result, metrics = analyze_with_metrics("large_document.pdf")