In [1]:

# imports
import os
import sys
import types
import json

# figure size/format
# Settings read by the matplotlib, plotly, dashboard, pandas and
# interactivity steps further down in this cell.
fig_width = 7  # first element of matplotlib's 'figure.figsize'
fig_height = 5  # second element of matplotlib's 'figure.figsize'
fig_format = 'retina'  # passed to set_matplotlib_formats(); 'pdf' also enables the pandas LaTeX repr
fig_dpi = 96  # applied to both 'figure.dpi' and 'savefig.dpi'
interactivity = ''  # when non-empty, assigned to InteractiveShell.ast_node_interactivity
is_shiny = False  # not read anywhere in the visible part of this cell
is_dashboard = False  # gates the itables/altair dashboard tweaks below
plotly_connected = True  # picks the "notebook_connected" plotly renderer over "notebook"

# matplotlib defaults / format
# Best effort: any failure (matplotlib or IPython missing, a rejected
# rcParam, ...) is swallowed, and the statements after the failing one
# are skipped.
try:
  import matplotlib.pyplot as plt
  plt.rcParams['figure.figsize'] = (fig_width, fig_height)
  plt.rcParams['figure.dpi'] = fig_dpi
  plt.rcParams['savefig.dpi'] = fig_dpi
  # NOTE(review): IPython has deprecated this import location in favour of
  # matplotlib_inline.backend_inline -- confirm it still resolves on the
  # IPython version in use; if it does not, the failure is silent here and
  # fig_format is never applied.
  from IPython.display import set_matplotlib_formats
  set_matplotlib_formats(fig_format)
except Exception:
  pass

# plotly: choose the notebook renderer and tighten template margins
# (best effort -- skipped entirely when plotly is unavailable)
try:
  import plotly.io as pio
  pio.renderers.default = "notebook_connected" if plotly_connected else "notebook"
  # give every registered template the same margins
  for template in pio.templates.keys():
    pio.templates[template].layout.margin = {"t": 30, "r": 0, "b": 0, "l": 0}
except Exception:
  pass

# dashboard-only tweaks: itables defaults and container-sized altair themes
if is_dashboard:
  try:
    from itables import options
    # Applied in the order listed; a failure skips the remaining options.
    for option_name, option_value in (
      ('dom', 'fiBrtlp'),
      ('maxBytes', 1024 * 1024),
      ('language', dict(info = "Showing _TOTAL_ entries")),
      ('classes', "display nowrap compact"),
      ('paging', False),
      ('searching', True),
      ('ordering', True),
      ('info', True),
      ('lengthChange', False),
      ('autoWidth', False),
      ('responsive', True),
      ('keys', True),
      ('buttons', []),
    ):
      setattr(options, option_name, option_value)
  except Exception:
    pass

  try:
    import altair as alt
    # By default, dashboards will have container sized
    # vega visualizations which allows them to flow reasonably
    theme_sentinel = '_quarto-dashboard-internal'

    def make_theme(name):
      """Wrap the registered theme `name` so missing defaults get filled in."""
      wrapped_theme = alt.themes._plugins[name]

      def patch_theme(*args, **kwargs):
        theme = wrapped_theme()

        # Only values the wrapped theme did not set are added.
        theme.setdefault('height', 'container')
        theme.setdefault('width', 'container')
        config = theme.setdefault('config', dict())

        # Configure the default font sizes
        title_font_size = 15
        header_font_size = 13
        axis_font_size = 12
        legend_font_size = 12
        mark_font_size = 12
        tooltip = False

        # (config section, keys to default, font size)
        font_defaults = (
          ('axis', ('labelFontSize', 'titleFontSize'), axis_font_size),
          ('legend', ('labelFontSize', 'titleFontSize'), legend_font_size),
          ('header', ('labelFontSize', 'titleFontSize'), header_font_size),
          ('title', ('fontSize',), title_font_size),
          ('mark', ('fontSize',), mark_font_size),
        )
        for section_name, size_keys, font_size in font_defaults:
          section = config.setdefault(section_name, dict())
          for size_key in size_keys:
            section.setdefault(size_key, font_size)

        # Mark tooltips (switched off by the flag above)
        mark = config['mark']
        if tooltip and 'tooltip' not in mark:
          mark['tooltip'] = dict(content="encoding")

        return theme

      return patch_theme

    # We can only do this once per session
    if theme_sentinel not in alt.themes.names():
      for name in alt.themes.names():
        alt.themes.register(name, make_theme(name))

      # register a sentinel theme so we only do this once
      alt.themes.register(theme_sentinel, make_theme('default'))
      alt.themes.enable('default')

  except Exception:
    pass

# enable pandas latex repr when targeting pdfs
# pandas is imported whenever it is available (inside the try), but the
# option is only switched on when fig_format == 'pdf'. A missing pandas or
# a rejected option is ignored.
try:
  import pandas as pd
  if fig_format == 'pdf':
    pd.set_option('display.latex.repr', True)
except Exception:
  pass

# interactivity
# An empty string leaves IPython's setting untouched; any other value is
# assigned on the InteractiveShell class itself (not on an instance).
if interactivity:
  from IPython.core.interactiveshell import InteractiveShell
  InteractiveShell.ast_node_interactivity = interactivity

# NOTE: the kernel_deps code is repeated in the cleanup.py file
# (we can't easily share this code b/c of the way it is run).
# If you edit this code also edit the same code in cleanup.py!

# output kernel dependencies
# Builds a {source path: modification time} mapping for every loaded module
# that has an existing file on disk, then prints it as a single JSON object.
kernel_deps = dict()
# Iterate over a list copy of the module table rather than the live view.
for module in list(sys.modules.values()):
  # Some modules play games with sys.modules (e.g. email/__init__.py
  # in the standard library), and occasionally this can cause strange
  # failures in getattr.  Just ignore anything that's not an ordinary
  # module.
  if not isinstance(module, types.ModuleType):
    continue
  # Modules without a (non-empty) __file__ are skipped.
  path = getattr(module, "__file__", None)
  if not path:
    continue
  # Drop the trailing "c"/"o" so a compiled-file path becomes a ".py" path.
  if path.endswith(".pyc") or path.endswith(".pyo"):
    path = path[:-1]
  if not os.path.exists(path):
    continue
  kernel_deps[path] = os.stat(path).st_mtime
print(json.dumps(kernel_deps))

# set run_path if requested
# The same raw-string literal is used as the condition and as the chdir
# target: a non-empty literal is truthy, so the chdir runs; an empty
# literal would skip it.
# NOTE(review): this is an absolute, machine-specific path (it embeds a
# local user directory) -- presumably injected by the rendering tool;
# confirm it belongs in a shared copy of this notebook.
if r'C:\Users\kmkim\Desktop\projects\blog\docs\blog\posts\RAG\13-Cloud-RAG':
  os.chdir(r'C:\Users\kmkim\Desktop\projects\blog\docs\blog\posts\RAG\13-Cloud-RAG')

# reset state
# IPython magic that clears the interactive namespace, so names defined
# above are not available afterwards. The code that follows accounts for
# this: it tests the literal `False` instead of a variable and re-imports
# what it needs locally.
%reset

# shiny
# Checking for shiny by using False directly because we're after the %reset. We don't want
# to set a variable that stays in global scope.
# With the literal False below, nothing inside this block ever executes.
if False:
  try:
    import htmltools as _htmltools
    import ast as _ast

    _htmltools.html_dependency_render_mode = "json"

    # This decorator will be added to all function definitions
    # It displays the object's HTML repr (when it has one) and then returns
    # the object unchanged.
    def _display_if_has_repr_html(x):
      try:
        # IPython 7.14 preferred import
        from IPython.display import display, HTML
      except:
        from IPython.core.display import display, HTML

      if hasattr(x, '_repr_html_'):
        display(HTML(x._repr_html_()))
      return x

    # ideally we would undo the call to ast_transformers.append
    # at the end of this block whenever an error occurs, we do
    # this for now as it will only be a problem if the user
    # switches from shiny to not-shiny mode (and even then likely
    # won't matter)
    # The decorator is stored on `builtins` -- presumably so the bare name
    # injected by the transformer below resolves from any namespace.
    import builtins
    builtins._display_if_has_repr_html = _display_if_has_repr_html

    # AST transformer that prepends the decorator (index 0 of the decorator
    # list) to every sync and async function definition it visits. The
    # visit methods do not call generic_visit(), so they do not descend
    # into the function bodies themselves.
    class _FunctionDefReprHtml(_ast.NodeTransformer):
      def visit_FunctionDef(self, node):
        node.decorator_list.insert(
          0,
          _ast.Name(id="_display_if_has_repr_html", ctx=_ast.Load())
        )
        return node

      def visit_AsyncFunctionDef(self, node):
        node.decorator_list.insert(
          0,
          _ast.Name(id="_display_if_has_repr_html", ctx=_ast.Load())
        )
        return node

    # Register the transformer on the running IPython shell.
    ip = get_ipython()
    ip.ast_transformers.append(_FunctionDefReprHtml())

  except:
    pass

def ojs_define(**kwargs):
  """Emit the given keyword arguments as an ``ojs-define`` script element.

  Each keyword becomes one ``{"name": ..., "value": ...}`` entry under the
  top-level ``"contents"`` list. The JSON payload is wrapped in
  ``<script type="ojs-define">`` and shown through IPython's ``display``
  with ``ojs_define=True`` metadata.

  pandas ``Series``/``DataFrame`` values (exact types only) are converted
  to a ``{column: [values...]}`` mapping; every other value is passed to
  ``json.dumps`` unchanged, so it must be JSON-serialisable.
  """
  # Imported locally so the function does not depend on module-level names.
  import json
  try:
    # IPython 7.14 preferred import
    from IPython.display import display, HTML
  except ImportError:
    # Only a failed import should trigger the legacy location; a bare
    # `except` would also swallow KeyboardInterrupt/SystemExit.
    from IPython.core.display import display, HTML

  # do some minor magic for convenience when handling pandas
  # dataframes
  def convert(value):
    try:
      import pandas as pd
    except ModuleNotFoundError: # don't do the magic when pandas is not available
      return value
    if type(value) == pd.Series:
      value = pd.DataFrame(value)
    if type(value) == pd.DataFrame:
      # After the transpose, "index" holds the column labels and "data"
      # the per-column value lists.
      split = json.loads(value.T.to_json(orient='split'))
      return dict(zip(split["index"], split["data"]))
    return value

  payload = dict(
    contents=[dict(name=key, value=convert(value)) for key, value in kwargs.items()]
  )
  display(
    HTML('<script type="ojs-define">' + json.dumps(payload) + '</script>'),
    metadata=dict(ojs_define = True)
  )
globals()["ojs_define"] = ojs_define


In [2]:
from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv
import os

load_dotenv()

# Read the connection string from the environment.
connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
if not connection_string:
    # Fail here with a clear message instead of an obscure error from
    # inside the SDK when it is handed None.
    raise RuntimeError("환경 변수 AZURE_STORAGE_CONNECTION_STRING이 설정되지 않았습니다")

# Create the service client shared by the following cells.
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

# Create the container only when it does not exist yet, so that re-running
# the notebook (Restart & Run All) does not fail on an existing container.
container_name = "rag-documents"
container_client = blob_service_client.get_container_client(container_name)
if container_client.exists():
    print(f"컨테이너 '{container_name}' 이미 존재함")
else:
    container_client.create_container()
    print(f"컨테이너 '{container_name}' 생성 완료")

In [3]:
from azure.storage.blob import BlobClient

# Client for the target blob inside the "rag-documents" container
blob_client = blob_service_client.get_blob_client("rag-documents", "sample.pdf")

# Upload the local file, replacing the blob if it already exists
with open("sample.pdf", mode="rb") as data:
    blob_client.upload_blob(data, overwrite=True)

print("파일 업로드 완료")

In [4]:
import os
from pathlib import Path

def upload_directory(container_name, local_directory):
    """Upload every file found under ``local_directory`` to a container.

    Blob names are the file paths relative to ``local_directory`` with
    backslashes replaced by forward slashes, so the directory layout is
    kept. Existing blobs with the same name are overwritten, and one
    line is printed per uploaded blob.
    """
    container_client = blob_service_client.get_container_client(container_name)

    # Walk the whole tree below local_directory
    for current_dir, _subdirs, filenames in os.walk(local_directory):
        for filename in filenames:
            local_path = os.path.join(current_dir, filename)

            # Blob name mirrors the relative path
            relative_path = os.path.relpath(local_path, local_directory)
            blob_name = relative_path.replace("\\", "/")

            with open(local_path, "rb") as stream:
                target = container_client.get_blob_client(blob_name)
                target.upload_blob(stream, overwrite=True)
                print(f"업로드: {blob_name}")

# Example usage
upload_directory("rag-documents", "./documents")

In [5]:
from datetime import datetime, timezone

# Metadata stored alongside the blob.
# upload_date is recorded as a timezone-aware UTC timestamp: a naive local
# time would be ambiguous once the blob is read from another machine or
# time zone.
metadata = {
    "author": "Kwangmin Kim",
    "category": "AI",
    "upload_date": datetime.now(timezone.utc).isoformat(),
    "version": "1.0"
}

# Upload the file together with its metadata, replacing any existing blob
with open("report.pdf", "rb") as data:
    blob_client = blob_service_client.get_blob_client(
        container="rag-documents",
        blob="report.pdf"
    )
    blob_client.upload_blob(
        data,
        metadata=metadata,
        overwrite=True
    )

print("메타데이터와 함께 업로드 완료")

In [6]:
# Client for the blob to download
blob_client = blob_service_client.get_blob_client("rag-documents", "sample.pdf")

# Read the whole blob into memory and save it locally
with open("downloaded_sample.pdf", mode="wb") as download_file:
    blob_bytes = blob_client.download_blob().readall()
    download_file.write(blob_bytes)

print("파일 다운로드 완료")

In [7]:
# Streamed download: the blob is written chunk by chunk instead of being
# read into memory in one piece. Uses the blob_client created in the
# previous cell.
with open("large_file.pdf", mode="wb") as download_file:
    blob_data = blob_client.download_blob()

    # Write each chunk as the downloader yields it
    download_file.writelines(blob_data.chunks())

print("대용량 파일 다운로드 완료")

In [8]:
# List every blob in the container with its size and modification time
container_client = blob_service_client.get_container_client("rag-documents")
blob_list = container_client.list_blobs()

print("Blob 목록:")
for blob in blob_list:
    print(
        f"- {blob.name}",
        f"  크기: {blob.size / 1024:.2f} KB",
        f"  수정일: {blob.last_modified}",
        sep="\n",
    )

In [9]:
# List blobs again, this time asking the service to include metadata
blob_list = container_client.list_blobs(include=['metadata'])

for blob in blob_list:
    print(
        f"\n파일: {blob.name}",
        f"메타데이터: {blob.metadata}",
        sep="\n",
    )

In [10]:
# Only list blobs whose name starts with the "reports/" prefix
reports_prefix = "reports/"
blob_list = container_client.list_blobs(name_starts_with=reports_prefix)

for blob in blob_list:
    print(f"- {blob.name}")

In [11]:
# Delete a single blob.
blob_client = blob_service_client.get_blob_client(
    container="rag-documents",
    blob="old_file.pdf"
)

# delete_blob() raises when the blob does not exist, which would stop a
# Restart & Run All of this notebook; check first and report either way.
if blob_client.exists():
    blob_client.delete_blob()
    print("파일 삭제 완료")
else:
    print("삭제할 파일이 없습니다")

In [12]:
from datetime import datetime, timedelta, timezone

# Delete blobs last modified more than 30 days ago.
# The SDK reports last_modified as a timezone-aware datetime, so the
# threshold must be aware as well: comparing it with a naive
# datetime.now() raises "can't compare offset-naive and offset-aware
# datetimes".
threshold_date = datetime.now(timezone.utc) - timedelta(days=30)

container_client = blob_service_client.get_container_client("rag-documents")
blob_list = container_client.list_blobs()

for blob in blob_list:
    if blob.last_modified < threshold_date:
        blob_client = container_client.get_blob_client(blob.name)
        blob_client.delete_blob()
        print(f"삭제: {blob.name} (수정일: {blob.last_modified})")

In [13]:
from urllib.parse import quote

from azure.storage.blob import generate_blob_sas, BlobSasPermissions
from datetime import datetime, timedelta, timezone

# The account, container and blob are named once and used for both the
# token and the URL, so the two cannot drift apart.
sas_account_name = "stragdocs2025"
sas_container_name = "rag-documents"
sas_blob_name = "sample.pdf"

# Create a read-only SAS token valid for one hour.
# datetime.utcnow() is deprecated (Python 3.12+) and returns a naive value;
# an explicit UTC-aware timestamp is used instead.
sas_token = generate_blob_sas(
    account_name=sas_account_name,
    container_name=sas_container_name,
    blob_name=sas_blob_name,
    account_key=os.getenv("AZURE_STORAGE_KEY"),
    permission=BlobSasPermissions(read=True),
    expiry=datetime.now(timezone.utc) + timedelta(hours=1)
)

# Build the SAS URL. The blob name is percent-encoded ("/" is kept) so that
# names with spaces or other reserved characters still give a valid URL.
sas_url = (
    f"https://{sas_account_name}.blob.core.windows.net/"
    f"{sas_container_name}/{quote(sas_blob_name)}?{sas_token}"
)
# NOTE(review): the printed URL grants read access until it expires; clear
# this cell's output before sharing the notebook.
print(f"SAS URL: {sas_url}")

In [14]:
import requests

# Download the file through the SAS URL.
# A timeout is set because requests waits indefinitely by default, which
# would hang the notebook on a stalled connection.
response = requests.get(sas_url, timeout=30)

if response.status_code == 200:
    with open("downloaded_with_sas.pdf", "wb") as f:
        f.write(response.content)
    print("SAS 토큰으로 다운로드 완료")
else:
    print(f"다운로드 실패: {response.status_code}")

In [15]:
from typing import List
from langchain_core.documents import Document

def load_documents_from_blob(
    container_name: str,
    prefix: str = ""
) -> List[Document]:
    """Load blobs from Azure Blob Storage as LangChain ``Document`` objects.

    Args:
        container_name: Container to read from.
        prefix: Only blobs whose name starts with this prefix are listed;
            the default empty string applies no restriction.

    Returns:
        One ``Document`` per blob whose name ends with ``.pdf``, ``.txt``
        or ``.docx``. ``page_content`` is the blob's bytes decoded as UTF-8
        with undecodable bytes dropped. ``metadata`` holds ``source``,
        ``size`` and ``last_modified``, followed by the blob's own metadata
        (which takes precedence when keys collide).
    """
    container_client = blob_service_client.get_container_client(container_name)
    blob_list = container_client.list_blobs(
        name_starts_with=prefix,
        include=['metadata']
    )

    documents = []
    for blob in blob_list:
        # Only handle the supported file types
        if not blob.name.endswith(('.pdf', '.txt', '.docx')):
            continue

        # Download the blob content
        blob_client = container_client.get_blob_client(blob.name)
        content = blob_client.download_blob().readall()

        # NOTE(review): .pdf/.docx are binary formats; decoding their raw
        # bytes as UTF-8 does not extract their text -- confirm whether a
        # format-specific parser should be used for them.
        doc = Document(
            page_content=content.decode('utf-8', errors='ignore'),
            metadata={
                "source": blob.name,
                "size": blob.size,
                "last_modified": blob.last_modified.isoformat(),
                # A blob without metadata reports None here; unpacking
                # None with ** would raise TypeError.
                **(blob.metadata or {})
            }
        )
        documents.append(doc)

    return documents

# Example usage
documents = load_documents_from_blob("rag-documents", prefix="reports/")
print(f"로딩된 문서 수: {len(documents)}")

In [16]:
from langchain_community.document_loaders import AzureBlobStorageFileLoader

# Load a single blob through LangChain's Azure Blob Storage file loader
loader = AzureBlobStorageFileLoader(
    conn_str=os.getenv("AZURE_STORAGE_CONNECTION_STRING"),
    container="rag-documents",
    blob_name="sample.pdf"
)

documents = loader.load()
print(f"로딩된 문서: {len(documents)}")
# Guard the preview: indexing documents[0] raises IndexError when the
# loader returns no documents.
if documents:
    print(f"첫 번째 문서 내용 (앞 200자):\n{documents[0].page_content[:200]}")
else:
    print("로딩된 문서가 없습니다")

In [17]:
import gzip

# Compress a text file with gzip and upload the compressed bytes
with open("large_document.txt", "rb") as f:
    content = f.read()
# Compression does not need the file handle, so it happens after the
# file has been closed.
compressed = gzip.compress(content)

blob_client = blob_service_client.get_blob_client(
    container="rag-documents",
    blob="large_document.txt.gz"
)

blob_client.upload_blob(
    compressed,
    metadata={"compressed": "gzip"},
    overwrite=True
)

print(f"원본 크기: {len(content)} bytes")
if content:
    print(f"압축 크기: {len(compressed)} bytes ({len(compressed)/len(content)*100:.1f}%)")
else:
    # An empty source file would make the ratio a division by zero.
    print(f"압축 크기: {len(compressed)} bytes")

In [18]:
from azure.identity import DefaultAzureCredential

# Authenticate with DefaultAzureCredential, so no account key or
# connection string appears in this cell.
# NOTE: this rebinds blob_service_client; cells run afterwards use this
# credential-based client.
account_url = "https://stragdocs2025.blob.core.windows.net"
credential = DefaultAzureCredential()
blob_service_client = BlobServiceClient(account_url=account_url, credential=credential)

# From here on the client is used exactly as before
container_client = blob_service_client.get_container_client("rag-documents")

In [19]:
from azure.storage.blob import BlobAnalyticsLogging, Metrics, RetentionPolicy

# Storage Analytics settings: log delete/read/write operations and collect
# API-level metrics, each with its own retention policy.
retention_days = 7

# NOTE(review): the name `logging` would shadow the standard-library
# module if that is imported later in this kernel; kept as-is because it
# is a notebook-global name.
logging = BlobAnalyticsLogging(
    version="1.0",
    delete=True,
    read=True,
    write=True,
    retention_policy=RetentionPolicy(enabled=True, days=retention_days),
)

metrics = Metrics(
    version="1.0",
    enabled=True,
    include_apis=True,
    retention_policy=RetentionPolicy(enabled=True, days=retention_days),
)

# Apply the settings; the same Metrics object is used for both the hourly
# and the per-minute metrics.
service_properties = dict(
    analytics_logging=logging,
    hour_metrics=metrics,
    minute_metrics=metrics,
)
blob_service_client.set_service_properties(**service_properties)

print("Storage Analytics 활성화 완료")