In [1]:

# imports
import os
import sys
import types
import json

# Figure geometry / output format, plus the execution-mode switches that the
# setup steps below read.
fig_width, fig_height = 7, 5
fig_format, fig_dpi = 'retina', 96
interactivity = ''
is_shiny = is_dashboard = False
plotly_connected = True

# matplotlib: apply the configured figure size / dpi and inline image format.
# Best effort: any failure (e.g. matplotlib not installed) is ignored.
try:
  import matplotlib.pyplot as plt
  plt.rcParams.update({
    'figure.figsize': (fig_width, fig_height),
    'figure.dpi': fig_dpi,
    'savefig.dpi': fig_dpi,
  })
  from IPython.display import set_matplotlib_formats
  set_matplotlib_formats(fig_format)
except Exception:
  pass

# plotly: pick the notebook renderer (connected or not) and give every
# registered template the same margins. Best effort: failures are ignored.
try:
  import plotly.io as pio
  pio.renderers.default = "notebook_connected" if plotly_connected else "notebook"
  for template_name in pio.templates.keys():
    pio.templates[template_name].layout.margin = dict(t=30, r=0, b=0, l=0)
except Exception:
  pass

# dashboard-only defaults for itables and altair
if is_dashboard:
  try:
    from itables import options
    # Table defaults, assigned in a fixed order; paging is switched off.
    for option_name, option_value in (
      ('dom', 'fiBrtlp'),
      ('maxBytes', 1024 * 1024),
      ('language', dict(info = "Showing _TOTAL_ entries")),
      ('classes', "display nowrap compact"),
      ('paging', False),
      ('searching', True),
      ('ordering', True),
      ('info', True),
      ('lengthChange', False),
      ('autoWidth', False),
      ('responsive', True),
      ('keys', True),
      ('buttons', []),
    ):
      setattr(options, option_name, option_value)
  except Exception:
    pass

  try:
    import altair as alt
    # Dashboards default to container-sized vega visualizations so that
    # charts flow reasonably with the surrounding layout.
    theme_sentinel = '_quarto-dashboard-internal'

    def make_theme(name):
      """Return a theme function wrapping the registered theme `name`."""
      nonTheme = alt.themes._plugins[name]

      def patch_theme(*args, **kwargs):
        """Call the wrapped theme and fill in whatever defaults it left unset."""
        existingTheme = nonTheme()
        existingTheme.setdefault('height', 'container')
        existingTheme.setdefault('width', 'container')
        config = existingTheme.setdefault('config', dict())

        # Default font sizes
        title_font_size = 15
        header_font_size = 13
        axis_font_size = 12
        legend_font_size = 12
        mark_font_size = 12
        tooltip = False

        # (config section, keys to default, font size) -- processed in order;
        # values already present in the theme are never overwritten.
        font_defaults = (
          ('axis', ('labelFontSize', 'titleFontSize'), axis_font_size),
          ('legend', ('labelFontSize', 'titleFontSize'), legend_font_size),
          ('header', ('labelFontSize', 'titleFontSize'), header_font_size),
          ('title', ('fontSize',), title_font_size),
          ('mark', ('fontSize',), mark_font_size),
        )
        for section_name, size_keys, font_size in font_defaults:
          section = config.setdefault(section_name, dict())
          for size_key in size_keys:
            section.setdefault(size_key, font_size)

        # Mark tooltips (inactive while `tooltip` is False)
        mark = config['mark']
        if tooltip and 'tooltip' not in mark:
          mark['tooltip'] = dict(content="encoding")

        return existingTheme

      return patch_theme

    # We can only do this once per session: the sentinel theme marks that
    # every registered theme has already been wrapped.
    already_patched = theme_sentinel in alt.themes.names()
    if not already_patched:
      for theme_name in alt.themes.names():
        alt.themes.register(theme_name, make_theme(theme_name))

      # register a sentinel theme so we only do this once
      alt.themes.register(theme_sentinel, make_theme('default'))
      alt.themes.enable('default')

  except Exception:
    pass

# pandas: switch on the LaTeX repr when the figure format is 'pdf'.
# pandas is imported regardless of the format; failures are ignored.
try:
  import pandas as pd
  latex_repr_wanted = (fig_format == 'pdf')
  if latex_repr_wanted:
    pd.set_option('display.latex.repr', True)
except Exception:
  pass

# interactivity: override the shell-wide setting only when a value is configured
if interactivity:
  from IPython.core.interactiveshell import InteractiveShell
  InteractiveShell.ast_node_interactivity = interactivity

# NOTE: the kernel_deps code is repeated in the cleanup.py file
# (we can't easily share this code b/c of the way it is run).
# If you edit this code also edit the same code in cleanup.py!

# Output kernel dependencies: map the source file of every loaded module to
# its modification time and print the mapping as JSON.
kernel_deps = dict()
for module in list(sys.modules.values()):
  # sys.modules can contain objects that are not ordinary modules (some
  # packages, e.g. email/__init__.py in the standard library, play games
  # with it) and getattr on those can fail in strange ways, so skip them.
  if not isinstance(module, types.ModuleType):
    continue
  path = getattr(module, "__file__", None)
  if path:
    # compiled files: drop the trailing character of the .pyc/.pyo suffix
    if path.endswith((".pyc", ".pyo")):
      path = path[:-1]
    if os.path.exists(path):
      kernel_deps[path] = os.stat(path).st_mtime
print(json.dumps(kernel_deps))

# Change into the configured run directory. The path is a literal, so the
# check below only skips the chdir when that literal is empty.
run_path = r'C:\Users\kmkim\Desktop\projects\blog\docs\blog\posts\Agent\10-Retriever'
if run_path:
  os.chdir(run_path)

# reset state
%reset

# shiny
# Checking for shiny by using False directly because we're after the %reset. We don't want
# to set a variable that stays in global scope.
if False:
  try:
    import htmltools as _htmltools
    import ast as _ast

    _htmltools.html_dependency_render_mode = "json"

    # This decorator will be added to all function definitions
    def _display_if_has_repr_html(x):
      """Display `x` as HTML when it has a `_repr_html_` method; return `x` unchanged."""
      try:
        # IPython 7.14 preferred import
        from IPython.display import display, HTML
      except ImportError:
        from IPython.core.display import display, HTML

      if hasattr(x, '_repr_html_'):
        display(HTML(x._repr_html_()))
      return x

    # ideally we would undo the call to ast_transformers.append
    # at the end of this block whenever an error occurs, we do
    # this for now as it will only be a problem if the user
    # switches from shiny to not-shiny mode (and even then likely
    # won't matter)
    import builtins
    builtins._display_if_has_repr_html = _display_if_has_repr_html

    class _FunctionDefReprHtml(_ast.NodeTransformer):
      """AST transformer that prepends the display decorator to function definitions."""

      def _prepend_display_decorator(self, node):
        """Insert the decorator as the first entry of `node.decorator_list`."""
        node.decorator_list.insert(
          0,
          _ast.Name(id="_display_if_has_repr_html", ctx=_ast.Load())
        )
        return node

      # sync and async definitions are rewritten identically
      visit_FunctionDef = _prepend_display_decorator
      visit_AsyncFunctionDef = _prepend_display_decorator

    ip = get_ipython()
    ip.ast_transformers.append(_FunctionDefReprHtml())

  # Best effort like the other setup steps, but without swallowing
  # KeyboardInterrupt / SystemExit the way a bare `except:` would.
  except Exception:
    pass

def ojs_define(**kwargs):
  """Publish keyword arguments through an ``ojs-define`` script element.

  Every keyword argument becomes a ``{"name": <key>, "value": <value>}``
  entry. The entries are wrapped as ``{"contents": [...]}``, serialized with
  ``json.dumps`` and displayed as ``<script type="ojs-define">...</script>``
  with ``ojs_define=True`` in the display metadata.

  pandas ``Series`` / ``DataFrame`` values (subclasses included) are first
  converted to a ``{column: [values, ...]}`` mapping; all other values are
  handed to ``json.dumps`` unchanged, so they must be JSON serializable.
  """
  import json
  try:
    # IPython 7.14 preferred import
    from IPython.display import display, HTML
  except ImportError:
    from IPython.core.display import display, HTML

  def convert(value):
    """Turn pandas objects into plain dicts; return anything else as is."""
    try:
      import pandas as pd
    except ImportError:  # pandas missing or not importable: skip the conversion
      return value
    # isinstance (not an exact type match) so that Series/DataFrame
    # subclasses are converted too instead of failing in json.dumps
    if isinstance(value, pd.Series):
      value = pd.DataFrame(value)
    if isinstance(value, pd.DataFrame):
      # after transposing, the 'split' index holds the column names and
      # each data row holds that column's values
      split = json.loads(value.T.to_json(orient='split'))
      return dict(zip(split["index"], split["data"]))
    return value

  payload = dict(
    contents=[dict(name=key, value=convert(value)) for key, value in kwargs.items()]
  )
  display(HTML('<script type="ojs-define">' + json.dumps(payload) + '</script>'), metadata=dict(ojs_define = True))
globals()["ojs_define"] = ojs_define


In [2]:
# Configuration for managing API keys as environment variables
from dotenv import load_dotenv

# Load the API key information
load_dotenv()

In [3]:
# Set up LangSmith tracing (https://smith.langchain.com)
# !pip install langchain-teddynote
# NOTE: `logging` here is the langchain_teddynote module, not the standard-library one.
from langchain_teddynote import logging

# Enter the project name
logging.langsmith("CH10-Retriever")

In [4]:
from langchain_chroma import Chroma  
from langchain_core.documents import Document  
from langchain_openai import OpenAIEmbeddings  

# Descriptions and metadata for the cosmetic products
docs = [
    Document(
        page_content="수분 가득한 히알루론산 세럼으로 피부 속 깊은 곳까지 수분을 공급합니다.",
        metadata={"year": 2024, "category": "스킨케어", "user_rating": 4.7},
    ),
    Document(
        page_content="24시간 지속되는 매트한 피니시의 파운데이션, 모공을 커버하고 자연스러운 피부 표현이 가능합니다.",
        metadata={"year": 2023, "category": "메이크업", "user_rating": 4.5},
    ),
    Document(
        page_content="식물성 성분으로 만든 저자극 클렌징 오일, 메이크업과 노폐물을 부드럽게 제거합니다.",
        metadata={"year": 2023, "category": "클렌징", "user_rating": 4.8},
    ),
    Document(
        page_content="비타민 C 함유 브라이트닝 크림, 칙칙한 피부톤을 환하게 밝혀줍니다.",
        metadata={"year": 2023, "category": "스킨케어", "user_rating": 4.6},
    ),
    Document(
        page_content="롱래스팅 립스틱, 선명한 발색과 촉촉한 사용감으로 하루종일 편안하게 사용 가능합니다.",
        metadata={"year": 2024, "category": "메이크업", "user_rating": 4.4},
    ),
    Document(
        page_content="자외선 차단 기능이 있는 톤업 선크림, SPF50+/PA++++ 높은 자외선 차단 지수로 피부를 보호합니다.",
        metadata={"year": 2024, "category": "선케어", "user_rating": 4.9},
    ),
]

# Stable, position-based ids. Without explicit ids each run stores the
# documents under newly generated ids, so re-running this cell in the same
# kernel can leave duplicate entries in the store; with fixed ids a re-run
# writes to the same entries instead.
# NOTE(review): relies on Chroma upserting by id -- confirm for the installed version.
doc_ids = [f"cosmetic-{position}" for position in range(len(docs))]

# Create the vector store
vectorstore = Chroma.from_documents(
    docs,
    OpenAIEmbeddings(model="text-embedding-3-small"),
    ids=doc_ids,
)

In [5]:
from langchain.chains.query_constructor.base import AttributeInfo  

# Describe each metadata field (name, description, type) for the query constructor
metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type=field_type)
    for field_name, field_description, field_type in (
        (
            "category",
            "The category of the cosmetic product. One of ['스킨케어', '메이크업', '클렌징', '선케어']",
            "string",
        ),
        (
            "year",
            "The year the cosmetic product was released",
            "integer",
        ),
        (
            "user_rating",
            "A user rating for the cosmetic product, ranging from 1 to 5",
            "float",
        ),
    )
]

In [6]:
from langchain.retrievers.self_query.base import SelfQueryRetriever  
from langchain_openai import ChatOpenAI  

# Define the LLM (temperature 0)
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Create the SelfQueryRetriever from the LLM, the vector store, a description
# of the document contents and the metadata field definitions
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectorstore,
    document_contents="Brief summary of a cosmetic product",
    metadata_field_info=metadata_field_info,
)

In [7]:
# Search for products with a rating of 4.8 or higher
retriever.invoke("평점이 4.8 이상인 제품을 추천해주세요")

In [8]:
# Search for products released in 2023
retriever.invoke("2023년에 출시된 상품을 추천해주세요")

In [9]:
# Search for products in the sun-care category
retriever.invoke("카테고리가 선케어인 상품을 추천해주세요")

In [10]:
# Apply a category condition and a rating condition at the same time
retriever.invoke(
    "카테고리가 메이크업인 상품 중에서 평점이 4.5 이상인 상품을 추천해주세요"
)

In [11]:
# Rebuild the retriever with result limiting enabled and k=2 search kwargs.
# This rebinds `retriever`, so later cells use this configuration.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectorstore,
    document_contents="Brief summary of a cosmetic product",
    metadata_field_info=metadata_field_info,
    enable_limit=True,  # enable limiting of the search results
    search_kwargs={"k": 2},  # return at most 2 documents
)

In [12]:
# Limit the search results to 2
retriever.invoke("2023년에 출시된 상품을 추천해주세요")

In [13]:
# Rebuild the retriever with result limiting enabled but without search_kwargs;
# the number of results is stated in the query text instead.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectorstore,
    document_contents="Brief summary of a cosmetic product",
    metadata_field_info=metadata_field_info,
    enable_limit=True,  # enable limiting of the search results
)

# The query explicitly asks for "1" item
retriever.invoke("2023년에 출시된 상품 1개를 추천해주세요")

In [14]:
# The query explicitly asks for "2" items
retriever.invoke("2023년에 출시된 상품 2개를 추천해주세요")

In [15]:
from langchain.chains.query_constructor.base import (  
    StructuredQueryOutputParser,  
    get_query_constructor_prompt,  
)  

# Get the query-construction prompt from the document content description
# and the metadata field info
prompt = get_query_constructor_prompt(
    "Brief summary of a cosmetic product",  # description of the document contents
    metadata_field_info,  # metadata field info
)

# Create the parser that parses the structured query output
output_parser = StructuredQueryOutputParser.from_components()

# Build the query constructor chain: prompt -> LLM -> parsing
query_constructor = prompt | llm | output_parser

In [16]:
# Print the prompt template, formatted with a placeholder query
print(prompt.format(query="dummy question"))

In [17]:
# Run the query constructor chain on its own and keep the result for inspection
query_output = query_constructor.invoke(
    {
        "query": "2023년도에 출시한 상품 중 평점이 4.5 이상인 상품중에서 스킨케어 제품을 추천해주세요"
    }
)

In [18]:
# Show the extracted filter conditions
# NOTE(review): assumes the constructed query has a filter; if `filter` can be None this raises -- confirm
query_output.filter.arguments

In [19]:
from langchain.retrievers.self_query.chroma import ChromaTranslator  

# Create a retriever from the custom query constructor and a translator
retriever = SelfQueryRetriever(
    query_constructor=query_constructor,  # the custom query constructor built above
    vectorstore=vectorstore,  # vector store
    structured_query_translator=ChromaTranslator(),  # query translator for Chroma
)

In [20]:
# Query the current `retriever` with year, rating and category conditions
retriever.invoke(
    "2023년도에 출시한 상품 중 평점이 4.5 이상인 상품중에서 스킨케어 제품을 추천해주세요"
)