### Self-Query Retriever Test

In [None]:
"""
< Self-Query Retriever >

문서의 메타데이터를 기반으로 필터 조건을 생성 --> 더욱 정확한 답변 가능
마치 데이터 테이블에 SQL 쿼리를 날리는 것과 같음

Self-Query Retriever를 통해 데이터를 가져올 때는, 질문이 데이터를
필터링하는 형태의 질문/작업일 때 적합함 (excel, csv 형태의 데이터일 때 더 유용)

** pip install lark 사전 설치 필요

"""

In [1]:
# Library
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings


# Sample movie documents. `page_content` holds a one-line plot summary and
# `metadata` holds the attributes the retriever can filter on.
# Note: the metadata keys are not uniform -- some entries have no "rating",
# "genre" or "director", so a filter on a missing field cannot match them.
docs = [
    Document(
        page_content="A bunch of scientists bring back dinosaurs and mayhem breaks loose",
        metadata={"year": 1993, "rating": 7.7, "genre": "science fiction"},
    ),
    Document(
        page_content="Leo DiCaprio gets lost in a dream within a dream within a dream within a ...",
        metadata={"year": 2010, "director": "Christopher Nolan", "rating": 8.2},
    ),
    Document(
        page_content="A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea",
        metadata={"year": 2006, "director": "Satoshi Kon", "rating": 8.6},
    ),
    Document(
        page_content="A bunch of normal-sized women are supremely wholesome and some men pine after them",
        metadata={"year": 2019, "director": "Greta Gerwig", "rating": 8.3},
    ),
    Document(
        page_content="Toys come alive and have a blast doing so",
        metadata={"year": 1995, "genre": "animated"},
    ),
    Document(
        page_content="Three men walk into the Zone, three men walk out of the Zone",
        metadata={
            "year": 1979,
            "director": "Andrei Tarkovsky",
            "genre": "thriller",
            "rating": 9.9,
        },
    ),
]

# Embedding model. `encode_kwargs` is handed to HuggingFaceEmbeddings, which
# forwards it to the underlying encoder; here it requests normalized vectors.
model_name = "jhgan/ko-sbert-nli"
encode_kwargs = {"normalize_embeddings": True}
ko_embedding = HuggingFaceEmbeddings(
    model_name=model_name,
    encode_kwargs=encode_kwargs,
)

# Vectorstore.
# Stable, position-based ids make this cell safe to re-run: without explicit
# ids every execution gets fresh random ids, so re-running the cell in the same
# kernel would add the same six documents to the default collection again and
# the retriever would start returning duplicates. With fixed ids the existing
# entries are overwritten instead.
doc_ids = [f"movie-{idx}" for idx in range(len(docs))]
vectorstore = Chroma.from_documents(docs, ko_embedding, ids=doc_ids)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Library
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chat_models import ChatOllama

# Metadata schema for the documents. Each AttributeInfo gives the field's
# name, a natural-language description and its type; the list is passed to
# SelfQueryRetriever.from_llm below.
metadata_field_info = [
    AttributeInfo(
        name="genre",
        description="The genre of the movie. One of ['science fiction', 'comedy', 'drama', 'thriller', 'romance', 'action', 'animated']",
        type="string",
    ),
    AttributeInfo(
        name="year",
        description="The year of the movie released",
        type="integer",
    ),
    AttributeInfo(
        name="director",
        description="The name of the movie director",
        type="string",
    ),
    AttributeInfo(
        name="rating",
        description="A 1-10 rating for the movie",
        type="float",
    ),
]

# Short description of what each document's page_content contains.
document_content_description = "Brief summary of a movie"

# temperature=0 keeps the generated filter as deterministic as the model
# allows, so the same question yields the same structured query on re-runs.
llm = ChatOllama(model="wizardlm2", temperature=0)

# Build the self-query retriever from the LLM, the vector store, the content
# description and the metadata schema. verbose=True is kept from the original.
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
    verbose=True,
)

In [3]:
# Ask (in Korean) which movies have a rating above 8.5.
# `invoke` replaces the deprecated `get_relevant_documents`, which emits a
# deprecation warning on every call.
retriever.invoke("rating이 8.5보다 높은 영화는 어떤게 있어?")

  warn_deprecated(


[Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'director': 'Satoshi Kon', 'rating': 8.6, 'year': 2006}),
 Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'director': 'Andrei Tarkovsky', 'genre': 'thriller', 'rating': 9.9, 'year': 1979})]