# Features
- retrieval routing
- metadata filter on year


In [1]:
from dotenv import load_dotenv
import os

load_dotenv()

True

## 1. Embedding

In [2]:
from langchain_huggingface import HuggingFaceEmbeddings

embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

  from .autonotebook import tqdm as notebook_tqdm


# 2. Indexing

In [3]:
# Load data

import json
from langchain.schema import Document

datapath = '../data/data_info.txt'

with open(datapath, "r") as file:
    raw_data = file.read()

corpus = json.loads(raw_data)
corpus[0].keys()

dict_keys(['title', 'abstract', 'keywords', 'year', 'doi', 'authors', 'full text', 'pages', 'content'])

In [4]:
# Preprocess and split data
from langchain.text_splitter import RecursiveCharacterTextSplitter

Abstract_Store = []

for thesis in corpus:
    document = Document(
    page_content=thesis['content'],
    metadata={
        "title": thesis['title'],
        "year": thesis['year'],
        "abstract": thesis['abstract']
    })
    Abstract_Store.append(document)

print(len(Abstract_Store))
print(Abstract_Store[0].metadata)

Content_Store = []

for thesis in corpus:
    document = Document(
    page_content=thesis['content'],
    metadata={
        "title": thesis['title'],
        "year": thesis['year'],
        "abstract": thesis['abstract']
    })
    Content_Store.append(document)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
splits = text_splitter.split_documents(Content_Store)
print(len(splits))

100
{'title': 'A Critical Survey on the use of Fuzzy Sets in Speech and Natural Language Processing', 'year': 2012, 'abstract': 'This paper shows how the use and applications of Fuzzy Sets (FS) in Speech and Natural Language Processing (SNLP) have seen a steady decline to a point where FS are virtually unknown or unappealing for most of the researchers currently working in the SNLP field, tries to find the reasons behind this decline, and proposes some guidelines on what could be done to reverse it and make FS assume a relevant role in SNLP.'}
4716


In [5]:
# Store the data
from langchain_community.vectorstores import Chroma

Abstract_Store = Chroma.from_documents(documents=Abstract_Store, embedding=embedder)
Content_Store = Chroma.from_documents(documents=splits, embedding=embedder)

# 3.Routing - logical routing

Routing reference [here](https://python.langchain.com/v0.1/docs/use_cases/query_analysis/techniques/routing/)

In [6]:
from typing import Literal

from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from tools.custom_chat_model import RedPillChatModel

# Data model
class RouteQuery(BaseModel):
    """Route a user query to the most relevant datasource."""

    datasource: Literal["Abstract_Store", "Content_Store"] = Field(
        ...,
        description="Abstract_Store is a database with abstracts of papers in the natural language field, Content_Store is a databnase with the full text of papers in the natural language field. Given a user question choose which datasource would be most relevant for answering their question. For Summarization or more general use cases, route to Abstract_Store, only if asked on concepts or specific content route to Content_Store.",
    )

# LLM with function call 
llm = RedPillChatModel(model="gpt-4o", 
                 api_key=os.getenv("RED_PILL_API_KEY"),
                 temperature = 0)
routing_llm = llm.with_structured_output(RouteQuery)

# Prompt 
system = """You are an expert at routing a user question to the appropriate data source.

Based on the programming language the question is referring to, route it to the relevant data source."""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)

# Define router 
router = prompt | routing_llm

# 4. Self Querying Retrieval

self querying retrieval reference [here](https://python.langchain.com/docs/how_to/self_query/) and [here](https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_10_and_11.ipynb)

In [7]:
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from tools.customllm import RedPillLLM


metadata_field_info = [
    AttributeInfo(
        name="title",
        description="The title of the thesis",
        type="string",
    ),
    AttributeInfo(
        name="year",
        description="The year the thesis was published",
        type="integer",
    ),
    AttributeInfo(
        name="abstract",
        description="The abstract of the thesis",
        type="integer",
    ),
]
document_content_description = "Thesis in the natural language processing field"

llm = RedPillLLM(model="gpt-4o", 
                 api_key=os.getenv("RED_PILL_API_KEY"),
                 temperature = 0.5)

Abstract_Retriever = SelfQueryRetriever.from_llm(
    llm,
    Abstract_Store,
    document_content_description,
    metadata_field_info,
    verbose=True,
    enable_limit=True,
)

Content_Retriever = SelfQueryRetriever.from_llm(
    llm,
    Content_Store,
    document_content_description,
    metadata_field_info,
    verbose=True,
    enable_limit=True,
)

# 3. Generation

In [8]:
from langchain.prompts import ChatPromptTemplate

template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
print(prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n'), additional_kwargs={})]


In [18]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

abstract_chain = (
    {"context": Abstract_Retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

content_chain = (
    {"context": Content_Retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

def choose_route(result):
    if "abstract_store" in result.datasource.lower():
        print('here')
        return abstract_chain
    elif "content_store" in result.datasource.lower():
        return content_chain


from langchain_core.runnables import RunnableLambda

full_chain = router | RunnableLambda(choose_route)

In [19]:
answer = full_chain.invoke({"question": "综合总结2020里边nlp相关的事件以及发展"})

2024-12-01 17:52:41 - INFO - Sending request to Red Pill AI: {'model': 'gpt-4o', 'messages': [{'role': 'system', 'content': 'You are an expert at routing a user question to the appropriate data source.\n\nBased on the programming language the question is referring to, route it to the relevant data source.'}, {'role': 'user', 'content': '综合总结2020里边nlp相关的事件以及发展'}], 'temperature': 0, 'tools': [{'type': 'function', 'function': {'name': 'RouteQuery', 'description': 'Route a user query to the most relevant datasource.', 'parameters': {'properties': {'datasource': {'description': 'Abstract_Store is a database with abstracts of papers in the natural language field, Content_Store is a databnase with the full text of papers in the natural language field. Given a user question choose which datasource would be most relevant for answering their question. For Summarization or more general use cases, route to Abstract_Store, only if asked on concepts or specific content route to Content_Store.', 'enu

here


2024-12-01 17:52:49 - INFO - Generated Query: query=' ' filter=None limit=None


In [20]:
answer

'The context provided from the "Abstract_Store" does not contain specific information directly related to a question about "datasource." The documents primarily discuss semantic search in natural language processing, covering topics like its importance, challenges, technologies involved, and future trends. If you have a specific question related to this context, please provide more details so I can assist you better.'

In [12]:
# question = """ Summarize advancements in natural language processing in 2020
# """
# router.invoke({"question": question})

# # we need to add "the concept for it to get the correct answer"
# question = """ Tell me about the concept Task Decomposition
# """
# result = router.invoke({"question": question})
# result.datasource

Generation

In [13]:
querry = '综合总结2020里边nlp相关的事件以及发展'

docs = retriever.invoke(querry)
for doc in docs:
    print(doc.metadata['title'])

answer = rag_chain.invoke(querry)
print(answer)

NameError: name 'retriever' is not defined