# Features
- retrieval routing
- metadata filter on year


In [1]:
from dotenv import load_dotenv
import os

load_dotenv()

True

## 1. Embedding

In [2]:
from langchain_huggingface import HuggingFaceEmbeddings

embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

  from .autonotebook import tqdm as notebook_tqdm


# 2. Indexing

In [3]:
# Load data

import json
from langchain.schema import Document

datapath = '../data/data_info.txt'

with open(datapath, "r") as file:
    raw_data = file.read()

corpus = json.loads(raw_data)
corpus[0].keys()

dict_keys(['title', 'abstract', 'keywords', 'year', 'doi', 'authors', 'full text', 'pages', 'content'])

In [4]:
# Preprocess and split data
from langchain.text_splitter import RecursiveCharacterTextSplitter

Abstract_Store = []

for thesis in corpus:
    document = Document(
    page_content=thesis['abstract'],
    metadata={
        "title": thesis['title'],
        "year": thesis['year'],
    })
    Abstract_Store.append(document)

print(len(Abstract_Store))
print(Abstract_Store[0].metadata)

Content_Store = []

for thesis in corpus:
    document = Document(
    page_content=thesis['content'],
    metadata={
        "title": thesis['title'],
        "year": thesis['year'],
    })
    Content_Store.append(document)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
splits = text_splitter.split_documents(Content_Store)
print(len(splits))

100
{'title': 'A Critical Survey on the use of Fuzzy Sets in Speech and Natural Language Processing', 'year': 2012}
4716


In [5]:
# Store the data
from langchain_community.vectorstores import Chroma

Abstract_Store = Chroma.from_documents(documents=Abstract_Store, embedding=embedder)
#Content_Store = Chroma.from_documents(documents=splits, embedding=embedder)

# 3.Routing - logical routing

Routing reference [here](https://python.langchain.com/v0.1/docs/use_cases/query_analysis/techniques/routing/)

In [6]:
from typing import Literal

from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from tools.custom_chat_model import RedPillChatModel

# Data model
class RouteQuery(BaseModel):
    """Route a user query to the most relevant datasource."""

    datasource: Literal["Abstract_Store", "Content_Store"] = Field(
        ...,
        description="Abstract_Store is a database with abstracts of papers in the natural language field, Content_Store is a database with the full text of papers in the natural language field. Given a user question choose which datasource would be most relevant for answering their question. For Summarization or more general use cases, route to Abstract_Store, only if asked on concepts or specific content route to Content_Store.",
    )

# LLM with function call 
llm = RedPillChatModel(model="gpt-4o", 
                 api_key=os.getenv("RED_PILL_API_KEY"),
                 temperature = 0)
routing_llm = llm.with_structured_output(RouteQuery)

# Prompt 
system = """You are an expert at routing a user question to the appropriate data source."""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)

# Define router 
router = prompt | routing_llm

In [None]:
answer = router.invoke({"question": "summarize advancements in the field natural language processing on the year 2020"})

2024-12-02 11:32:04 - INFO - Sending request to Red Pill AI: {'model': 'gpt-4o', 'messages': [{'role': 'system', 'content': 'You are an expert at routing a user question to the appropriate data source.'}, {'role': 'user', 'content': 'summarize advancements in the field natural language processing on the year 2020'}], 'temperature': 0, 'tools': [{'type': 'function', 'function': {'name': 'RouteQuery', 'description': 'Route a user query to the most relevant datasource.', 'parameters': {'properties': {'datasource': {'description': 'Abstract_Store is a database with abstracts of papers in the natural language field, Content_Store is a database with the full text of papers in the natural language field. Given a user question choose which datasource would be most relevant for answering their question. For Summarization or more general use cases, route to Abstract_Store, only if asked on concepts or specific content route to Content_Store.', 'enum': ['Abstract_Store', 'Content_Store'], 'type':



In [8]:
answer.datasource

'Abstract_Store'

# 4. Self Querying Retrieval

self querying retrieval reference [here](https://python.langchain.com/docs/how_to/self_query/) and [here](https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_10_and_11.ipynb)

In [10]:
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from tools.customllm import RedPillLLM


metadata_field_info = [
    AttributeInfo(
        name="title",
        description="The title of the thesis",
        type="string",
    ),
    AttributeInfo(
        name="year",
        description="The year the thesis was published",
        type="integer",
    ),
    AttributeInfo(
        name="abstract",
        description="The abstract of the thesis",
        type="integer",
    ),
]
document_content_description = "Thesis in the natural language processing field"

llm = RedPillLLM(model="gpt-4o", 
                 api_key=os.getenv("RED_PILL_API_KEY"),
                 temperature = 0.5)

Abstract_Retriever = SelfQueryRetriever.from_llm(
    llm,
    Abstract_Store,
    document_content_description,
    metadata_field_info,
    verbose=True,
    enable_limit=True,
)

# Content_Retriever = SelfQueryRetriever.from_llm(
#     llm,
#     Content_Store,
#     document_content_description,
#     metadata_field_info,
#     verbose=True,
#     enable_limit=True,
# )

In [12]:
Abstract_Retriever.invoke({"question":"give me developments in natural language processing field in 2020"})

2024-12-02 11:32:48 - INFO - Generated Query: query='developments in natural language processing' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2020) limit=None


[Document(metadata={'title': 'Challenges and opportunities for public health made possible by advances in natural language processing', 'year': 2020}, page_content='Natural language processing (NLP) is a subfield of artificial intelligence devoted to understanding and generation of language. The recent advances in NLP technologies are enabling rapid analysis of vast amounts of text, thereby creating opportunities for health research and evidence-informed decision making. The analysis and data extraction from scientific literature, technical reports, health records, social media, surveys, registries and other documents can support core public health functions including the enhancement of existing surveillance systems (e.g. through faster identification of diseases and risk factors/at-risk populations), disease prevention strategies (e.g. through more efficient evaluation of the safety and effectiveness of interventions) and health promotion efforts (e.g. by providing the ability to obta

# 3. Generation

In [13]:
from langchain.prompts import ChatPromptTemplate

template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
print(prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n'), additional_kwargs={})]


In [16]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

abstract_chain = (
    {"context": Abstract_Retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# content_chain = (
#     {"context": Content_Retriever, "question": RunnablePassthrough()}
#     | prompt
#     | llm
#     | StrOutputParser()
# )

def choose_route(result):
    if "abstract_store" in result.datasource.lower():
        return abstract_chain
    elif "content_store" in result.datasource.lower():
        return 'content_chain'


from langchain_core.runnables import RunnableLambda

full_chain = router | RunnableLambda(choose_route)

In [17]:
answer = full_chain.invoke({"question": "advancements in natural language processing field in 2020"})

2024-12-02 11:33:59 - INFO - Sending request to Red Pill AI: {'model': 'gpt-4o', 'messages': [{'role': 'system', 'content': 'You are an expert at routing a user question to the appropriate data source.'}, {'role': 'user', 'content': 'advancements in natural language processing field in 2020'}], 'temperature': 0, 'tools': [{'type': 'function', 'function': {'name': 'RouteQuery', 'description': 'Route a user query to the most relevant datasource.', 'parameters': {'properties': {'datasource': {'description': 'Abstract_Store is a database with abstracts of papers in the natural language field, Content_Store is a database with the full text of papers in the natural language field. Given a user question choose which datasource would be most relevant for answering their question. For Summarization or more general use cases, route to Abstract_Store, only if asked on concepts or specific content route to Content_Store.', 'enum': ['Abstract_Store', 'Content_Store'], 'type': 'string'}}, 'required'

In [18]:
answer

'Based on the provided context, the document titled "LLaMA: Open and Efficient Foundation Language Models" from 2023 discusses LLaMA, a collection of foundation language models. These models range from 7 billion to 65 billion parameters and are trained on trillions of tokens using publicly available datasets. The LLaMA-13B model outperforms GPT-3 on most benchmarks, and LLaMA-65B is competitive with other leading models like Chinchilla-70B and PaLM-540B. The models are released to the research community.'

In [None]:
# question = """ Summarize advancements in natural language processing in 2020
# """
# router.invoke({"question": question})

# # we need to add "the concept for it to get the correct answer"
# question = """ Tell me about the concept Task Decomposition
# """
# result = router.invoke({"question": question})
# result.datasource

Generation

In [None]:
querry = '综合总结2020里边nlp相关的事件以及发展'

docs = retriever.invoke(querry)
for doc in docs:
    print(doc.metadata['title'])

answer = rag_chain.invoke(querry)
print(answer)