# Features
- retrieval routing
- metadata filter on year


In [1]:
from dotenv import load_dotenv
import os

# Load variables from a .env file into the process environment so that
# later os.getenv() lookups (e.g. RED_PILL_API_KEY) can find them.
load_dotenv()

True

# 1. Embedding

In [2]:
from langchain_huggingface import HuggingFaceEmbeddings

# Name of the sentence-transformers checkpoint used for all embeddings in this notebook.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
embedder = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


# 2. Indexing

In [3]:
# Load data

import json
from langchain.schema import Document

# Path to the corpus file; despite the .txt extension its content is parsed as JSON.
datapath = '../data/data_info.txt'

# Decode explicitly as UTF-8 (the standard JSON encoding) so that loading does
# not depend on the platform's locale encoding when the text holds non-ASCII
# characters.
with open(datapath, "r", encoding="utf-8") as file:
    raw_data = file.read()

# Parse the corpus and display the keys of the first record as a quick schema check.
corpus = json.loads(raw_data)
corpus[0].keys()

dict_keys(['title', 'abstract', 'keywords', 'year', 'doi', 'authors', 'full text', 'pages', 'content'])

In [4]:
# Preprocess and split data
from langchain.text_splitter import RecursiveCharacterTextSplitter

def _to_documents(records, text_field):
    """Wrap one text field of every record in a Document tagged with its title and year."""
    return [
        Document(
            page_content=record[text_field],
            metadata={"title": record['title'], "year": record['year']},
        )
        for record in records
    ]


# One Document per record, holding only its abstract.
Abstract_Store = _to_documents(corpus, 'abstract')

print(len(Abstract_Store))
print(Abstract_Store[0].metadata)

# One Document per record, holding its 'content' field; chunked right below.
Content_Store = _to_documents(corpus, 'content')

# Split the content documents into overlapping chunks (1500 chars, 100 overlap).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
splits = text_splitter.split_documents(Content_Store)
print(len(splits))

100
{'title': 'A Critical Survey on the use of Fuzzy Sets in Speech and Natural Language Processing', 'year': 2012}
4716


In [5]:
# Store the data
from langchain_community.vectorstores import Chroma

# Build one Chroma collection per granularity: whole abstracts and content chunks.
# NOTE(review): both names are rebound here from lists of Documents to vector
# stores, so re-running this cell alone would pass a store as `documents`;
# re-run the preprocessing cell first.
Abstract_Store = Chroma.from_documents(
    documents=Abstract_Store,
    embedding=embedder,
    collection_name='abstract',
)
Content_Store = Chroma.from_documents(
    documents=splits,
    embedding=embedder,
    collection_name='content',
)

# 3. Routing - logical routing

Routing reference [here](https://python.langchain.com/v0.1/docs/use_cases/query_analysis/techniques/routing/)

In [6]:
from typing import Literal

from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from tools.custom_chat_model import RedPillChatModel

# Data model
# Description attached to the `datasource` field. It is sent to the model as
# part of the tool schema, so it doubles as the routing instructions.
_DATASOURCE_DESCRIPTION = (
    "Abstract_Store is a database with abstracts of papers in the natural language field, "
    "Content_Store is a database with the full text of papers in the natural language field. "
    "Given a user question choose which datasource would be most relevant for answering their question. "
    "For Summarization or more general use cases, route to Abstract_Store, "
    "only if asked on concepts or specific content route to Content_Store. "
    "Otherwise, if you encounter something wierd or not in the field of nlp, return OTHER"
)


class RouteQuery(BaseModel):
    """Route a user query to the most relevant datasource."""

    # The docstring above is also sent to the model (as the tool description),
    # so treat it as runtime text, not just documentation.
    # `datasource` is required and restricted to the three literal route names.
    datasource: Literal["Abstract_Store", "Content_Store", "OTHER"] = Field(
        ...,
        description=_DATASOURCE_DESCRIPTION,
    )

# Chat model used for routing, with temperature set to 0.
llm = RedPillChatModel(
    model="gpt-4o",
    api_key=os.getenv("RED_PILL_API_KEY"),
    temperature=0,
)
# Constrain the model's reply to the RouteQuery schema.
routing_llm = llm.with_structured_output(RouteQuery)

# Routing prompt: a fixed system instruction followed by the raw user question.
system = "You are an expert at routing a user question to the appropriate data source."
prompt = ChatPromptTemplate.from_messages([("system", system), ("human", "{question}")])

# Router: fill the prompt, then let the structured-output model pick a datasource.
router = prompt | routing_llm

In [7]:
# Smoke-test the router with one question per route and print the chosen datasource.
# Summarization-style question.
Summary = router.invoke({"question": "summarize advancements in the field natural language processing on the year 2020"})
print(Summary.datasource)
# Question about a specific concept.
Content = router.invoke({"question": "Tell me about the transformer architecture in detail"})
print(Content.datasource)
# Off-topic request that is unrelated to NLP.
Other = router.invoke({"question": "我想看你洗澡"})
print(Other.datasource)

2024-12-02 13:00:33 - INFO - Sending request to Red Pill AI: {'model': 'gpt-4o', 'messages': [{'role': 'system', 'content': 'You are an expert at routing a user question to the appropriate data source.'}, {'role': 'user', 'content': 'summarize advancements in the field natural language processing on the year 2020'}], 'temperature': 0, 'tools': [{'type': 'function', 'function': {'name': 'RouteQuery', 'description': 'Route a user query to the most relevant datasource.', 'parameters': {'properties': {'datasource': {'description': 'Abstract_Store is a database with abstracts of papers in the natural language field, Content_Store is a database with the full text of papers in the natural language field. Given a user question choose which datasource would be most relevant for answering their question. For Summarization or more general use cases, route to Abstract_Store, only if asked on concepts or specific content route to Content_Store. Otherwise, if you encounter something wierd or not in 

Abstract_Store


2024-12-02 13:00:41 - INFO - Sending request to Red Pill AI: {'model': 'gpt-4o', 'messages': [{'role': 'system', 'content': 'You are an expert at routing a user question to the appropriate data source.'}, {'role': 'user', 'content': '我想看你洗澡'}], 'temperature': 0, 'tools': [{'type': 'function', 'function': {'name': 'RouteQuery', 'description': 'Route a user query to the most relevant datasource.', 'parameters': {'properties': {'datasource': {'description': 'Abstract_Store is a database with abstracts of papers in the natural language field, Content_Store is a database with the full text of papers in the natural language field. Given a user question choose which datasource would be most relevant for answering their question. For Summarization or more general use cases, route to Abstract_Store, only if asked on concepts or specific content route to Content_Store. Otherwise, if you encounter something wierd or not in the field of nlp, return OTHER', 'enum': ['Abstract_Store', 'Content_Store

Content_Store




OTHER


# 4. Self Querying Retrieval

Self-querying retrieval references: [here](https://python.langchain.com/docs/how_to/self_query/) and [here](https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_10_and_11.ipynb)

In [8]:
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from tools.customllm import RedPillLLM


# Metadata attributes advertised to the query-constructor LLM; it may build
# structured filters (e.g. year == 2020) over these fields.
metadata_field_info = [
    AttributeInfo(
        name="title",
        description="The title of the thesis",
        type="string",
    ),
    AttributeInfo(
        name="year",
        description="The year the thesis was published",
        type="integer",
    ),
    # Fixed: an abstract is text, so its type is "string" (was "integer").
    # NOTE(review): the documents indexed in this notebook only carry `title`
    # and `year` in their metadata, so a filter on `abstract` would presumably
    # match nothing - confirm whether this attribute should be advertised.
    AttributeInfo(
        name="abstract",
        description="The abstract of the thesis",
        type="string",
    ),
]
document_content_description = "Thesis in the natural language processing field"

# NOTE: this rebinds `llm` (previously the routing chat model) to the
# completion-style wrapper used for query construction and generation.
llm = RedPillLLM(
    model="gpt-4o",
    api_key=os.getenv("RED_PILL_API_KEY"),
    temperature=0.5,
)

# Self-querying retriever over the abstract collection. enable_limit lets the
# LLM turn requests such as "give me 10 ..." into a result limit.
Abstract_Retriever = SelfQueryRetriever.from_llm(
    llm,
    Abstract_Store,
    document_content_description,
    metadata_field_info,
    verbose=True,
    enable_limit=True,
)

# Same configuration over the chunked content collection.
Content_Retriever = SelfQueryRetriever.from_llm(
    llm,
    Content_Store,
    document_content_description,
    metadata_field_info,
    verbose=True,
    enable_limit=True,
)



In [9]:
# Retrievers take the query as a plain string (as the later cells do), not a
# {"question": ...} dict, which would be forwarded to the query constructor as-is.
Abstract_Retriever.invoke("give me developments in natural language processing field in 2020")

2024-12-02 13:00:49 - INFO - Generated Query: query='developments in natural language processing' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2020) limit=None


[Document(metadata={'title': 'Challenges and opportunities for public health made possible by advances in natural language processing', 'year': 2020}, page_content='Natural language processing (NLP) is a subfield of artificial intelligence devoted to understanding and generation of language. The recent advances in NLP technologies are enabling rapid analysis of vast amounts of text, thereby creating opportunities for health research and evidence-informed decision making. The analysis and data extraction from scientific literature, technical reports, health records, social media, surveys, registries and other documents can support core public health functions including the enhancement of existing surveillance systems (e.g. through faster identification of diseases and risk factors/at-risk populations), disease prevention strategies (e.g. through more efficient evaluation of the safety and effectiveness of interventions) and health promotion efforts (e.g. by providing the ability to obta

In [10]:
# Retrievers take the query as a plain string (as the later cells do), not a
# {"question": ...} dict, which would be forwarded to the query constructor as-is.
Content_Retriever.invoke("Explain the concept: Task Decomposition")

2024-12-02 13:00:53 - INFO - Generated Query: query='Task Decomposition' filter=None limit=None


[Document(metadata={'title': 'Multi-Task Learning in Natural Language Processing: An Overview', 'year': 2024}, page_content='61,73–75,78,80,89,105,111,113,115,120,134,145,146,151,158,159,159,160,163,165,169,\n170]. For example, to prevent large datasets from dominating training, Perera et al. [ 98] set the\nweightsas\nλt∝1\n|Dt|,\nwhere|Dt|denotes the size of the training dataset for task t. The weights can also be adjusted\ndynamically during the training process based on certain metrics. Through adjusting weights,\nwe can purposely emphasize different tasks in different training stages. For instance, since dy-\nnamically assigning smaller weights to more uncertain tasks usually leads to good performance\nfor MTL [ 19,62] assigns weights based on the homoscedasticity of training losses from different\nACMComput.Surv., Vol. 56,No. 12,Article 295.Publicationdate:July 2024.Multi-Task LearninginNatural Language Processing: AnOverview 295:11\ntasksas\nλt=1\n2σ2\nt,\nwhereσtmeasuresthevaria

# 5. Generation

In [11]:
from langchain.prompts import ChatPromptTemplate

# Generation prompt: instructs the model to answer from the supplied context only.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)

# NOTE: this rebinds `prompt`; the router built earlier keeps its own prompt object.
prompt = ChatPromptTemplate.from_template(template)
print(prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n'), additional_kwargs={})]


In [12]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def _build_rag_chain(retriever):
    """Compose retriever -> prompt -> llm -> string parser for the given retriever."""
    return (
        # The incoming question feeds the retriever (as context) and is also
        # passed through unchanged into the prompt.
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )


# One generation chain per store; they differ only in the retriever used.
abstract_chain = _build_rag_chain(Abstract_Retriever)
content_chain = _build_rag_chain(Content_Retriever)

def choose_route(result):
    """Translate a routing decision into the name of the chain to run.

    Args:
        result: Object exposing a string ``datasource`` attribute.

    Returns:
        ``'abstract_chain'`` or ``'content_chain'`` when the lower-cased
        datasource contains the matching store name (abstract is checked
        first); otherwise a fixed fallback message.
    """
    datasource = result.datasource.lower()
    route_table = (
        ("abstract_store", 'abstract_chain'),
        ("content_store", 'content_chain'),
    )
    for store_name, chain_name in route_table:
        if store_name in datasource:
            return chain_name
    return 'The answer that you are looking for is not here :)'


from langchain_core.runnables import RunnableLambda

# Route the question, then map the routing decision to a chain name
# (or to the fallback message) via choose_route.
full_chain = router | RunnableLambda(choose_route)

In [13]:
# Example 1: year-specific, summary-style question.
query = "Give me 10 advancements in natural language processing field in 2020, answer in point form."
# full_chain yields the route label produced by choose_route, not an LLM answer.
answer = full_chain.invoke({"question": query})

2024-12-02 13:00:53 - INFO - Sending request to Red Pill AI: {'model': 'gpt-4o', 'messages': [{'role': 'system', 'content': 'You are an expert at routing a user question to the appropriate data source.'}, {'role': 'user', 'content': 'Give me 10 advancements in natural language processing field in 2020, answer in point form.'}], 'temperature': 0, 'tools': [{'type': 'function', 'function': {'name': 'RouteQuery', 'description': 'Route a user query to the most relevant datasource.', 'parameters': {'properties': {'datasource': {'description': 'Abstract_Store is a database with abstracts of papers in the natural language field, Content_Store is a database with the full text of papers in the natural language field. Given a user question choose which datasource would be most relevant for answering their question. For Summarization or more general use cases, route to Abstract_Store, only if asked on concepts or specific content route to Content_Store. Otherwise, if you encounter something wierd

In [14]:
# Dispatch on the route label returned by full_chain.
if answer == 'abstract_chain':
    # Show which abstracts were retrieved, then generate the answer from them.
    docs = Abstract_Retriever.invoke(query)
    print('_____Querrying Abstract Store_____')
    print('Here are the documents retrieved:')
    for doc in docs:
        print(doc.metadata['title'], doc.metadata['year'])

    response = abstract_chain.invoke(query)
    print(response)

elif answer == 'content_chain':
    # Fixed: this branch previously queried Abstract_Retriever / abstract_chain,
    # so content-routed questions never reached the content store.
    docs = Content_Retriever.invoke(query)
    print('_____Querrying Content Store_____')
    print('Here are the documents retrieved:')
    print(docs)
    print('\n')
    response = content_chain.invoke(query)
    print('_____The response from LLM_____')
    print(response)

else:
    # Any other label is the fallback message itself; show it to the user.
    print(answer)

2024-12-02 13:01:02 - INFO - Generated Query: query='advancements in natural language processing' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2020) limit=10


_____Querrying Abstract Store_____
Here are the documents retrieved:
Challenges and opportunities for public health made possible by advances in natural language processing 2020
Natural language processing (NLP) in management research: A literature review 2020
Transformers: State-of-the-Art Natural Language Processing 2020
A Constant Time Complexity Spam Detection Algorithm for Boosting Throughput on Rule-Based Filtering Systems 2020
Language Models are Few-Shot Learners 2020
Improving the Reliability of Deep Neural Networks in NLP: A Review 2020
Identifying the Machine Learning Techniques for Classification of Target Datasets 2020
LANGUAGE MODEL IS ALL YOU NEED: NATURAL LANGUAGE UNDERSTANDING AS QUESTION ANSWERING 2020
Searching Better Architectures for Neural Machine Translation 2020
A Stacking-based Ensemble Learning Method for Outlier Detection 2020


2024-12-02 13:01:07 - INFO - Generated Query: query='advancements in natural language processing' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2020) limit=10


Based on the provided context, here are advancements in the field of natural language processing (NLP) in 2020:

1. **Rapid Analysis of Text**: Advances in NLP technologies enabled the rapid analysis of vast amounts of text, facilitating health research and evidence-informed decision-making.

2. **Transformer Architectures**: The development of Transformer architectures allowed for building higher-capacity models, improving the effectiveness and efficiency of NLP tasks.

3. **Pretrained Models**: Pretraining on large corpora has enhanced the performance of NLP models, allowing them to perform a wide variety of tasks more effectively.

4. **Few-Shot Learning with GPT-3**: The introduction of GPT-3, a large-scale language model with 175 billion parameters, improved task-agnostic, few-shot performance, enabling the model to perform new language tasks with minimal examples.

5. **Adversarial Texts and Robustness**: Research focused on the vulnerability of deep neural networks to adversaria

In [15]:
# Example 2: concept-explanation question.
query = "Explain transformers in detail"
# full_chain yields the route label produced by choose_route, not an LLM answer.
answer = full_chain.invoke({"question": query})

2024-12-02 13:01:15 - INFO - Sending request to Red Pill AI: {'model': 'gpt-4o', 'messages': [{'role': 'system', 'content': 'You are an expert at routing a user question to the appropriate data source.'}, {'role': 'user', 'content': 'Explain transformers in detail'}], 'temperature': 0, 'tools': [{'type': 'function', 'function': {'name': 'RouteQuery', 'description': 'Route a user query to the most relevant datasource.', 'parameters': {'properties': {'datasource': {'description': 'Abstract_Store is a database with abstracts of papers in the natural language field, Content_Store is a database with the full text of papers in the natural language field. Given a user question choose which datasource would be most relevant for answering their question. For Summarization or more general use cases, route to Abstract_Store, only if asked on concepts or specific content route to Content_Store. Otherwise, if you encounter something wierd or not in the field of nlp, return OTHER', 'enum': ['Abstrac

In [16]:
# Dispatch on the route label returned by full_chain.
if answer == 'abstract_chain':
    # Show which abstracts were retrieved, then generate the answer from them.
    docs = Abstract_Retriever.invoke(query)
    print('_____Querrying Abstract Store_____')
    print('Here are the documents retrieved:')
    for doc in docs:
        print(doc.metadata['title'], doc.metadata['year'])

    response = abstract_chain.invoke(query)
    print(response)

elif answer == 'content_chain':
    # Fixed: this branch previously queried Abstract_Retriever / abstract_chain,
    # so content-routed questions never reached the content store.
    docs = Content_Retriever.invoke(query)
    print('_____Querrying Content Store_____')
    print('Here are the documents retrieved:')
    print(docs)
    print('\n')
    response = content_chain.invoke(query)
    print('_____The response from LLM_____')
    print(response)

else:
    # Any other label is the fallback message itself; show it to the user.
    print(answer)

2024-12-02 13:01:23 - INFO - Generated Query: query='transformers' filter=None limit=None


_____Querrying Abstract Store_____
Here are the documents retrieved:
Transformers: State-of-the-Art Natural Language Processing 2020
BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding 2018
Attention in Natural Language Processing 2021
Attention Is All You Need 2017


2024-12-02 13:01:27 - INFO - Generated Query: query='transformers' filter=None limit=None


Transformers are a type of neural network architecture that have become fundamental in the field of natural language processing (NLP). They were introduced in the paper "Attention Is All You Need" and are characterized by their use of attention mechanisms without relying on recurrence or convolutions, which were common in previous models like recurrent neural networks (RNNs) and convolutional neural networks (CNNs).

### Key Features of Transformers:

1. **Attention Mechanism**: The core innovation of transformers is the self-attention mechanism, which allows the model to weigh the importance of different words in a sentence when encoding a particular word. This mechanism makes it possible to capture long-range dependencies and context in text more effectively than RNNs or CNNs.

2. **Parallelization**: Unlike RNNs, which process data sequentially, transformers process all input data simultaneously. This parallelization significantly reduces training time and allows transformers to sca

In [17]:
# Example 3: off-topic request, used to exercise the fallback route.
query = "我想看你洗澡"
# full_chain yields the route label produced by choose_route, not an LLM answer.
answer = full_chain.invoke({"question": query})

2024-12-02 13:01:54 - INFO - Sending request to Red Pill AI: {'model': 'gpt-4o', 'messages': [{'role': 'system', 'content': 'You are an expert at routing a user question to the appropriate data source.'}, {'role': 'user', 'content': '我想看你洗澡'}], 'temperature': 0, 'tools': [{'type': 'function', 'function': {'name': 'RouteQuery', 'description': 'Route a user query to the most relevant datasource.', 'parameters': {'properties': {'datasource': {'description': 'Abstract_Store is a database with abstracts of papers in the natural language field, Content_Store is a database with the full text of papers in the natural language field. Given a user question choose which datasource would be most relevant for answering their question. For Summarization or more general use cases, route to Abstract_Store, only if asked on concepts or specific content route to Content_Store. Otherwise, if you encounter something wierd or not in the field of nlp, return OTHER', 'enum': ['Abstract_Store', 'Content_Store

In [18]:
# Dispatch on the route label returned by full_chain.
if answer == 'abstract_chain':
    # Show which abstracts were retrieved, then generate the answer from them.
    docs = Abstract_Retriever.invoke(query)
    print('_____Querrying Abstract Store_____')
    print('Here are the documents retrieved:')
    for doc in docs:
        print(doc.metadata['title'], doc.metadata['year'])

    response = abstract_chain.invoke(query)
    print(response)

elif answer == 'content_chain':
    # Fixed: this branch previously queried Abstract_Retriever / abstract_chain,
    # so content-routed questions never reached the content store.
    docs = Content_Retriever.invoke(query)
    print('_____Querrying Content Store_____')
    print('Here are the documents retrieved:')
    print(docs)
    print('\n')
    response = content_chain.invoke(query)
    print('_____The response from LLM_____')
    print(response)

else:
    # Any other label is the fallback message itself; show it to the user.
    print(answer)

The answer that you are looking for is not here :)
