In [26]:
import os
import getpass
from langchain import hub
from langchain_chroma import Chroma
from langchain_cohere import ChatCohere, CohereEmbeddings
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.messages import HumanMessage, AIMessage
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

In [2]:
# Ask for the Cohere key with a labelled prompt: a bare getpass() shows the
# generic "Password: " text, which does not say which secret is expected.
os.environ['COHERE_API_KEY'] = getpass.getpass('Cohere API key: ')

 ········


In [3]:
# Read the Cohere key back from the environment into a local variable.
# NOTE(review): this variable is not referenced again in the visible cells — confirm it is still needed.
cohere_api_key = os.environ['COHERE_API_KEY']

In [4]:
# LangSmith tracing configuration, read from the environment by LangChain.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
# Labelled prompt so this secret is not confused with the other API-key
# prompt in this notebook (a bare getpass() only shows "Password: ").
os.environ['LANGCHAIN_API_KEY'] = getpass.getpass('LangSmith API key: ')

 ········


In [5]:
# Load the paper straight from its arXiv PDF URL.
paper_url = 'https://arxiv.org/pdf/2405.10725'
loader = PDFPlumberLoader(paper_url)
docs = loader.load()

In [6]:
# The loader returned 12 Document objects in total; print them all to inspect
# their page_content and metadata.
print(docs)

[Document(page_content='INDUS: Effective and Efficient Language Models for Scientific Applications\nBishwaranjanBhattacharjee1,AashkaTrivedi1,MasayasuMuraoka1,\nMuthukumaranRamasubramanian3,TakumaUdagawa1,IkshaGurung3,RongZhang1,\nBharathDandala1,RahulRamachandran2,ManilMaskey2,KaylinBugbee2,MikeLittle4,\nElizabethFancher2,LaurenSanders5,SylvainCostes5,SergiBlanco-Cuaresma6,KellyLockhart6,\nThomasAllen6,FelixGrezes6,MeganAnsdell7,AlbertoAccomazzi6,YousefEl-Kurdi1,\nDavisWertheimer1,BirgitPfitzmann1,CesarBerrospiRamis1,MicheleDolfi1,RafaelTeixeiradeLima1,\nPanagiotisVagenas1,S.KarthikMukkavilli1,PeterStaar1,SanazVahidinia7,RyanMcGranaghan8,\nArminMehrabian9,TsendgarLee7\n1IBMResearchAI,2 NASAMFSC,3 UAH,4 Navteca,5 NASAAmes,6 Harvard-SmithsonianCfA,\n7 NASAHQ,8 JPL,9 NASAGSFC\nAbstract generation tasks. Most popular LLMs rely on the\ntransformer architecture (Vaswani et al., 2017)\nLargelanguagemodels(LLMs)trainedongen- and are trained using general-purpose corpora\neral domain corpora s

In [7]:
# Text extracted for the first document.
print(docs[0].page_content)

INDUS: Effective and Efficient Language Models for Scientific Applications
BishwaranjanBhattacharjee1,AashkaTrivedi1,MasayasuMuraoka1,
MuthukumaranRamasubramanian3,TakumaUdagawa1,IkshaGurung3,RongZhang1,
BharathDandala1,RahulRamachandran2,ManilMaskey2,KaylinBugbee2,MikeLittle4,
ElizabethFancher2,LaurenSanders5,SylvainCostes5,SergiBlanco-Cuaresma6,KellyLockhart6,
ThomasAllen6,FelixGrezes6,MeganAnsdell7,AlbertoAccomazzi6,YousefEl-Kurdi1,
DavisWertheimer1,BirgitPfitzmann1,CesarBerrospiRamis1,MicheleDolfi1,RafaelTeixeiradeLima1,
PanagiotisVagenas1,S.KarthikMukkavilli1,PeterStaar1,SanazVahidinia7,RyanMcGranaghan8,
ArminMehrabian9,TsendgarLee7
1IBMResearchAI,2 NASAMFSC,3 UAH,4 Navteca,5 NASAAmes,6 Harvard-SmithsonianCfA,
7 NASAHQ,8 JPL,9 NASAGSFC
Abstract generation tasks. Most popular LLMs rely on the
transformer architecture (Vaswani et al., 2017)
Largelanguagemodels(LLMs)trainedongen- and are trained using general-purpose corpora
eral domain corpora showed remarkable re-
like Wikipedia or

In [8]:
# Metadata attached to the first document (source, page number, PDF properties).
print(docs[0].metadata)

{'source': 'https://arxiv.org/pdf/2405.10725', 'file_path': 'https://arxiv.org/pdf/2405.10725', 'page': 0, 'total_pages': 12, 'Author': '', 'CreationDate': 'D:20240522001021Z', 'Creator': 'LaTeX with hyperref', 'Keywords': '', 'ModDate': 'D:20240522001021Z', 'PTEX.Fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'Producer': 'pdfTeX-1.40.25', 'Subject': '', 'Title': '', 'Trapped': 'False'}


In [9]:
# Split the loaded documents into overlapping chunks for embedding;
# add_start_index=True is passed so each chunk keeps its start offset.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
doc_splits = text_splitter.split_documents(docs)

In [10]:
# Splitting produced 57 chunks in total; print them to inspect the chunk contents.
print(doc_splits)

[Document(page_content='INDUS: Effective and Efficient Language Models for Scientific Applications\nBishwaranjanBhattacharjee1,AashkaTrivedi1,MasayasuMuraoka1,\nMuthukumaranRamasubramanian3,TakumaUdagawa1,IkshaGurung3,RongZhang1,\nBharathDandala1,RahulRamachandran2,ManilMaskey2,KaylinBugbee2,MikeLittle4,\nElizabethFancher2,LaurenSanders5,SylvainCostes5,SergiBlanco-Cuaresma6,KellyLockhart6,\nThomasAllen6,FelixGrezes6,MeganAnsdell7,AlbertoAccomazzi6,YousefEl-Kurdi1,\nDavisWertheimer1,BirgitPfitzmann1,CesarBerrospiRamis1,MicheleDolfi1,RafaelTeixeiradeLima1,\nPanagiotisVagenas1,S.KarthikMukkavilli1,PeterStaar1,SanazVahidinia7,RyanMcGranaghan8,\nArminMehrabian9,TsendgarLee7\n1IBMResearchAI,2 NASAMFSC,3 UAH,4 Navteca,5 NASAAmes,6 Harvard-SmithsonianCfA,\n7 NASAHQ,8 JPL,9 NASAGSFC\nAbstract generation tasks. Most popular LLMs rely on the\ntransformer architecture (Vaswani et al., 2017)\nLargelanguagemodels(LLMs)trainedongen- and are trained using general-purpose corpora\neral domain corpora s

In [11]:
# Embed every chunk with Cohere and index the vectors in a Chroma store,
# then expose the store through the retriever interface used by the chains.
embeddings = CohereEmbeddings()
vectorstore = Chroma.from_documents(documents=doc_splits, embedding=embeddings)
retriever = vectorstore.as_retriever()

In [12]:
# Chat model used by the chains in this notebook (Cohere 'command-r').
model = ChatCohere(model='command-r')

In [13]:
# Fetch the 'rlm/rag-prompt' template from the LangChain hub; it takes
# 'context' and 'question' as input variables.
prompt = hub.pull('rlm/rag-prompt')

In [14]:
# Parser used as the last stage of the chain so the result is a plain string.
output_parser = StrOutputParser()

In [15]:
def format_docs(docs):
    """Return the page_content of every document in `docs`, separated by blank lines."""
    page_texts = [doc.page_content for doc in docs]
    separator = '\n\n'
    return separator.join(page_texts)

# Try the helper on the loaded documents.
format_docs(docs)

'INDUS: Effective and Efficient Language Models for Scientific Applications\nBishwaranjanBhattacharjee1,AashkaTrivedi1,MasayasuMuraoka1,\nMuthukumaranRamasubramanian3,TakumaUdagawa1,IkshaGurung3,RongZhang1,\nBharathDandala1,RahulRamachandran2,ManilMaskey2,KaylinBugbee2,MikeLittle4,\nElizabethFancher2,LaurenSanders5,SylvainCostes5,SergiBlanco-Cuaresma6,KellyLockhart6,\nThomasAllen6,FelixGrezes6,MeganAnsdell7,AlbertoAccomazzi6,YousefEl-Kurdi1,\nDavisWertheimer1,BirgitPfitzmann1,CesarBerrospiRamis1,MicheleDolfi1,RafaelTeixeiradeLima1,\nPanagiotisVagenas1,S.KarthikMukkavilli1,PeterStaar1,SanazVahidinia7,RyanMcGranaghan8,\nArminMehrabian9,TsendgarLee7\n1IBMResearchAI,2 NASAMFSC,3 UAH,4 Navteca,5 NASAAmes,6 Harvard-SmithsonianCfA,\n7 NASAHQ,8 JPL,9 NASAGSFC\nAbstract generation tasks. Most popular LLMs rely on the\ntransformer architecture (Vaswani et al., 2017)\nLargelanguagemodels(LLMs)trainedongen- and are trained using general-purpose corpora\neral domain corpora showed remarkable re-\nl

In [16]:
# The incoming question is passed through unchanged and also sent to the
# retriever, whose documents are flattened into one context string.
retrieval_inputs = {
    'context': retriever | format_docs,
    'question': RunnablePassthrough(),
}
chain = retrieval_inputs | prompt | model | output_parser

In [17]:
# Display the composed chain to inspect each stage.
chain

{
  context: VectorStoreRetriever(tags=['Chroma', 'CohereEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x162ec4590>)
           | RunnableLambda(format_docs),
  question: RunnablePassthrough()
}
| ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])
| ChatCohere(client=<cohere.client.Client object at 0x162e8e510>, async_client=<cohere.client.AsyncClient object at 0x162ec4b90>, model='command-r', cohere_api_key=SecretStr('**********'))
| StrOutputParser()

In [18]:
# First question, asked on its own: this chain takes only the question string.
chain.invoke('Can you brief me the summary of this research paper in a very plain english which is easy to understand ?')

'The research focuses on developing domain-specific models for use in the science domain, namely INDUS. This collection of encoder-based LLMs aims to improve natural language understanding in various scientific fields. According to the benchmark results, INDUS performs better than comparable models, with strong results also for its smaller version, INDUS SMALL.'

Now I have some follow-up questions about encoder models, LLMs, and INDUS itself — for example, how did INDUS prove to be better than the other models?

For this, it would be ideal if the model knew the conversation history, so that it can work out precisely what each follow-up question refers to and answer it efficiently.

So let's add chat history to this RAG chain.

##### Two things are needed:
##### a) Contextualize the input question based on the chat history
##### b) Add the chat history to the prompt

In [19]:
# System prompt for the question-rewriting step. Adjacent string literals are
# used instead of backslash continuations inside one literal: a backslash-newline
# inside a string joins the lines with no space, which fused words together
# ("whichcould", "withoutthe") in the text sent to the model.
contextualize_q_system_prompt = (
    "Based on the given chat history and the most recent user question which "
    "could reference context in the chat history, create a standalone question "
    "which could be understood without the chat history. Do not answer the "
    "question. Just reformulate it if needed otherwise return it as it is."
)

# Prompt layout: system instruction, then the prior conversation, then the
# latest user input.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ('system', contextualize_q_system_prompt),
        MessagesPlaceholder(variable_name='chat_history'),
        ('human', '{input}'),
    ]
)

# Retriever that is built from the model, the base retriever and the prompt above.
history_aware_retriever = create_history_aware_retriever(
    llm=model,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

In [20]:
# System prompt for the answering step. Built from adjacent string literals
# rather than backslash continuations inside one literal, which had fused
# "answer" + "the" into "answerthe" and glued the {context} placeholder directly
# onto the preceding sentence. The context now starts on its own paragraph.
qa_system_prompt = (
    "You are an assistant for question answering tasks. Use the retrieved context to answer "
    "the user input. If you don't know the answer, just say you do not know.\n\n"
    "{context}"
)

# Prompt layout: system instruction (which carries the {context} slot), then
# the prior conversation, then the latest user input.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ('system', qa_system_prompt),
        MessagesPlaceholder(variable_name='chat_history'),
        ('human', '{input}'),
    ]
)

# Chain that is built from the model and the answering prompt.
question_answer_chain = create_stuff_documents_chain(llm=model, prompt=qa_prompt)

In [21]:
# Combine the history-aware retriever with the answering chain.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)

In [22]:
# Seed the history by hand with one human question and one AI answer.
chat_history = [
    HumanMessage(content='Can you brief me the summary of this research paper in a very plain english which is easy to understand ?'),
    AIMessage(content='This paper introduces INDUS, a family of encoder-based LLMs designed for use in the science domain. It compares their performance with other models and finds that INDUS outperformed them all. The researchers also created smaller versions of the models for resource-constrained applications.'),
]

chat_history

[HumanMessage(content='Can you brief me the summary of this research paper in a very plain english which is easy to understand ?'),
 AIMessage(content='This paper introduces INDUS, a family of encoder-based LLMs designed for use in the science domain. It compares their performance with other models and finds that INDUS outperformed them all. The researchers also created smaller versions of the models for resource-constrained applications.')]

In [23]:
# Ask a follow-up question, passing the conversation so far.
question = "what are encoder models?"
response = rag_chain.invoke({'input': question, 'chat_history': chat_history})

# Record this turn by hand so the next invocation can see it.
chat_history.append(HumanMessage(content=question))
chat_history.append(AIMessage(content=response['answer']))
chat_history

[HumanMessage(content='Can you brief me the summary of this research paper in a very plain english which is easy to understand ?'),
 AIMessage(content='This paper introduces INDUS, a family of encoder-based LLMs designed for use in the science domain. It compares their performance with other models and finds that INDUS outperformed them all. The researchers also created smaller versions of the models for resource-constrained applications.'),
 HumanMessage(content='what are encoder models?'),
 AIMessage(content='Encoder models are a type of neural network that process sequential data such as text, speech, or time series data. They work by taking input data and producing a fixed-size vector representation, known as an embedding, that captures the semantics of the input. This embedding can then be used as a concise summary of the input sequence, enabling various natural language processing tasks.\n\nIn the context of text, an encoder model reads a sentence or a passage and produces a comp

In [24]:
# Another follow-up; the history passed in now includes the previous turn.
question = "what is an encoder based LLM? Tell me about its architecture."
response = rag_chain.invoke({'input': question, 'chat_history': chat_history})

# Append this question/answer pair to the running history in place.
chat_history += [
    HumanMessage(content=question),
    AIMessage(content=response['answer']),
]
chat_history

[HumanMessage(content='Can you brief me the summary of this research paper in a very plain english which is easy to understand ?'),
 AIMessage(content='This paper introduces INDUS, a family of encoder-based LLMs designed for use in the science domain. It compares their performance with other models and finds that INDUS outperformed them all. The researchers also created smaller versions of the models for resource-constrained applications.'),
 HumanMessage(content='what are encoder models?'),
 AIMessage(content='Encoder models are a type of neural network that process sequential data such as text, speech, or time series data. They work by taking input data and producing a fixed-size vector representation, known as an embedding, that captures the semantics of the input. This embedding can then be used as a concise summary of the input sequence, enabling various natural language processing tasks.\n\nIn the context of text, an encoder model reads a sentence or a passage and produces a comp

In [25]:
# Documents returned under the 'context' key of the most recent response.
response['context']

[Document(page_content='transformer architecture (Vaswani et al., 2017)\nLargelanguagemodels(LLMs)trainedongen- and are trained using general-purpose corpora\neral domain corpora showed remarkable re-\nlike Wikipedia or CommonCrawl (Devlin et al.,\nsults on natural language processing (NLP)\n2019; Liu et al., 2019; Lewis et al., 2020; Raffel\ntasks. However, previous research demon-\net al., 2020; Brown et al., 2020; Touvron et al.,\nstratedLLMstrainedusingdomain-focusedcor-\npora perform better on specialized tasks. In- 2023). Althoughthesegeneral-purposemodelsex-\nspired by this pivotal insight, we developed hibitedstrongperformance,thedistributionalshift\nINDUS,acomprehensivesuiteofLLMstailored ofvocabularyledtosub-optimalperformanceon\nfortheEarthscience,biology,physics,helio- domain-specific natural language understanding\nphysics, planetary sciences and astrophysics\nand generation tasks (Beltagy et al., 2019). Fol-\ndomains and trained using curated scientific\nlowing this obser

Now we can see that it is easy to ask our assistant follow-up questions by attaching the conversation's chat history for it to refer to before giving an answer.

One problem with this setup, however, is the manual effort needed to append each question-answer pair to the chat history after every invocation.

To address this, we need to modify the chain so that the chat history is updated automatically after every invocation.

##### Adding message history with RunnableWithMessageHistory to store messages per session
##### For now, the history is kept in memory, in a dictionary that maps each session id to an instance of BaseChatMessageHistory

In [27]:
# In-memory registry of chat histories, keyed by session id.
store = dict()

def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the history stored for `session_id`, creating an empty one on first use."""
    try:
        return store[session_id]
    except KeyError:
        new_history = ChatMessageHistory()
        store[session_id] = new_history
        return new_history

# Wrap the RAG chain with per-session message history. The three keys name
# the dict entries for the user input, the history, and the answer.
conversational_rag_chain = RunnableWithMessageHistory(
    runnable=rag_chain,
    get_session_history=get_session_history,
    input_messages_key='input',
    history_messages_key='chat_history',
    output_messages_key='answer',
)

In [30]:
# Inspect the input schema of the history-wrapped chain.
conversational_rag_chain.input_schema.schema()

{'title': 'RunnableWithChatHistoryInput',
 'type': 'object',
 'properties': {'input': {'title': 'Input',
   'anyOf': [{'type': 'string'},
    {'$ref': '#/definitions/BaseMessage'},
    {'type': 'array', 'items': {'$ref': '#/definitions/BaseMessage'}}]}},
 'required': ['input'],
 'definitions': {'BaseMessage': {'title': 'BaseMessage',
   'description': 'Base abstract Message class.\n\nMessages are the inputs and outputs of ChatModels.',
   'type': 'object',
   'properties': {'content': {'title': 'Content',
     'anyOf': [{'type': 'string'},
      {'type': 'array',
       'items': {'anyOf': [{'type': 'string'}, {'type': 'object'}]}}]},
    'additional_kwargs': {'title': 'Additional Kwargs', 'type': 'object'},
    'response_metadata': {'title': 'Response Metadata', 'type': 'object'},
    'type': {'title': 'Type', 'type': 'string'},
    'name': {'title': 'Name', 'type': 'string'},
    'id': {'title': 'Id', 'type': 'string'}},
   'required': ['content', 'type']}}}

In [31]:
# Inspect the output schema of the history-wrapped chain.
conversational_rag_chain.output_schema.schema()

{'title': 'RunnableSequenceOutput',
 'type': 'object',
 'properties': {'chat_history': {'title': 'Chat History',
   'type': 'array',
   'items': {'anyOf': [{'$ref': '#/definitions/AIMessage'},
     {'$ref': '#/definitions/HumanMessage'},
     {'$ref': '#/definitions/ChatMessage'},
     {'$ref': '#/definitions/SystemMessage'},
     {'$ref': '#/definitions/FunctionMessage'},
     {'$ref': '#/definitions/ToolMessage'}]}},
  'input': {'title': 'Input', 'type': 'string'},
  'answer': {'title': 'Answer', 'type': 'string'}},
 'definitions': {'ToolCall': {'title': 'ToolCall',
   'type': 'object',
   'properties': {'name': {'title': 'Name', 'type': 'string'},
    'args': {'title': 'Args', 'type': 'object'},
    'id': {'title': 'Id', 'type': 'string'}},
   'required': ['name', 'args', 'id']},
  'InvalidToolCall': {'title': 'InvalidToolCall',
   'type': 'object',
   'properties': {'name': {'title': 'Name', 'type': 'string'},
    'args': {'title': 'Args', 'type': 'string'},
    'id': {'title': 'Id

In [None]:
# Invoke the history-wrapped chain. The original cell left 'session_id'
# without a value, which is a SyntaxError; any string id works here, and
# reusing the same id on later calls continues the same conversation.
conversational_rag_chain.invoke(
    {'input': 'Summarize the research paper for me.'},
    config={
        'configurable': {
            'session_id': 'demo-session'
        }
    }
)