In [1]:
from langchain_community.document_loaders import PyPDFLoader
### Construct retriever ###
# Point a PDF loader at the source file (path is relative to the working
# directory) and read it into `docs`.
loader = PyPDFLoader(file_path="attention.pdf")
docs = loader.load()

In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Break the loaded documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=200) for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(documents=docs)

In [4]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
# Build a Chroma vector store from the chunks, using Ollama embeddings with
# their default settings, then expose the store as a retriever.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OllamaEmbeddings(),
)
retriever = vectorstore.as_retriever()

In [5]:
from langchain_community.llms import Ollama
# Language model shared by the question-rewriting and answering chains.
llm = Ollama(model="llama2")

In [6]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
### Contextualize question ###
# System instruction for the question-rewriting step. The fragments below are
# adjacent string literals, so they concatenate into one single-line string.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, formulate a standalone question "
    "which can be understood without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)
# Prompt for the rewriting step, in message order: the system instruction,
# the messages supplied under "chat_history", then the latest user input.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
    ]
)

In [7]:
from langchain.chains import create_history_aware_retriever
# Combine the LLM, the base retriever and the question-rewriting prompt into
# a retriever that takes the chat history into account.
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

In [8]:
from langchain.chains.combine_documents import create_stuff_documents_chain
### Answer question ###
# System instruction for the answering step. The instruction sentences form a
# single line; exactly one newline separates them from the `{context}`
# placeholder.
qa_system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n{context}"
)
# Prompt for the answering step, in message order: the system instruction
# (which carries the `{context}` placeholder), the messages supplied under
# "chat_history", then the user's input.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
    ]
)

# Document-combining chain built from the LLM and the answering prompt.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

In [9]:
from langchain.chains import create_retrieval_chain
# Full RAG pipeline: the history-aware retriever feeding the answering chain.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)

In [10]:
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory

def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history registered for ``session_id``.

    Histories live in the module-level ``store`` dict. If the session has no
    entry yet, a new empty ``ChatMessageHistory`` is created, saved under
    ``session_id`` and returned.

    Args:
        session_id: Key identifying the conversation in ``store``.

    Returns:
        The history object stored for that session.
    """
    try:
        return store[session_id]
    except KeyError:
        # First time this session is seen: register a fresh, empty history.
        history = ChatMessageHistory()
        store[session_id] = history
        return history

In [11]:
### Statefully manage chat history ###
# Registry of chat histories keyed by session id; read and populated by
# `get_session_history`. Starts empty.
store = dict()

In [12]:
from langchain_core.runnables.history import RunnableWithMessageHistory

# Wrap the RAG chain with per-session message history.
conversational_rag_chain = RunnableWithMessageHistory(
    runnable=rag_chain,
    get_session_history=get_session_history,
    # Keys used to connect the chain's inputs/outputs to the history:
    input_messages_key="input",  # the user's message in the input dict
    history_messages_key="chat_history",  # matches the prompts' placeholder name
    output_messages_key="answer",  # the key read from the result by the cells below
)

In [13]:
# First turn of session "abc123"; this constructs the key "abc123" in `store`.
# The cell displays only the "answer" entry of the result.
conversational_rag_chain.invoke(
    {"input": "What is Self-Attention?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

Parent run b8ac40bb-2010-4178-8b98-6bc26eb16ef3 not found for run 168b13b0-35a2-4cd9-a02d-6773549b06ea. Treating as a root run.


'Self-attention is a mechanism in neural networks that allows them to focus on specific parts of the input when processing it. It was introduced in the Transformer architecture, which is a simple network architecture based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Self-attention allows the model to weight different parts of the input equally, rather than relying on a fixed context or recurrence. This allows the model to capture long-range dependencies in the input and to handle variable-length inputs. In the context of machine translation, self-attention can help the model to better understand the relationships between words in a sentence and to generate more accurate translations.'

In [14]:
# Follow-up in the same session: "it" is not defined in this question, so the
# answer relies on the history stored for "abc123".
conversational_rag_chain.invoke(
    {"input": "What are common ways of doing it?"},
    config={
        "configurable": {"session_id": "abc123"},
    },
)["answer"]

Parent run 960da9fe-e2a6-477b-a12a-25e1ab22e861 not found for run c90e00a9-52aa-4c22-a58c-2f2d39f0e1d1. Treating as a root run.


'There are several common ways to do self-attention in neural networks:\n\n1. Multi-head attention: This is a variation of self-attention where the input is split into multiple attention heads, each with its own weight matrix. The outputs of these heads are then combined to form the final output.\n2. Attention masking: This involves adding a mask to the input that indicates which parts of the input should be attended to and which should not.\n3. Positional encoding: This involves adding a fixed vector to each input element based on its position in the sequence. This allows the model to differentiate between elements in the sequence even if they have the same content.\n4. Layer normalization: This involves normalizing the activations of each layer in the network, which can help the model to focus on the relevant parts of the input.\n5. Bidirectional encoding: This involves processing the input sequence in both the forward and backward directions, allowing the model to capture both local

In [15]:
# Third turn in session "abc123": ask about the conversation itself to check
# that earlier turns are available to the chain.
conversational_rag_chain.invoke(
    {"input": "What am I primarily discussing about?"},
    config={
        "configurable": {"session_id": "abc123"},
    },
)["answer"]

Parent run 718c30be-26f7-408c-93df-ee87275250e8 not found for run cd495eea-1375-4baa-9fee-b114f32f36fb. Treating as a root run.


'You are primarily discussing self-attention in neural networks, specifically in the context of machine translation.'

In [19]:
# Display the message history accumulated for session "abc123".
# Note: for a session id not yet in `store`, this call would create an empty entry.
get_session_history("abc123")

InMemoryChatMessageHistory(messages=[HumanMessage(content='What is Self-Attention?'), AIMessage(content='Self-attention is a mechanism in neural networks that allows them to focus on specific parts of the input when processing it. It was introduced in the Transformer architecture, which is a simple network architecture based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Self-attention allows the model to weight different parts of the input equally, rather than relying on a fixed context or recurrence. This allows the model to capture long-range dependencies in the input and to handle variable-length inputs. In the context of machine translation, self-attention can help the model to better understand the relationships between words in a sentence and to generate more accurate translations.'), HumanMessage(content='What are common ways of doing it?'), AIMessage(content='There are several common ways to do self-attention in neural networks:\n\n1. Mul