In [1]:
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai.chat_models import ChatOpenAI
from chromadb.api.models.Collection import Collection
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai.chat_models import ChatOpenAI
from langchain.vectorstores.chroma import Chroma
from langchain.document_loaders.pdf import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import VectorStoreRetriever
from langchain.memory import ConversationBufferMemory

In [2]:
def create_vector_database_from_pdf(
    pdf_path: str,
    *,
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    k: int = 4,
    embedding_model: str = "text-embedding-ada-002",
) -> "VectorStoreRetriever":
    """Index a PDF in an in-memory Chroma vector store and return a retriever over it.

    The PDF is loaded with PyPDFLoader, split into overlapping chunks, embedded
    with OpenAI embeddings and stored in a Chroma collection that is not
    persisted by this function.

    Args:
        pdf_path: Path to the PDF to convert into a vector database.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Length shared by consecutive chunks; must not exceed
            ``chunk_size``.
        k: Value passed as ``search_kwargs["k"]`` to the retriever.
        embedding_model: Name of the OpenAI embedding model.

    Returns:
        The retriever produced by ``vector_store.as_retriever``.

    Raises:
        ValueError: If ``chunk_size`` or ``k`` is not positive, or if
            ``chunk_overlap`` is negative or larger than ``chunk_size``.
    """
    # Validate up front so a bad configuration fails before any file read or
    # embedding request is made.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if chunk_overlap < 0 or chunk_overlap > chunk_size:
        raise ValueError(
            f"chunk_overlap must be between 0 and chunk_size ({chunk_size}), got {chunk_overlap}"
        )
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    # Load the PDF and split it into overlapping chunks
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    documents = text_splitter.split_documents(pages)

    # Create the vector store with OpenAI embeddings
    embedding_openai = OpenAIEmbeddings(model=embedding_model)
    vector_store = Chroma.from_documents(
        documents=documents,
        embedding=embedding_openai,
    )

    return vector_store.as_retriever(search_kwargs={"k": k})

# Index the sample paper once; the resulting retriever is reused by the chains in this notebook.
pdf_path = "./src/frontend/tmp/atenttion_is_all_you_need.pdf"
retriever = create_vector_database_from_pdf(pdf_path)

In [24]:
# Prompt for the "stuff" chain: the retrieved chunks fill {context} and the
# user's query fills {question}.
template = """
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know.
Don't try to make up an answer. The context could be in any language, but you will always answer in the language of the user's question.
{context}

Question: {question}
Answer:
"""

# Chat model with temperature 0 and streaming enabled.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.0, streaming=True)

qa_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# RetrievalQA chain that stuffs the retrieved documents into the prompt and
# also returns them alongside the answer.
chain = RetrievalQA.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": qa_prompt},
)

english_query = {"query": "What are transformers?"}
response = chain.invoke(english_query)

print(response)

{'query': 'What are transformers?', 'result': 'Transformers are a type of sequence transduction model that rely entirely on self-attention to compute representations of input and output without using sequence-aligned RNNs or convolution. They replace the recurrent layers commonly used in encoder-decoder architectures with multi-headed self-attention, allowing for faster training on translation tasks compared to architectures based on recurrent or convolutional layers.', 'source_documents': [Document(page_content='The Transformer allows for significantly more parallelization and can reach a new state of the art in\ntranslation quality after being trained for as little as twelve hours on eight P100 GPUs.\n2 Background\nThe goal of reducing sequential computation also forms the foundation of the Extended Neural GPU\n[16], ByteNet [ 18] and ConvS2S [ 9], all of which use convolutional neural networks as basic building', metadata={'page': 1, 'source': './src/frontend/tmp/atenttion_is_all_yo

In [21]:
# Same question in Spanish: the prompt asks the model to answer in the language of the question.
spanish_query = {"query": "Que son transformers?"}
response = chain.invoke(spanish_query)
print(response)

{'query': 'Que son transformers?', 'result': 'Los transformers son un tipo de modelo de transducción de secuencias basado completamente en atención, que reemplaza las capas recurrentes más comúnmente utilizadas en arquitecturas codificador-decodificador con auto-atención multi-cabeza.', 'source_documents': [Document(page_content='The Transformer allows for significantly more parallelization and can reach a new state of the art in\ntranslation quality after being trained for as little as twelve hours on eight P100 GPUs.\n2 Background\nThe goal of reducing sequential computation also forms the foundation of the Extended Neural GPU\n[16], ByteNet [ 18] and ConvS2S [ 9], all of which use convolutional neural networks as basic building', metadata={'page': 1, 'source': './src/frontend/tmp/atenttion_is_all_you_need.pdf'}), Document(page_content='language modeling tasks [34].\nTo the best of our knowledge, however, the Transformer is the first transduction model relying\nentirely on self-atten

In [16]:
# Inspect the prompt used by the combine-documents ("stuff") step.
# A chat prompt exposes its parts through `.messages`; a plain PromptTemplate
# (which is what this notebook passes through chain_type_kwargs) has no such
# attribute, so fall back to showing the prompt object itself instead of
# raising AttributeError.
stuff_prompt = chain.combine_documents_chain.llm_chain.prompt
getattr(stuff_prompt, "messages", stuff_prompt)

[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="Use the following pieces of context to answer the user's question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n{context}")),
 HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='{question}'))]

In [20]:
from src.frontend._templates import template_customer_service

# Second chat model with the same settings as `llm`.
# NOTE(review): the chain below is built with `llm`, not `llm_question`;
# `llm_question` is only passed as `condense_question_llm` when `qa` is
# created later in the notebook. Confirm this is intended.
llm_question = ChatOpenAI(
    model_name = "gpt-3.5-turbo",
    temperature = 0.0,
    streaming=True
)

# Rebuild the RetrievalQA chain with the customer-service prompt
chain = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    return_source_documents=True,
    chain_type ="stuff",
    chain_type_kwargs = {
        "prompt": PromptTemplate(
            template=template_customer_service,
            input_variables=["context", "question"],
        ),
    },
)

# Use `invoke` (as the earlier cells do) rather than calling the chain
# directly: `Chain.__call__` is deprecated in LangChain.
response = chain.invoke({"query": "What are transformers?"})

print(response)

{'query': 'What are transformers?', 'result': 'Transformers are a type of sequence transduction model that rely entirely on self-attention to compute representations of input and output without using sequence-aligned RNNs or convolution. They replace the recurrent layers commonly used in encoder-decoder architectures with multi-headed self-attention, allowing for faster training on translation tasks compared to architectures based on recurrent or convolutional layers.', 'source_documents': [Document(page_content='The Transformer allows for significantly more parallelization and can reach a new state of the art in\ntranslation quality after being trained for as little as twelve hours on eight P100 GPUs.\n2 Background\nThe goal of reducing sequential computation also forms the foundation of the Extended Neural GPU\n[16], ByteNet [ 18] and ConvS2S [ 9], all of which use convolutional neural networks as basic building', metadata={'page': 1, 'source': './src/frontend/tmp/atenttion_is_all_yo

In [4]:
# Small-talk query for the customer-service chain. Use `invoke` rather than
# the deprecated direct call `chain(...)`, matching the other query cells.
response = chain.invoke({"query": "Hi, who are you?"})

print(response)

{'query': 'Hi, who are you?', 'result': 'Hello! I am a Customer Service Agent from HuggingFace. How can I assist you today?', 'source_documents': [Document(page_content='Zhou, and Yoshua Bengio. A structured self-attentive sentence embedding. arXiv preprint\narXiv:1703.03130 , 2017.\n[23] Minh-Thang Luong, Quoc V . Le, Ilya Sutskever, Oriol Vinyals, and Lukasz Kaiser. Multi-task\nsequence to sequence learning. arXiv preprint arXiv:1511.06114 , 2015.\n[24] Minh-Thang Luong, Hieu Pham, and Christopher D Manning. Effective approaches to attention-\nbased neural machine translation. arXiv preprint arXiv:1508.04025 , 2015.\n11', metadata={'page': 10, 'source': './src/frontend/tmp/atenttion_is_all_you_need.pdf'}), Document(page_content='[37] Vinyals & Kaiser, Koo, Petrov, Sutskever, and Hinton. Grammar as a foreign language. In\nAdvances in Neural Information Processing Systems , 2015.\n[38] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang\nMacherey, Maxim Kriku

In [22]:
# Conversation buffer for the conversational chain: history is exposed under
# "chat_history" as message objects, and the chain's "answer" output is what
# gets stored.
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
    output_key="answer",
)

In [4]:
# Prompt that condenses the chat history plus a follow-up message into a
# single stand-alone question. It expects {chat_history} and {question}.
custom_template = (
    "Given the following conversation and a follow-up message, "
    "rephrase the follow-up message to a stand-alone question or instruction that "
    "represents the user's intent, add all context needed if necessary to generate a complete and "
    "unambiguous question or instruction, only based on the history, don't make up messages. "
    "Maintain the same language as the follow up input message\n"
    "\n"
    "Chat History:\n"
    "{chat_history}\n"
    "\n"
    "Follow Up Input: {question}\n"
    "Standalone question or instruction:"
)

In [17]:
# Build a conversational retrieval chain and echo it as the cell output.
# `llm` is used both for answering and for condensing the follow-up question.
# The custom condense prompt is intentionally left disabled here:
#   condense_question_prompt=PromptTemplate.from_template(custom_template)
conversational_chain_kwargs = dict(
    llm=llm,
    retriever=retriever,
    memory=memory,
    condense_question_llm=llm,
    return_generated_question=False,
    verbose=True,
    get_chat_history=lambda history: history,  # hand the history through untouched
)
ConversationalRetrievalChain.from_llm(**conversational_chain_kwargs)

ConversationalRetrievalChain(memory=ConversationBufferMemory(output_key='answer', return_messages=True, memory_key='chat_history'), combine_docs_chain=StuffDocumentsChain(verbose=True, llm_chain=LLMChain(verbose=True, prompt=ChatPromptTemplate(input_variables=['context', 'question'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='You are a Customer Service Agent from the HuggingFace company and solve questions about the context provided. You can answer common questions\nas how are you and things that people use in normal conversations, but when you receive questions about products, services or something else you have \nto answer only based in the context provided, so you can introduce your self as Customer Service Agent from the HuggingFace company and only act like him.\nUse the following pieces of context to answer the user\'s question. \nIf you don\'t know the answer, just say that you don\'t know, don\'t try to make up

In [32]:
from langchain_core.language_models.base import BaseLanguageModel

Help on class ConversationalRetrievalChain in module langchain.chains.conversational_retrieval.base:

class ConversationalRetrievalChain(BaseConversationalRetrievalChain)
 |  ConversationalRetrievalChain(*, name: Optional[str] = None, memory: Optional[langchain_core.memory.BaseMemory] = None, callbacks: Union[List[langchain_core.callbacks.base.BaseCallbackHandler], langchain_core.callbacks.base.BaseCallbackManager, NoneType] = None, verbose: bool = None, tags: Optional[List[str]] = None, metadata: Optional[Dict[str, Any]] = None, callback_manager: Optional[langchain_core.callbacks.base.BaseCallbackManager] = None, combine_docs_chain: langchain.chains.combine_documents.base.BaseCombineDocumentsChain, question_generator: langchain.chains.llm.LLMChain, output_key: str = 'answer', rephrase_question: bool = True, return_source_documents: bool = False, return_generated_question: bool = False, get_chat_history: Optional[Callable[[List[Union[Tuple[str, str], langchain_core.messages.base.BaseMe

In [30]:
# Echo the class object so the cell output shows which module ChatOpenAI resolves to.
ChatOpenAI

langchain_openai.chat_models.base.ChatOpenAI

In [23]:
def _passthrough_chat_history(history):
    """Return the chat history exactly as the memory provides it."""
    return history


# Conversational retrieval chain used for the rest of the notebook: `llm`
# answers, `llm_question` condenses follow-ups, and `memory` stores the turns.
# The custom condense prompt is left disabled:
#   condense_question_prompt=PromptTemplate.from_template(custom_template)
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    condense_question_llm=llm_question,
    get_chat_history=_passthrough_chat_history,
    return_generated_question=False,
    verbose=True,
)

In [5]:
from langchain_openai.llms import OpenAI
from langchain.chains import LLMChain

In [69]:
from langchain.chains import StuffDocumentsChain

# Manual assembly of a "stuff" documents chain from its parts.
# NOTE(review): this cell rebinds `llm` (a ChatOpenAI in the earlier cells) to
# a completion-style OpenAI model; re-running earlier cells afterwards would
# pick up this object instead. Confirm the rebinding is intended.

# Name of the prompt variable that receives the concatenated documents.
document_variable_name = "context"

# Each document is rendered as its raw page content.
document_prompt = PromptTemplate(
    template="{page_content}",
    input_variables=["page_content"],
)

# The LLM prompt must take `document_variable_name` as an input variable.
prompt = PromptTemplate.from_template(
    "Use the following pieces of context to answer the user's question. \n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
    "----------------\n"
    "{context}"
)

llm = OpenAI()
llm_chain = LLMChain(prompt=prompt, llm=llm)

combine_docs_chain = StuffDocumentsChain(
    document_variable_name=document_variable_name,
    document_prompt=document_prompt,
    llm_chain=llm_chain,
)



In [46]:
# Echo the chain that combines the retrieved documents into the answering prompt.
qa.combine_docs_chain

StuffDocumentsChain(verbose=True, llm_chain=LLMChain(verbose=True, prompt=ChatPromptTemplate(input_variables=['context', 'question'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="Use the following pieces of context to answer the user's question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n{context}")), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='{question}'))]), llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x7f8b1dcf9850>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x7f8b1dcf9890>, temperature=0.0, openai_api_key=SecretStr('**********'), openai_proxy='', streaming=True)), document_variable_name='context')

In [29]:
# Echo the prompt used by the question-condensing step of `qa`.
# NOTE(review): the saved output shows a customised prompt although the cell
# that builds `qa` has `condense_question_prompt` commented out; the output is
# presumably from an earlier run. Re-run to confirm.
qa.question_generator.prompt

PromptTemplate(input_variables=['chat_history', 'question'], template="Given the following conversation and a follow-up message, rephrase the follow-up message to a stand-alone question or instruction that represents the user's intent, add all context needed if necessary to generate a complete and unambiguous question or instruction, only based on the history, don't make up messages. Maintain the same language as the follow up input message. You are a Customer Service Agent from the HuggingFace company and solve questions about the context provided. You can answer common questions\nas how are you and things that people use in normal conversations, but when you receive questions about products, services or something else you have \nto answer only based in the context provided, so you can introduce your self as Customer Service Agent from the HuggingFace company.\n\nChat History:\n{chat_history}\n\nFollow Up Input: {question}\nStandalone question or instruction:")

In [10]:
# System prompt for the answering step: customer-service persona followed by
# the retrieved {context} and the user's {question}.
# The stray double quote that followed "Helpful Answer:" (left over from
# pasting a repr) has been removed so it no longer appears in the prompt.
combine_docs_template = (
    "You are a Customer Service Agent from the HuggingFace company and solve questions about the context provided. You can answer common questions\n"
    "as how are you and things that people use in normal conversations, but when you receive questions about products, services or something else you have \n"
    "to answer only based in the context provided, so you can introduce your self as Customer Service Agent from the HuggingFace company and only act like him.\n"
    "Use the following pieces of context to answer the user's question. \n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
    "----------------\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "Helpful Answer:\n"
)

In [None]:
# List the message templates of the answering step's chat prompt.
qa.combine_docs_chain.llm_chain.prompt.messages

In [11]:
# Demonstration: the answering prompt of `qa` is a ChatPromptTemplate, which
# has no `template` field, so this assignment is rejected. Catch and display
# the error so the notebook can still run top to bottom.
try:
    qa.combine_docs_chain.llm_chain.prompt.template = combine_docs_template
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: "ChatPromptTemplate" object has no field "template"

In [24]:
# Swap the first (system) message of the answering chat prompt for the
# customer-service template.
customer_service_system_message = SystemMessagePromptTemplate.from_template(combine_docs_template)
answer_prompt = qa.combine_docs_chain.llm_chain.prompt
answer_prompt.messages[0] = customer_service_system_message

In [25]:
# Send a greeting through the conversational chain; verbose mode prints the formatted prompts.
greeting_input = {"question": "Hi, who are you?"}
results = qa.invoke(greeting_input)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are a Customer Service Agent from the HuggingFace company and solve questions about the context provided. You can answer common questions
as how are you and things that people use in normal conversations, but when you receive questions about products, services or something else you have 
to answer only based in the context provided, so you can introduce your self as Customer Service Agent from the HuggingFace company and only act like him.
Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------

Zhou, and Yoshua Bengio. A structured self-attentive sentence embedding. arXiv preprint
arXiv:1703.03130 , 2017.
[23] Minh-Thang Luong, Quoc V . Le, Ilya Sutskever, Oriol Vinyals, and Lukasz Kaiser. Multi-task
sequence to sequence 

In [15]:
# Show only the generated answer from the chain's result dict.
results["answer"]

'I am a Customer Service Agent from the HuggingFace company. How can I assist you today?'