In [1]:
import os
import json

from langchain_core.messages import HumanMessage, SystemMessage

from langchain_groq.chat_models import ChatGroq
from langchain_ollama import ChatOllama

from dotenv import load_dotenv, find_dotenv

# Load the API keys from .env
load_dotenv(find_dotenv(), override=True)


from src.vectordb.create_vectordb import PinconeVectorDb

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Pick the chat backend from the USE_GROQ environment variable:
# USE_GROQ=no selects the local Ollama model; any other value (or an unset
# variable) selects Groq.
# The comparison has to be applied to the value returned by os.getenv. The
# earlier form, os.getenv("USE_GROQ" == "no"), compared the two string literals
# first and handed the resulting boolean to os.getenv as the variable name, so
# the USE_GROQ setting was never actually read.
if os.getenv("USE_GROQ") == "no":
    # Look the model name up once; both Ollama clients share it.
    ollama_model = os.getenv('OLLAMA_CHAT_MODEL')

    # Free-text client.
    llm = ChatOllama(model=ollama_model, temperature=0.)

    # Same model, with JSON output requested via format="json".
    llm_json = ChatOllama(model=ollama_model, temperature=0., format="json")
else:
    llm = ChatGroq(
        model="llama-3.1-70b-versatile",
        stop_sequences="[end]",
        temperature=0.,
    )
    # NOTE(review): this client is configured identically to `llm`; unlike the
    # Ollama branch, no JSON output format is requested here. Confirm that the
    # prompts alone are enough for the json.loads calls made on its replies.
    llm_json = ChatGroq(
        model="llama-3.1-70b-versatile",
        stop_sequences="[end]",
        temperature=0.,
    )

In [3]:
from src.utils.prompts import router_instructions


def _route_query(user_query):
    """Send one user query to `llm_json` behind the router system prompt.

    Returns the raw response object from `llm_json.invoke`.
    """
    router_messages = [
        SystemMessage(content=router_instructions),
        HumanMessage(content=user_query),
    ]
    return llm_json.invoke(router_messages)


# Three sample queries, issued in a fixed order. The variable names record the
# route each one is expected to take (vector store vs. web search).
test_vectorstore_llm_json = _route_query("what is rag?")
test_web_search = _route_query("When were olymics 2024 held?")
test_vectorstore_2 = _route_query("Retrieval augmented generation?")

# Show the router's reply for each query, one per line.
for routed in (test_vectorstore_llm_json, test_web_search, test_vectorstore_2):
    print(routed.content)

{"datasource": "vectorstore"}
{"datasource": "websearch"}
{"datasource": "vectorstore"}


In [4]:
from src.vectordb.create_vectordb import PinconeVectorDb, get_pinecone_retriever

# Instantiate the project's Pinecone helper and call its index-creation method.
# On the recorded run this call reported that the index already existed and
# then printed the index statistics (see the cell output).
# NOTE(review): presumably a no-op when the index exists — confirm in
# src/vectordb/create_vectordb.py.
pc = PinconeVectorDb()
pc.create_pinecone_index()

# Build the retriever used by the following cells. The bare expression on the
# last line displays its repr as the cell output (the recorded repr shows
# search_type='mmr' and search_kwargs={'k': 5}).
retriever = get_pinecone_retriever()
retriever

Index local-rag-agent already exists.
Index Stats:
{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 334}},
 'total_vector_count': 334}


VectorStoreRetriever(tags=['PineconeVectorStore', 'HuggingFaceEmbeddings'], vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x000001E693F31330>, search_type='mmr', search_kwargs={'k': 5})

In [5]:
# Sample query used to exercise the retriever.
question = "What is rag?"
docs = retriever.invoke(question)

# Print the number of hits, an empty line, then the raw list of documents.
print(len(docs), "", docs, sep="\n")

5

[Document(metadata={'Authors': 'Daniel Fleischer, Moshe Berchansky, Moshe Wasserblat, Peter Izsak', 'Published': '2024-08-05', 'Summary': 'Implementing Retrieval-Augmented Generation (RAG) systems is inherently\ncomplex, requiring deep understanding of data, use cases, and intricate design\ndecisions. Additionally, evaluating these systems presents significant\nchallenges, necessitating assessment of both retrieval accuracy and generative\nquality through a multi-faceted approach. We introduce RAG Foundry, an\nopen-source framework for augmenting large language models for RAG use cases.\nRAG Foundry integrates data creation, training, inference and evaluation into a\nsingle workflow, facilitating the creation of data-augmented datasets for\ntraining and evaluating large language models in RAG settings. This integration\nenables rapid prototyping and experimentation with various RAG techniques,\nallowing users to easily generate datasets and train RAG models using internal\nor specia

In [6]:
from langchain_core.documents import Document
# The parameter annotation is quoted so it is not evaluated when the function
# is defined; it documents that a list of documents is expected, not a single
# Document (the body iterates over `docs`).
def format_docs(docs: "list[Document]") -> str:
    """Concatenate the text of several documents into one string.

    Args:
        docs: The documents to merge, for example the list returned by
            ``retriever.invoke``. Every item must expose a ``page_content``
            string attribute.

    Returns:
        The ``page_content`` of each document, in input order, separated by
        a blank line. An empty input yields an empty string.
    """
    return "\n\n".join(doc.page_content for doc in docs)

# Merge the page contents of all retrieved documents into one string and print
# it; `doc_text` is reused when the document-grader prompt is formatted.
doc_text = format_docs(docs)
print(doc_text)

butions and any other form of processing needed
for a given evaluation.
See listing 4 for a configuration example; it con-
tains an answer processor that extracts an answer
from an output, and a list of metrics to run.
4
Experiments: RAG Tuning
To illustrate the usage and usefulness of the
RAG FOUNDRY library, we experiment with sev-
eral possible RAG improvements to LLMs, and
evaluate the results on three knowledge-intensive
tasks.
4.1
RAG Augmentation Techniques

RAG
0.876
0.821
0.836
0.294
0.685
0.895
0.530
0.281
-
-
RAG-sft
0.878
0.777
0.750
0.252
0.717
0.833
0.720
0.491
-
-
CoT
0.923
0.555
0.741
0.367
0.263
0.826
0.574
0.439
0.477
0.705
CoT-sft
0.795
0.793
0.749
0.386
0.749
0.839
0.620
0.458
0.631
0.853
Llama-3 8B
Baseline
0.722
-
-
0.200
-
-
0.560
0.366
-
-
RAG
0.828
0.783
0.746
0.285
0.610
0.861
0.556
0.398
-
-
RAG-sft
0.916
0.704
0.714
0.291
0.653
0.854
0.770
0.537
-
-
CoT
0.896
0.518
0.764
0.395
0.536
0.730
0.684
0.480
0.378
0.732
CoT-sft
0.851
0.808
0.697

able at https://git

In [7]:
from src.utils.prompts import doc_grader_instructions, doc_grader_prompt

# Keyword values handed to the grader template's .format call.
grader_fields = {"document": doc_text, "question": question}

doc_grader_prompt_formatted = doc_grader_prompt.format(**grader_fields)

# Print the filled-in prompt so it can be inspected before it is sent.
print(doc_grader_prompt_formatted)


Here is the document: butions and any other form of processing needed
for a given evaluation.
See listing 4 for a configuration example; it con-
tains an answer processor that extracts an answer
from an output, and a list of metrics to run.
4
Experiments: RAG Tuning
To illustrate the usage and usefulness of the
RAG FOUNDRY library, we experiment with sev-
eral possible RAG improvements to LLMs, and
evaluate the results on three knowledge-intensive
tasks.
4.1
RAG Augmentation Techniques

RAG
0.876
0.821
0.836
0.294
0.685
0.895
0.530
0.281
-
-
RAG-sft
0.878
0.777
0.750
0.252
0.717
0.833
0.720
0.491
-
-
CoT
0.923
0.555
0.741
0.367
0.263
0.826
0.574
0.439
0.477
0.705
CoT-sft
0.795
0.793
0.749
0.386
0.749
0.839
0.620
0.458
0.631
0.853
Llama-3 8B
Baseline
0.722
-
-
0.200
-
-
0.560
0.366
-
-
RAG
0.828
0.783
0.746
0.285
0.610
0.861
0.556
0.398
-
-
RAG-sft
0.916
0.704
0.714
0.291
0.653
0.854
0.770
0.537
-
-
CoT
0.896
0.518
0.764
0.395
0.536
0.730
0.684
0.480
0.378
0.732
CoT-sft
0.851
0.808
0.6

In [8]:
# System instructions first, then the formatted grading prompt as the user turn.
grader_messages = [
    SystemMessage(content=doc_grader_instructions),
    HumanMessage(content=doc_grader_prompt_formatted),
]
result = llm_json.invoke(grader_messages)

# The reply text is parsed as JSON before printing, so a malformed reply
# raises here rather than being printed verbatim.
grade = json.loads(result.content)
print(grade)

{'binary_answer': 'yes'}


In [9]:
from src.utils.prompts import rag_instructions

# Retrieve documents for the sample question.
question = "What is rag?"
docs = retriever.invoke(question)

# Fill the RAG template with the merged document text and the question.
context_text = format_docs(docs)
rag_prompt = rag_instructions.format(context=context_text, question=question)

# The whole prompt is sent as a single human message to the free-text model.
rag_messages = [HumanMessage(content=rag_prompt)]
result = llm.invoke(rag_messages)

print(result.content)

RAG stands for Retrieval-Augmented Generation, a natural language processing technique that enhances text generation by integrating information retrieved from a large corpus of documents. It produces accurate and contextually relevant outputs with augmented external knowledge. RAG was first introduced in 2022 and has been widely used in various scenarios to improve the quality of generated texts.


In [10]:
from src.utils.prompts import hallucination_grader_instructions, hallucination_grader_prompt

# `result` is read here as the answer to be graded and `docs` as the documents
# it should be grounded in; both come from earlier cells.
hallucination_grader_prompt_formatted = hallucination_grader_prompt.format(
    documents=format_docs(docs), answer=result.content
)

# Store the grader's reply under its own name instead of re-binding `result`.
# This cell reads `result` as its input, so overwriting it made the cell
# non-idempotent: a second run would grade the grader's own JSON reply rather
# than the generated answer.
hallucination_grade = llm_json.invoke(
    [SystemMessage(content=hallucination_grader_instructions)]
    + [HumanMessage(content=hallucination_grader_prompt_formatted)]
)

# Bare last expression: the parsed JSON verdict is displayed as the cell output.
json.loads(hallucination_grade.content)

{'binary_answer': 'YES',
 'explanation': 'The student answer is grounded in facts as it correctly states that RAG stands for Retrieval-Augmented Generation, a natural language processing technique that enhances text generation by integrating information retrieved from a large corpus of documents. The statement also mentions that RAG produces accurate and contextually relevant outputs with augmented external knowledge. However, the student answer lacks specific details about the introduction year (it was actually introduced in 2022) and does not provide any references to the original sources mentioned in the text.'}

In [11]:
from src.utils.prompts import answer_grader_instructions, answer_grader_prompt

# Hand-written question/answer pair used to exercise the answer grader.
question = "what is rag?"
answer = """
RAG stands for Retrieval-Augmented Generation, a machine learning approach that combines retrieval and generation techniques, primarily used in natural language processing (NLP). 
It is particularly effective for tasks like question answering, summarization, and generating responses based on external knowledge.
"""

answer_grader_prompt_formatted = answer_grader_prompt.format(
    generated_response=answer,
    question=question,
)

# System instructions first, then the formatted grading prompt as the user turn.
answer_grader_messages = [
    SystemMessage(content=answer_grader_instructions),
    HumanMessage(content=answer_grader_prompt_formatted),
]
result = llm_json.invoke(answer_grader_messages)

# Bare last expression: the parsed JSON verdict is displayed as the cell output.
json.loads(result.content)

{'binary_answer': 'yes',
 'explanation': 'The Student Answer meets all the criteria because it provides a clear and concise definition of RAG, explaining its purpose and application in natural language processing. The answer also addresses the question directly, providing relevant information about what RAG stands for.'}