# RAGAS Evals

In [1]:
import os
from getpass import getpass
os.environ["OPENAI_API_KEY"] = getpass("Please enter your OpenAI API key!")

In [2]:
from langchain_community.document_loaders import YoutubeLoader

# Load transcript from video
video_url = "https://www.youtube.com/watch?v=BaTjJJsz0rY"
loader = YoutubeLoader.from_youtube_url(video_url)
docs = loader.load()

print(f"Loaded {len(docs)} document(s)")
print(f"First document preview: {docs[0].page_content[:200]}...")


Loaded 1 document(s)
First document preview: okay whiz we're talking about reasoning in latent space today is that the same as test time compute yeah that's right nice nice okay and we've got two big ideas to cover that are aimed at scaling the ...


In [3]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from ragas.testset import TestsetGenerator

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(docs, testset_size=10)

Generating personas: 100%|██████████| 1/1 [00:00<00:00,  1.50it/s]                                           
Generating Scenarios: 100%|██████████| 2/2 [00:03<00:00,  1.94s/it]
Generating Samples: 100%|██████████| 10/10 [00:04<00:00,  2.44it/s]


In [5]:
dataset.to_pandas()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What are the key concepts related to GPT in th...,[okay whiz we're talking about reasoning in la...,In the context of reasoning in continuous late...,single_hop_specifc_query_synthesizer
1,What is the scaling hypothesis in the context ...,[kind of doing this compression okay we're tak...,The scaling hypothesis suggests that going tok...,single_hop_specifc_query_synthesizer
2,How did artificial neural networks evolve in t...,[like when you when you when you're trying to ...,Artificial neural networks began with the mult...,single_hop_specifc_query_synthesizer
3,"As an AI Research Scientist, how do you forese...",[let's let's you know these breakthroughs that...,"By 2025, advancements in reasoning techniques ...",single_hop_specifc_query_synthesizer
4,What is the role of the Transformer in reasoni...,[hidden state of the llm as a representation o...,The Transformer plays a crucial role in reason...,single_hop_specifc_query_synthesizer
5,What are the implications of reasoning in late...,[<1-hop>\n\nhidden state of the llm as a repre...,The implications of reasoning in latent space ...,multi_hop_specific_query_synthesizer
6,What are the implications of using latent spac...,[<1-hop>\n\nhidden state of the llm as a repre...,The implications of using latent space reasoni...,multi_hop_specific_query_synthesizer
7,What are the key differences between the recur...,[<1-hop>\n\nright where the green shared recur...,The key differences between the recurrent dept...,multi_hop_specific_query_synthesizer
8,How do the concepts of reasoning in latent spa...,[<1-hop>\n\nlike when you when you when you're...,The concepts of reasoning in latent space and ...,multi_hop_specific_query_synthesizer
9,How does the recurrent depth approach relate t...,[<1-hop>\n\nright where the green shared recur...,"The recurrent depth approach, while not direct...",multi_hop_specific_query_synthesizer


In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_documents = text_splitter.split_documents(docs)
len(split_documents)

66

In [7]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [8]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

client = QdrantClient(":memory:")

client.create_collection(
    collection_name="latent_space_youtube",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

vector_store = QdrantVectorStore(
    client=client,
    collection_name="latent_space_youtube",
    embedding=embeddings,
)

In [9]:
_ = vector_store.add_documents(documents=split_documents)

In [10]:
retriever = vector_store.as_retriever(search_kwargs={"k": 5})

In [11]:
def retrieve(state):
  retrieved_docs = retriever.invoke(state["question"])
  return {"context" : retrieved_docs}

In [12]:
from langchain.prompts import ChatPromptTemplate

RAG_PROMPT = """\
Question: {question}

        Context: {context}

        Provide a clear, insightful answer using only the provided context.
        If you cannot answer from the context, say "Insufficient context to answer."
"""

rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

In [13]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini")

In [14]:
def generate(state):
  docs_content = "\n\n".join(doc.page_content for doc in state["context"])
  messages = rag_prompt.format_messages(question=state["question"], context=docs_content)
  response = llm.invoke(messages)
  return {"response" : response.content}

In [15]:
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from langchain_core.documents import Document

class State(TypedDict):
  question: str
  context: List[Document]
  response: str

In [16]:
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [17]:
for test_row in dataset:
  response = graph.invoke({"question" : test_row.eval_sample.user_input})
  test_row.eval_sample.response = response["response"]
  test_row.eval_sample.retrieved_contexts = [context.page_content for context in response["context"]]

In [18]:
dataset.to_pandas()

Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,synthesizer_name
0,What are the key concepts related to GPT in th...,[it's kind of funny in a logical way if you lo...,[okay whiz we're talking about reasoning in la...,The key concepts related to GPT in the context...,In the context of reasoning in continuous late...,single_hop_specifc_query_synthesizer
1,What is the scaling hypothesis in the context ...,[that we might have there are many different s...,[kind of doing this compression okay we're tak...,The scaling hypothesis in the context of token...,The scaling hypothesis suggests that going tok...,single_hop_specifc_query_synthesizer
2,How did artificial neural networks evolve in t...,[perceptron artificial neural networks and sin...,[like when you when you when you're trying to ...,Artificial neural networks have evolved signif...,Artificial neural networks began with the mult...,single_hop_specifc_query_synthesizer
3,"As an AI Research Scientist, how do you forese...",[the next step in the evolution of what we've ...,[let's let's you know these breakthroughs that...,"By 2025, advancements in reasoning techniques ...","By 2025, advancements in reasoning techniques ...",single_hop_specifc_query_synthesizer
4,What is the role of the Transformer in reasoni...,[really is doing at a calculation level at tes...,[hidden state of the llm as a representation o...,The role of the Transformer in reasoning model...,The Transformer plays a crucial role in reason...,single_hop_specifc_query_synthesizer
5,What are the implications of reasoning in late...,[okay whiz we're talking about reasoning in la...,[<1-hop>\n\nhidden state of the llm as a repre...,Reasoning in latent space presents significant...,The implications of reasoning in latent space ...,multi_hop_specific_query_synthesizer
6,What are the implications of using latent spac...,[it's kind of funny in a logical way if you lo...,[<1-hop>\n\nhidden state of the llm as a repre...,The implications of using latent space reasoni...,The implications of using latent space reasoni...,multi_hop_specific_query_synthesizer
7,What are the key differences between the recur...,[thanks whiz all right guys we are gonna rock ...,[<1-hop>\n\nright where the green shared recur...,The key differences between the recurrent dept...,The key differences between the recurrent dept...,multi_hop_specific_query_synthesizer
8,How do the concepts of reasoning in latent spa...,[really is doing at a calculation level at tes...,[<1-hop>\n\nlike when you when you when you're...,The concepts of reasoning in latent space and ...,The concepts of reasoning in latent space and ...,multi_hop_specific_query_synthesizer
9,How does the recurrent depth approach relate t...,[it's pretty dope not just because a bunch of ...,[<1-hop>\n\nright where the green shared recur...,The recurrent depth approach relates to Meta A...,"The recurrent depth approach, while not direct...",multi_hop_specific_query_synthesizer


In [19]:
from ragas import EvaluationDataset

evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())

In [20]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

In [21]:
from ragas.metrics import LLMContextRecall, ContextPrecision, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity
from ragas import evaluate, RunConfig

custom_run_config = RunConfig(timeout=360)

result = evaluate(
    dataset=evaluation_dataset,
    metrics=[LLMContextRecall(), ContextPrecision(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    run_config=custom_run_config
)
result

Evaluating: 100%|██████████| 60/60 [05:20<00:00,  5.34s/it]


{'context_recall': 0.8750, 'faithfulness': 0.8227, 'factual_correctness': 0.4340, 'answer_relevancy': 0.9860, 'context_entity_recall': 0.2497, 'noise_sensitivity_relevant': 0.3337}

# Embedding Fine Tuning

In [22]:
import uuid

id_set = set()

for document in split_documents:
  id = str(uuid.uuid4())
  while id in id_set:
    id = uuid.uuid4()
  id_set.add(id)
  document.metadata["id"] = id

In [23]:
len(split_documents)

66

In [24]:
training_split_documents = split_documents[:len(split_documents) - 24]
val_split_documents = split_documents[len(split_documents) - 24:102-12]
test_split_documents = split_documents[102-12:]

In [25]:
from langchain_openai import ChatOpenAI

qa_chat_model = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

In [26]:
from langchain_core.prompts import ChatPromptTemplate

qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)

In [27]:
question_generation_chain = qa_prompt_template | qa_chat_model

In [28]:
import tqdm

async def create_questions(documents, n_questions):
  questions = {}
  relevant_docs = {}

  # Generate questions for each document
  for doc in tqdm.tqdm(documents, desc="Processing documents"):
    doc_id = doc.metadata["id"]

    # Generate n questions for this document
    for _ in range(n_questions):
      # Generate a question using the chain
      response = await question_generation_chain.ainvoke({"context": doc.page_content, "n_questions": n_questions})
      question = response.content

      # Generate unique ID for this question
      question_id = str(uuid.uuid4())

      # Store question and relevant doc mapping
      questions[question_id] = question
      relevant_docs[question_id] = [doc_id]

  return questions, relevant_docs

In [29]:
training_questions, training_relevant_contexts = await create_questions(training_split_documents, 2)

Processing documents: 100%|██████████| 42/42 [01:15<00:00,  1.81s/it]


In [30]:
val_questions, val_relevant_contexts = await create_questions(val_split_documents, 2)

Processing documents: 100%|██████████| 24/24 [00:42<00:00,  1.76s/it]


In [31]:
test_questions, test_relevant_contexts = await create_questions(test_split_documents, 2)

Processing documents: 0it [00:00, ?it/s]


In [32]:
import json

training_corpus = {train_item.metadata["id"] : train_item.page_content for train_item in training_split_documents}

train_dataset = {
    "questions" : training_questions,
    "relevant_contexts" : training_relevant_contexts,
    "corpus" : training_corpus
}

with open("training_dataset.jsonl", "w") as f:
  json.dump(train_dataset, f)

In [33]:
val_corpus = {val_item.metadata["id"] : val_item.page_content for val_item in val_split_documents}

val_dataset = {
    "questions" : val_questions,
    "relevant_contexts" : val_relevant_contexts,
    "corpus" : val_corpus
}

with open("val_dataset.jsonl", "w") as f:
  json.dump(val_dataset, f)

In [34]:
train_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_split_documents}

test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : train_corpus
}

with open("test_dataset.jsonl", "w") as f:
  json.dump(test_dataset, f)

In [35]:
from sentence_transformers import SentenceTransformer

model_id = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
model = SentenceTransformer(model_id)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  5.29it/s]


In [36]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample

In [37]:
BATCH_SIZE = 10

In [38]:
corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

examples = []
for query_id, query in queries.items():
    doc_id = relevant_docs[query_id][0]
    text = corpus[doc_id]
    example = InputExample(texts=[query, text])
    examples.append(example)

In [39]:
loader = DataLoader(
    examples, batch_size=BATCH_SIZE
)

In [40]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

matryoshka_dimensions = [768, 512, 256, 128, 64]
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [41]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

corpus = val_dataset['corpus']
queries = val_dataset['questions']
relevant_docs = val_dataset['relevant_contexts']

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

In [42]:
EPOCHS = 10

In [43]:
import wandb
wandb.init(mode="disabled")

In [None]:
warmup_steps = int(len(loader) * EPOCHS * 0.1)

model.fit(
    train_objectives=[(loader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='finetuned_arctic_ft',
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50
)

