In [None]:
from utils.web_scraper import ai_crawler
import textwrap
from sentence_transformers import SentenceTransformer
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import requests
import html2text
embedding_model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")
tokenizer = AutoTokenizer.from_pretrained("Intel/dynamic_tinybert")
qna_model = AutoModelForQuestionAnswering.from_pretrained("Intel/dynamic_tinybert")


  from .autonotebook import tqdm as notebook_tqdm


Invalid model-index. Not loading eval results into CardData.


# Tokenization (Not super important, but useful to understand embeddings)

How a document is split into subunits.

* Character Tokens
* Word Tokens
* Sentence Tokens

### GPT
* [Byte-Pair Encoding Tokens](https://towardsdatascience.com/byte-pair-encoding-for-beginners-708d4472c0c7/)

![Alt text for the image](https://towardsdatascience.com/wp-content/uploads/2023/10/1tQx4iDNDvME61PGO3t_qAw-768x540.png) 


# Text Embedding

An embedding is a list of numbers that represents the meaning of text, so similar things have similar numbers.

Far apart (different meaning): "king" and "banana" → vectors are far apart because they have unrelated meanings.

Close together (similar meaning): "king" and "queen" → vectors are near each other because both are royalty.
Close together (similar meaning): "man" and "woman" → vectors are near each other because both describe a gender.

Cool Fact: 
embedding("king") - embedding("man") + embedding("woman") ≈ embedding("queen")

![3D Representation of word embeddings](https://ai.engin.umich.edu/wp-content/uploads/sites/8/2020/06/king-queen.png)


# In this Workshop, we will be using sentence embeddings

In [2]:
# The queries and documents to embed
queries = [
    "What is the capital of China?",
    "Explain gravity",
]

context = "The capital of China is Beijing. Gravity is a force that attracts two bodies towards each other and it gives weight to physical objects and is responsible for the movement of planets around the sun."


documents = [sentence.strip() for sentence in context.split(".")[:2]]
documents


['The capital of China is Beijing',
 'Gravity is a force that attracts two bodies towards each other and it gives weight to physical objects and is responsible for the movement of planets around the sun']

In [3]:

# embedding_model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B") # Commented because we already loaded it above

query_embeddings = embedding_model.encode(queries, prompt_name="query")
document_embeddings = embedding_model.encode(documents)

# Compute the (cosine) similarity between the query and document embeddings
similarity = embedding_model.similarity(query_embeddings, document_embeddings)

# Print the similarity scores
print(similarity)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tensor([[0.7527, 0.1588],
        [0.1783, 0.5727]])


In [28]:
query_embeddings.shape

(2, 1024)

# Data Wrangling

In [None]:
# URL example
# Test with a simple URL that should work
# document = ai_crawler(
#     start_url="https://httpbin.org/html",  # Simple test HTML page
#     num_workers=1,
#     num_levels_deep=0  # Only crawl the starting page
# )

## simple version
response = requests.get("https://httpbin.org/html")
text = html2text.html2text(response.text)
print(text)

# Herman Melville - Moby-Dick

Availing himself of the mild, summer-cool weather that now reigned in these
latitudes, and in preparation for the peculiarly active pursuits shortly to be
anticipated, Perth, the begrimed, blistered old blacksmith, had not removed
his portable forge to the hold again, after concluding his contributory work
for Ahab's leg, but still retained it on deck, fast lashed to ringbolts by the
foremast; being now almost incessantly invoked by the headsmen, and
harpooneers, and bowsmen to do some little job for them; altering, or
repairing, or new shaping their various weapons and boat furniture. Often he
would be surrounded by an eager circle, all waiting to be served; holding
boat-spades, pike-heads, harpoons, and lances, and jealously watching his
every sooty movement, as he toiled. Nevertheless, this old man's was a patient
hammer wielded by a patient arm. No murmur, no impatience, no petulance did
come from him. Silent, slow, and solemn; bowing over still furth

# Data Pre-processing (Optional)

Some data may include:
* tables
* images
* html
* grabled characters (EX: â€™)
* lists

Which may all require their own special formatter.
Here are some tools that are freely available to help:
* camelot -- parses tables
* datalab-to/marker -- another table parser
* pdfminer
* pymupdf 
* BeautifulSoup
* Scrapy

# Naive Retrieval Augmented Generation

### Indexing (Chunk Size)

How should I split my text in to chunks?

Note: Also for the sake of simplicity, we are not tokenizing the text.
In practice, you would want to tokenize them for better results to avoid splitting words in half
But also keep in mind, since we are using a sentence embedder, it would make more sense to split the words by sentences.

In [29]:
# Chunk size
X = 200  # characters per chunk

# Split into chunks
chunks = [text[i:i+X] for i in range(0, len(text), X)]

# Print results
for idx, chunk in enumerate(chunks, start=1):
    print(f"Chunk {idx}: {chunk}")

## Look at the chunks -- the text is split in the middle of a word

Chunk 1: # Herman Melville - Moby-Dick

Availing himself of the mild, summer-cool weather that now reigned in these
latitudes, and in preparation for the peculiarly active pursuits shortly to be
anticipated, P
Chunk 2: erth, the begrimed, blistered old blacksmith, had not removed
his portable forge to the hold again, after concluding his contributory work
for Ahab's leg, but still retained it on deck, fast lashed to
Chunk 3:  ringbolts by the
foremast; being now almost incessantly invoked by the headsmen, and
harpooneers, and bowsmen to do some little job for them; altering, or
repairing, or new shaping their various weap
Chunk 4: ons and boat furniture. Often he
would be surrounded by an eager circle, all waiting to be served; holding
boat-spades, pike-heads, harpoons, and lances, and jealously watching his
every sooty movemen
Chunk 5: t, as he toiled. Nevertheless, this old man's was a patient
hammer wielded by a patient arm. No murmur, no impatience, no petulance did
come from him. 

### Retrieving the chunks

How should I retrieved those chunks?

In [30]:
# Questions:
# What work was Perth doing on deck, and why was his forge not stored in the hold?
# What physical impairment did Perth have, and how did it occur?
# What personal tragedy led to the ruin of Perth’s home and family life?
# How is Perth’s hammering described in relation to his life and heart?
# What does the narrator suggest would have been a kinder fate for Perth before his ruin?

query = "What physical impairment did Perth have, and how did it occur?"
# query = "What work was Perth doing on deck, and why was his forge not stored in the hold?"
# qs hammering described in relation to his life and heart?"
# query = "What does theuery = "What personal tragedy led to the ruin of Perth’s home and family life?"
# query = "How is Perth’ narrator suggest would have been a kinder fate for Perth before his ruin?"

chunk_embeddings = embedding_model.encode(chunks, prompt_name="query")
query_embedding = embedding_model.encode(query, prompt_name="query")

# Compute the (cosine) similarity between the query and document embeddings
similarity = embedding_model.similarity(query_embedding, chunk_embeddings)

# Print the similarity scores
print(similarity)

# Print Index of Highest Similarity Score
print(f"\n\n")

# Print Highest Similarity Score
print(f"Highest Similarity Score: {similarity.max():.2f}")

print(f"\n")

# Print Chunk with Highest Similarity Score
highest_rated_chunk = chunks[similarity.argmax()]
print(textwrap.fill(highest_rated_chunk, width=80))

# Print Index of Lowest Similarity Score
print(f"\n\n")

# Print Lowest Similarity Score
print(f"Lowest Similarity Score: {similarity.min():.2f}")

print(f"\n")

# Print Chunk with Lowest Similarity Score
lowest_rated_chunk = chunks[similarity.argmin()]
print(textwrap.fill(lowest_rated_chunk, width=80))


tensor([[0.0821, 0.1290, 0.0224, 0.1546, 0.1770, 0.2171, 0.2661, 0.0638, 0.3043,
         0.0438, 0.1750, 0.0795, 0.0501, 0.1321, 0.1390, 0.1799, 0.0613, 0.0805]])



Highest Similarity Score: 0.30


wo country towns, the blacksmith half-stupidly felt the deadly numbness stealing
over him, and sought refuge in a leaning, dilapidated barn. The issue was, the
loss of the extremities of both feet. Ou



Lowest Similarity Score: 0.02


 ringbolts by the foremast; being now almost incessantly invoked by the
headsmen, and harpooneers, and bowsmen to do some little job for them; altering,
or repairing, or new shaping their various weap


### Generating the answer

How should I generate the answer?

In [31]:
highest_rated_chunk_tokens = tokenizer.encode_plus(query, highest_rated_chunk, return_tensors="pt", truncation=True)
lowest_rated_chunk_tokens = tokenizer.encode_plus(query, lowest_rated_chunk, return_tensors="pt", truncation=True)

# Highest rated chunk
highest_rated_chunk_input_ids = highest_rated_chunk_tokens["input_ids"]
highest_rated_chunk_attention_mask = highest_rated_chunk_tokens["attention_mask"]

# Perform question answering
outputs = qna_model(highest_rated_chunk_input_ids, attention_mask=highest_rated_chunk_attention_mask)
start_scores = outputs.start_logits
end_scores = outputs.end_logits

# Find the start and end positions of the answer
answer_start = torch.argmax(start_scores)
answer_end = torch.argmax(end_scores) + 1
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(highest_rated_chunk_input_ids[0][answer_start:answer_end]))
print(f"Answer: {answer}")

# Lowest rated chunk
lowest_rated_chunk_input_ids = lowest_rated_chunk_tokens["input_ids"]
lowest_rated_chunk_attention_mask = lowest_rated_chunk_tokens["attention_mask"]

# Perform question answering
outputs = qna_model(lowest_rated_chunk_input_ids, attention_mask=lowest_rated_chunk_attention_mask)
## Note here that we are using a qna model to generate the answer -- which is trained on question-answer pairs
## General-purpose models may need additonal prompting to generate the answer:
## "You are a helpful assistant, answer the question based on the context provided"

start_scores = outputs.start_logits
end_scores = outputs.end_logits

# Find the start and end positions of the answer
answer_start = torch.argmax(start_scores)
answer_end = torch.argmax(end_scores) + 1
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(lowest_rated_chunk_input_ids[0][answer_start:answer_end]))
print(f"Answer: {answer}")

Answer: the loss of the extremities
Answer: altering, or repairing


# GPT-Generated Answers from the text for confirmation

1. What physical impairment did Perth have, and how did it occur?

Perth lost the extremities of both feet.

This occurred one bitter winter's midnight, when he sought refuge in a dilapidated barn and felt deadly numbness stealing over him, ultimately leading to the loss of his feet.

2. What work was Perth doing on deck, and why was his forge not stored in the hold?

Perth was altering, repairing, or reshaping weapons and boat furniture for the headsmen, harpooneers, and bowsmen.

His portable forge was kept on deck, fastened to ringbolts by the foremast, because he was almost constantly asked to do small jobs and needed it readily accessible.

3. What personal tragedy led to the ruin of Perth’s home and family life?

A burglar, the Bottle Conjuror, broke into his home under cover of darkness.

Perth unknowingly led the burglar into his family's home, resulting in the loss of everything he owned—his house, garden, and possessions—leaving his family destitute.

4. How is Perth’s hammering described in relation to his life and heart?

Perth’s hammering is described as patient, slow, solemn, and almost lifelike, with the heavy beating of his hammer likened to the heavy beating of his heart.

It conveys his dedication, endurance, and the sense that toil itself is the essence of his life.

5. What does the narrator suggest would have been a kinder fate for Perth before his ruin?

The narrator suggests that Death taking Perth before his full ruin would have been kinder, so that his young widow and orphans would have experienced grief without total destitution and retained the memory of a “venerable, legendary sire” rather than enduring complete loss.

# LLM alternative

If your lab has access to OpenAI, here is the code for generation

In [10]:
# from openai import OpenAI

# # Initialize client
# client = OpenAI(api_key="YOUR_API_KEY")

# # Your query and chunks
# query = "What physical impairment did Perth have, and how did it occur?"

# def get_llm_answer(question, context):
#     prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
#     response = client.chat.completions.create(
#         model="gpt-4o-mini",
#         messages=[{"role": "user", "content": prompt}],
#         temperature=0  # deterministic answers
#     )
#     answer = response.choices[0].message.content.strip()
#     return answer

# # Highest rated chunk
# highest_answer = get_llm_answer(query, highest_rated_chunk)
# print(f"Answer from highest rated chunk: {highest_answer}")

# # Lowest rated chunk
# lowest_answer = get_llm_answer(query, lowest_rated_chunk)
# print(f"Answer from lowest rated chunk: {lowest_answer}")

In [11]:
def generate_answer(question, context):
    chunk_tokens = tokenizer.encode_plus(question, context, return_tensors="pt", truncation=True)
    chunk_input_ids = chunk_tokens["input_ids"]
    chunk_attention_mask = chunk_tokens["attention_mask"]
    outputs = qna_model(chunk_input_ids, attention_mask=chunk_attention_mask)
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores) + 1
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(chunk_input_ids[0][answer_start:answer_end]))
    return answer
    

In [12]:
import chromadb
from chromadb.config import Settings


In [13]:
client = chromadb.Client(Settings(persist_directory=".chromadb"))

collection = client.create_collection(name="my_qa_collection")

collection.add(
    documents=chunks,
    metadatas=[{"source": f"chunk{i}"} for i in range(len(chunks))],
    ids=[str(i) for i in range(len(chunks))],
    embeddings=chunk_embeddings
)


In [14]:

query = "What work was Perth doing on deck, and why was his forge not stored in the hold?"
# query = "What physical impairment did Perth have, and how did it occur?"
# query = "What personal tragedy led to the ruin of Perth’s home and family life?"
# query = "How is Perth’s hammering described in relation to his life and heart?"
# query = "What does the narrator suggest would have been a kinder fate for Perth before his ruin?"



query_embedding = embedding_model.encode(query, prompt_name="query")
results = collection.query(query_embeddings=[query_embedding], n_results=1)

print("Top match document:", results['documents'][0][0])
print("Top match metadata:", results['metadatas'][0][0])

generate_answer(query, results['documents'][0][0])

Top match document: erth, the begrimed, blistered old blacksmith, had not removed his portable forge to the hold again, after concluding his contributory work for Ahab's leg, but still retained it on deck, fast lashed to
Top match metadata: {'source': 'chunk1'}


'contributory work'

# Evaluation

How do I know if my RAG is performing correctly?



### Option 1 (best case scenario, labeled dataset)

You have a labeled dataset, for example [FAQ](https://faq.caesarstone.co.uk/en/care-maintenance).


In [15]:
qna_data = {
  "questions_and_answers": [
    {
      "question": "What work was Perth doing on deck, and why was his forge not stored in the hold?",
      "answer": "Perth was altering, repairing, or reshaping weapons and boat furniture for the headsmen, harpooneers, and bowsmen. His portable forge was kept on deck, fastened to ringbolts by the foremast, because he was almost constantly asked to do small jobs and needed it readily accessible."
    },
    {
      "question": "What physical impairment did Perth have, and how did it occur?",
      "answer": "Perth lost the extremities of both feet. This occurred one bitter winter's midnight, when he sought refuge in a dilapidated barn and felt deadly numbness stealing over him, ultimately leading to the loss of his feet."
    },
    {
      "question": "What personal tragedy led to the ruin of Perth’s home and family life?",
      "answer": "A burglar, the Bottle Conjuror, broke into his home under cover of darkness. Perth unknowingly led the burglar into his family's home, resulting in the loss of everything he owned—his house, garden, and possessions—leaving his family destitute."
    },
    {
      "question": "How is Perth’s hammering described in relation to his life and heart?",
      "answer": "Perth’s hammering is described as patient, slow, solemn, and almost lifelike, with the heavy beating of his hammer likened to the heavy beating of his heart. It conveys his dedication, endurance, and the sense that toil itself is the essence of his life."
    },
    {
      "question": "What does the narrator suggest would have been a kinder fate for Perth before his ruin?",
      "answer": "The narrator suggests that Death taking Perth before his full ruin would have been kinder, so that his young widow and orphans would have experienced grief without total destitution and retained the memory of a “venerable, legendary sire” rather than enduring complete loss."
    }
  ]
}


for chunk, qna in zip(chunks, qna_data["questions_and_answers"]):
  question_embedding = embedding_model.encode(qna["question"], prompt_name="query")
  retrieved_chunk = collection.query(query_embeddings=[question_embedding], n_results=1)
  answer = generate_answer(qna["question"], retrieved_chunk['documents'][0][0])
  print(f"Question: {qna['question']}")
  print(f"Answer: {qna['answer']}")
  print(f"Generated Answer: {answer}")
  print(f"Retrieved Chunk: {retrieved_chunk['documents'][0][0]}")
  print("\n")




Question: What work was Perth doing on deck, and why was his forge not stored in the hold?
Answer: Perth was altering, repairing, or reshaping weapons and boat furniture for the headsmen, harpooneers, and bowsmen. His portable forge was kept on deck, fastened to ringbolts by the foremast, because he was almost constantly asked to do small jobs and needed it readily accessible.
Generated Answer: contributory work
Retrieved Chunk: erth, the begrimed, blistered old blacksmith, had not removed his portable forge to the hold again, after concluding his contributory work for Ahab's leg, but still retained it on deck, fast lashed to


Question: What physical impairment did Perth have, and how did it occur?
Answer: Perth lost the extremities of both feet. This occurred one bitter winter's midnight, when he sought refuge in a dilapidated barn and felt deadly numbness stealing over him, ultimately leading to the loss of his feet.
Generated Answer: the loss of the extremities
Retrieved Chunk: wo 

# More Evaluations

### Option 2 (LLMs as a judge)

"Our results reveal that strong LLM judges like GPT-4 can match both controlled and crowdsourced human preferences well, achieving over 80% agreement, the same level of agreement between humans."

(Zheng et al., 2023; [Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena](https://proceedings.neurips.cc/paper_files/paper/2023/file/91f18a1287b398d378ef22505bf41832-Paper-Datasets_and_Benchmarks.pdf); >5000 citations) -- makers of Chatbot Arena

[Trulens](https://www.snowflake.com/en/engineering-blog/benchmarking-LLM-as-a-judge-RAG-triad-metrics/)

```python
prompt: """
You are a Biologist grader; providing the reasonablness of a given RESPONSE to a give PROMPT.

Answer Criteria:
{context} ## Where you may or may not have a RAG system to annotate your data
"""
```

### Option 3 (Synthetic dataset)

Generate your own synthetic dataset

```python
queries = [...,...,...]
chunks = [...,...,...]
evaluations = []

for query, chunk in zip(queries, chunks):
    result = generate_answer(query, chunk)
    evaluations.append({"question": query, "answer": result, "context": chunk})

{
    question: "...",
    answer: "...",
    context: "..."
}
```

In [16]:
#Print text
print(textwrap.fill(text, width=80) )


# Herman Melville - Moby-Dick  Availing himself of the mild, summer-cool weather
that now reigned in these latitudes, and in preparation for the peculiarly
active pursuits shortly to be anticipated, Perth, the begrimed, blistered old
blacksmith, had not removed his portable forge to the hold again, after
concluding his contributory work for Ahab's leg, but still retained it on deck,
fast lashed to ringbolts by the foremast; being now almost incessantly invoked
by the headsmen, and harpooneers, and bowsmen to do some little job for them;
altering, or repairing, or new shaping their various weapons and boat furniture.
Often he would be surrounded by an eager circle, all waiting to be served;
holding boat-spades, pike-heads, harpoons, and lances, and jealously watching
his every sooty movement, as he toiled. Nevertheless, this old man's was a
patient hammer wielded by a patient arm. No murmur, no impatience, no petulance
did come from him. Silent, slow, and solemn; bowing over still furth

In [17]:
# import ollama

# response = ollama.generate(model='gemma:2b', prompt='what is a qubit')

# print(response['response'])

In [None]:
import textwrap
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import chromadb
from chromadb.config import Settings
import numpy as np
from numpy import dot
from numpy.linalg import norm
import utils.get_data


class QADocumentRetriever:
    def __init__(self, documents, chunk_size=200, overlap=0, embedding_model=None, tokenizer=None, llm=None):
        self.documents = documents
        self.chunk_size = chunk_size
        self.overlap = overlap
        if self.overlap >= self.chunk_size:
            raise ValueError("Overlap must be less than chunk size")
        
        # Chunk documents
        self.chunks = self._chunk_documents(documents)
        
        # Initialize embedding model
        self.embedding_model = embedding_model
        self.chunk_embeddings = self.embedding_model.encode(self.chunks, batch_size=2, show_progress_bar=True, convert_to_numpy=True).astype(np.float32)
        
        # Initialize QA model
        self.tokenizer = tokenizer
        self.qna_model = llm
        
        # Initialize ChromaDB client and collection
        self.client = chromadb.Client(Settings(persist_directory=".chromadb"))
        existing_collections = [c.name for c in self.client.list_collections()]
        if "qa_collection" in existing_collections:
            self.client.delete_collection(name="qa_collection")
        self.collection = self.client.create_collection(name="qa_collection")
        self.collection.add(
            documents=self.chunks,
            metadatas=[{"source": f"chunk{i}"} for i in range(len(self.chunks))],
            ids=[str(i) for i in range(len(self.chunks))],
            embeddings=self.chunk_embeddings
        )
        
    def _chunk_documents(self, documents):
        chunks = []
        for doc in documents:
            words = doc.split()
            for i in range(0, len(words), self.chunk_size - self.overlap):
                chunk = " ".join(words[i:i+self.chunk_size])
                if chunk:  # avoid empty chunks
                    chunks.append(chunk)
        return chunks

    def generate_answer(self, question, context):
        chunk_tokens = self.tokenizer.encode_plus(question, context, return_tensors="pt", truncation=True)
        chunk_input_ids = chunk_tokens["input_ids"]
        chunk_attention_mask = chunk_tokens["attention_mask"]
        outputs = self.qna_model(chunk_input_ids, attention_mask=chunk_attention_mask)
        start_scores = outputs.start_logits
        end_scores = outputs.end_logits
        answer_start = torch.argmax(start_scores)
        answer_end = torch.argmax(end_scores) + 1
        answer = self.tokenizer.convert_tokens_to_string(
            self.tokenizer.convert_ids_to_tokens(chunk_input_ids[0][answer_start:answer_end])
        )
        return answer

    def retrieve_and_answer(self, question, n_results=1):
        # Get embedding for the question
        question_embedding = self.embedding_model.encode(question, convert_to_numpy=True)
        
        # Retrieve top n_results chunks
        retrieved = self.collection.query(query_embeddings=[question_embedding], n_results=n_results)
        retrieved_chunks = retrieved['documents'][0]
        
        # Concatenate all retrieved chunks into a single context
        full_context = "\n".join(retrieved_chunks)
        
        # Generate answer using the entire context
        answer = self.generate_answer(question, full_context)
        
        return [{"chunk": full_context, "answer": answer}]

    def evaluate_similarity(self, question):
        question_embedding = self.embedding_model.encode(question, convert_to_numpy=True)
        similarities = [dot(question_embedding, chunk_emb)/ (norm(question_embedding) * norm(chunk_emb))
                        for chunk_emb in self.chunk_embeddings]
        return similarities
    
    def evaluate_qna_set(self, qna_data, n_results=1, print_results=True):
        """
        Evaluate a list of questions and reference answers.
        Returns a list of dicts with question, reference answer, generated answer,
        retrieved chunk, and similarity score.
        """
        evaluation_results = []

        for item in qna_data['questions_and_answers']:
            question = item["question"]
            reference_answer = item["answer"]

            # Retrieve document(s) and generate answer
            answers = self.retrieve_and_answer(question, n_results=n_results)
            generated = answers[0]["answer"]
            retrieved_chunk = answers[0]["chunk"]

            # Compute similarity between generated answer and reference answer
            gen_embedding = self.embedding_model.encode(generated, convert_to_numpy=True)
            ref_embedding = self.embedding_model.encode(reference_answer, convert_to_numpy=True)
            similarity = float(dot(gen_embedding, ref_embedding) / (norm(gen_embedding) * norm(ref_embedding)))

            result = {
                "question": question,
                "reference_answer": reference_answer,
                "generated_answer": generated,
                "retrieved_chunk": retrieved_chunk,
                "similarity_score": similarity,
            }
            evaluation_results.append(result)

            if print_results:
                print(f"Question: {question}")
                print(f"Reference Answer: {reference_answer}")
                print(f"Generated Answer: {generated}")
                print(f"Similarity Score: {similarity:.4f}")
                print("Retrieved Chunk:\n", textwrap.fill(retrieved_chunk, width=80))
                print("\n" + "-"*80 + "\n")

        avg_similarity = sum(r["similarity_score"] for r in evaluation_results) / len(evaluation_results)
        print("Average similarity score: ", avg_similarity)

        return evaluation_results

###### QA Models #######

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")

# tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
# model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

# tokenizer = AutoTokenizer.from_pretrained("twmkn9/bert-base-uncased-squad2")
# model = AutoModelForQuestionAnswering.from_pretrained("twmkn9/bert-base-uncased-squad2")

# tokenizer = AutoTokenizer.from_pretrained("ahotrod/electra_large_discriminator_squad2_512")
# model = AutoModelForQuestionAnswering.from_pretrained("ahotrod/electra_large_discriminator_squad2_512")

###### Embedding Models #######
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# embedding_model = SentenceTransformer("all-mpnet-base-v2")

# embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# embedding_model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")

# embedding_model = SentenceTransformer("sentence-transformers/msmarco-distilbert-base-v4")

# embedding_model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

# embedding_model = SentenceTransformer("paraphrase-MiniLM-L3-v2")

# embedding_model = SentenceTransformer("all-mpnet-base-v2")


qa_retriever = QADocumentRetriever([text], chunk_size=30, overlap=10, embedding_model=embedding_model, tokenizer=tokenizer, llm=model)
results = qa_retriever.evaluate_qna_set(qna_data, n_results=2)