In [1]:
# Claude suggested I add in the system path to help find the py module. See Appendix K. 
import sys
sys.path.append('/Users/shoyou100/Visual-Code-Studio-workspace/assignment2-rag/src')
from langchain_google_genai import ChatGoogleGenerativeAI
import pandas as pd
from naive_rag import NaiveRAG  
from evaluation import Evaluation
from ragas import evaluate
from datasets import Dataset

from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)


  from .autonotebook import tqdm as notebook_tqdm


# Read Passages from the Datasets and Drop rows if they are NA or empty

In [2]:
passages = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/passages.parquet/part.0.parquet")

print(passages.shape)
passages.head()

(3200, 1)


Unnamed: 0_level_0,passage
id,Unnamed: 1_level_1
0,"Uruguay (official full name in ; pron. , Eas..."
1,"It is bordered by Brazil to the north, by Arge..."
2,Montevideo was founded by the Spanish in the e...
3,The economy is largely based in agriculture (m...
4,"According to Transparency International, Urugu..."


In [3]:
queries = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/test.parquet/part.0.parquet")
print(queries.shape)
queries.head()

(918, 2)


Unnamed: 0_level_0,question,answer
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Was Abraham Lincoln the sixteenth President of...,yes
2,Did Lincoln sign the National Banking Act of 1...,yes
4,Did his mother die of pneumonia?,no
6,How many long was Lincoln's formal education?,18 months
8,When did Lincoln begin his political career?,1832


In [4]:
naive_rag = NaiveRAG('all-MiniLM-L6-v2', 'google/flan-t5-small', 'rag_wikipedia_mini.db', 'rag_mini')

passage_embeddings = naive_rag.embedding_model.encode(passages['passage'].tolist()) 
print(naive_rag.embedding_model.get_sentence_embedding_dimension())
query_embeddings = naive_rag.embedding_model.encode(queries['question'].tolist())
print(query_embeddings)
print(naive_rag.embedding_model.get_sentence_embedding_dimension())

Embedding model loaded.
Tokenizer loaded.
Schema created.
Seq2Seq model loaded.
Milvus client connected.
Dropping existing collection 'rag_mini'...
Collection dropped.


I0000 00:00:1759542181.808160  805789 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers


384
[[-0.00375673  0.03796374 -0.03182048 ... -0.04027529  0.00659006
   0.03112675]
 [-0.03231501 -0.0094521  -0.10944731 ... -0.01333902 -0.0162666
  -0.01005705]
 [ 0.00882224  0.0258874  -0.01894972 ... -0.03792767  0.07239034
   0.00896565]
 ...
 [-0.00906186 -0.04410971 -0.11035763 ... -0.03789582  0.03106767
   0.06000853]
 [ 0.02661895  0.05995833 -0.08694381 ...  0.0077695   0.01058571
  -0.01678987]
 [-0.00681225  0.05897506 -0.05983922 ... -0.01872636 -0.01558293
   0.01479083]]
384


# Create Milvus Client and Insert your Embeddings to your DB
- Make sure you define a schema for your collection (Points will be deducted if you fail to define a proper schema with ids, passage text, embedding)

In [5]:
# Redefine id_ to ensure it is available
id_ = passages.index.tolist()
passage = passages['passage'].tolist()
embedding = passage_embeddings.tolist()
naive_rag.create_dataBase(passages, 'passage', passage_embeddings, 384)

Columns defined.
Fields added to schema.
Collection created.
Data inserted successfully.
Index created successfully.
Collection loaded into memory


In [6]:
naive_rag.sanityCheck('rag_mini')

Entity count: 3200
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': '', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


# Steps to Fetch Results
- Read the Question Dataset
- Clean the Question Dataset if necessary (Drop Questions with NaN etc.)
- Convert Each Query to a Vector Embedding (Use the same embedding model you used to embed your document)
- Try for a Single Question First
- Load Collection into Memory after creating Index for Search on your embedding field (This is an essential step before you can search in your db)
- Search and Fetch Top N Results

**Develop your Prompt**

# Generate Responses for 100 queries in the Dataset

In [7]:
# Prompt Style 1 - Instruction Prompt
system_prompt = f"You are a concise and accurate assistant. Answer the question based on the provided context."
naive_rag.queries_list = []  # Reset queries_list before new searches
naive_rag.contexts_list = [] 

# Top-1 Retrieval with Generation for all queries
count = 0
for row in queries.question:
   naive_rag.search(row,1, passage, system_prompt)
   count += 1
   if count == 100:
      break

print(naive_rag.queries_list)

[['yes'], ['yes'], ['no'], ['18 months'], ['1832'], ['United States Note'], ['Grace Bedell'], ['1789'], ['yes'], ['yes'], ['yes'], ['yes'], ['Springfield'], ['1861'], ['Abraham Lincoln'], ['yes'], ['Ambrose Burnside'], ['freed slaves in territories not under Union control'], ['yes'], ['yes'], ['Lincoln was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['18'], ['1846'], ['1834'], ['New Salem'], ['yes'], ['yes'], ['Amedeo Avogadro'], ['a liceo'], ['1841'], ['1833'], ['yes'], ['yes'], ['no'], ['yes'], ['No, he was a king.'], ["Avogadro 's number is commonly used to compute the results of chemical reactions"], ['Tesla stated in 1925 that::'], ['no'], ['He pressed for internal improvements and increased shipbuilding and foreign trade'], ['Anders Celsius'], ['a sailor'], ['When Avogadro announced'], ['no'], ['no'], ['yes'], ['yes'], ['yes'], ['Anders Celsius'], ['named after him'], ['The Celsius crater on the Moon is named after him.'], ['0'], ['

# Finding out the Basic QA Metrics (F1 score, EM score)

In [8]:
naive_rag.calculateEM(queries.answer)

EM Score: 21/100 = 0.2100


0.21

In [9]:
# F1 Score Calculation
evaluator = Evaluation()
f1_score = evaluator.compute_f1(naive_rag.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")

F1 Score: 0.2413030303030303


# Persona Styles 2 & 3 - Top-1

In [10]:
# Prompt Style 2 - Persona Prompt
system_prompt = f"You are a concise history teacher."

naive_rag.queries_list = []  # Reset the queries list
naive_rag.contexts_list = [] 


# Top-1 Retrieval with Generation for all queries
count = 0
for row in queries.question:
   naive_rag.search(row, 1, passage, system_prompt)
   count += 1
   if count == 100:
      break

print(naive_rag.queries_list)


[['yes'], ['yes'], ['no'], ['18 months'], ['1832'], ['United States Note'], ['Grace Bedell'], ['1789'], ['yes'], ['yes'], ['yes'], ['yes'], ['Springfield'], ['1861'], ['Abraham Lincoln'], ['yes'], ['Ambrose Burnside'], ['freed slaves in territories not under Union control'], ['yes'], ['yes'], ['Lincoln was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['1816'], ['1846'], ['one of the most respected and successful lawyers in Illinois and grew steadily more prosperous.'], ['New Salem'], ['yes'], ['yes'], ['Amedeo Avogadro'], ['john d. scott'], ['1841'], ['1833'], ['yes'], ['yes'], ['no'], ['yes'], ['He was a king of the Italian scientist.'], ["Avogadro 's number is commonly used to compute the results of chemical reactions"], ['Tesla stated in 1925 that::'], ['no'], ['He also wanted to enhance and improve the commercial marine'], ['Anders Celsius'], ['a sailor'], ['1896'], ['no'], ['no'], ['yes'], ['yes'], ['yes'], ['Anders Celsius'], ['named

In [11]:
# Prompt Style 2 Evaluation
naive_rag.calculateEM(queries.answer)
f1_score = evaluator.compute_f1(naive_rag.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")


EM Score: 20/100 = 0.2000
F1 Score: 0.23759328579916816


In [12]:
# Prompt Style 3 - CoT
system_prompt = f"Read the question first. Then retrieve the context from the database. Finally, answer the question using the retrieved context."
naive_rag.queries_list = []  # Reset the queries list
naive_rag.contexts_list = [] 

# Top-1 Retrieval with Generation for all queries
count = 0
for row in queries.question:
   naive_rag.search(row, 1, passage, system_prompt)
   count += 1
   if count == 100:
      break

print(naive_rag.queries_list)

[['no'], ['Yes'], ['no'], ['18 months'], ['1832'], ['the United States Note'], ['Grace Bedell'], ['1789'], ['yes'], ['no'], ['yes'], ['He believed that this would attract steamboat traffic, which would allow the sparsely populated'], ['Springfield, Illinois'], ['In 1891 Lincoln was appointed President of the United States.'], ['Abraham Lincoln'], ['yes'], ['Ambrose Burnside'], ['freed slaves in territories not under Union control'], ['yes'], ['yes'], ['Lincoln was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['1816'], ['1846'], ['a reputation as a formidable adversary during cross-examinations and in his closing'], ['New Salem'], ['Yes'], ['Yes'], ['Amedeo Avogadro'], ['he dedicated himself to the study of physics and mathematics (then called positive philosophy), and'], ['1841'], ['1833'], ['yes'], ['A noble ancient family of Piedmont, Italy.'], ['No'], ['yes'], ['Yes'], ["Avogadro 's number is commonly used to compute the results of chem

In [13]:
# Prompt Style 3 Evaluation
naive_rag.calculateEM(queries.answer)
f1_score = evaluator.compute_f1(naive_rag.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")

EM Score: 19/100 = 0.1900
F1 Score: 0.23487421427901306


# Top-5 Retrieval with Embedding size 384 and 512

In [14]:
# Top-5 Retrieval with Generation for 100 queries (Embedding size 384)
system_prompt = f"You are a concise and accurate assistant. Answer the question based on the provided context."
naive_rag.queries_list = []  # Reset queries_list before new searches
naive_rag.contexts_list = [] 

count = 0
for row in queries.question:
   naive_rag.search(row, 5, passage, system_prompt)
   count += 1
   if count == 100:
      break

print(naive_rag.queries_list)

[['yes'], ['yes'], ['no'], ['18 months'], ['1832'], ['United States Note'], ['Grace Bedell'], ['1789'], ['yes'], ['yes'], ['yes'], ['yes'], ['Springfield'], ['1861'], ['Abraham Lincoln'], ['yes'], ['Ambrose Burnside'], ['freed slaves in territories not under Union control'], ['yes'], ['yes'], ['Lincoln was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['18'], ['1846'], ['1834'], ['New Salem'], ['yes'], ['yes'], ['Amedeo Avogadro'], ['a liceo'], ['1841'], ['1833'], ['yes'], ['yes'], ['no'], ['yes'], ['No, he was a king.'], ["Avogadro 's number is commonly used to compute the results of chemical reactions"], ['Tesla stated in 1925 that::'], ['no'], ['He pressed for internal improvements and increased shipbuilding and foreign trade'], ['Anders Celsius'], ['a sailor'], ['When Avogadro announced'], ['no'], ['no'], ['yes'], ['yes'], ['yes'], ['Anders Celsius'], ['named after him'], ['The Celsius crater on the Moon is named after him.'], ['0'], ['

In [15]:
# Prompt Style 1 Evaluation
naive_rag.calculateEM(queries.answer)
f1_score = evaluator.compute_f1(naive_rag.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")

EM Score: 21/100 = 0.2100
F1 Score: 0.23648141828501734


In [16]:
# Prompt Style 2 - Persona Prompt
system_prompt = f"You are a concise history teacher."
naive_rag.queries_list = []  # Reset queries_list before new searches
naive_rag.contexts_list = [] 

# Top-3 Retrieval with Generation for 100 queries
count = 0
for row in queries.question:
   naive_rag.search(row, 5, passage, system_prompt)
   count += 1
   if count == 100:
      break

print(naive_rag.queries_list)

[['yes'], ['yes'], ['no'], ['18 months'], ['1832'], ['United States Note'], ['Grace Bedell'], ['1789'], ['yes'], ['yes'], ['yes'], ['yes'], ['Springfield'], ['1861'], ['Abraham Lincoln'], ['yes'], ['Ambrose Burnside'], ['freed slaves in territories not under Union control'], ['yes'], ['yes'], ['Lincoln was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['1816'], ['1846'], ['one of the most respected and successful lawyers in Illinois and grew steadily more prosperous.'], ['New Salem'], ['yes'], ['yes'], ['Amedeo Avogadro'], ['john d. scott'], ['1841'], ['1833'], ['yes'], ['yes'], ['no'], ['yes'], ['He was a king of the Italian scientist.'], ["Avogadro 's number is commonly used to compute the results of chemical reactions"], ['Tesla stated in 1925 that::'], ['no'], ['He also wanted to enhance and improve the commercial marine'], ['Anders Celsius'], ['a sailor'], ['1896'], ['no'], ['no'], ['yes'], ['yes'], ['yes'], ['Anders Celsius'], ['named

In [17]:
# Prompt Style 2 Evaluation
naive_rag.calculateEM(queries.answer)
f1_score = evaluator.compute_f1(naive_rag.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")

EM Score: 20/100 = 0.2000
F1 Score: 0.23596184288707509


In [18]:
# Prompt Style 3 - CoT
system_prompt = f"Read the question first. Then retrieve the context from the database. Finally, answer the question using the retrieved context."
naive_rag.queries_list = []  # Reset the queries list
naive_rag.contexts_list = [] 

# Top-3 Retrieval with Generation for 100 queries
count = 0
for row in queries.question:
   naive_rag.search(row, 5,passage, system_prompt)
   count += 1
   if count == 100:
      break

print(naive_rag.queries_list)


[['no'], ['Yes'], ['no'], ['18 months'], ['1832'], ['the United States Note'], ['Grace Bedell'], ['1789'], ['yes'], ['no'], ['yes'], ['He believed that this would attract steamboat traffic, which would allow the sparsely populated'], ['Springfield, Illinois'], ['In 1891 Lincoln was appointed President of the United States.'], ['Abraham Lincoln'], ['yes'], ['Ambrose Burnside'], ['freed slaves in territories not under Union control'], ['yes'], ['yes'], ['Lincoln was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['1816'], ['1846'], ['a reputation as a formidable adversary during cross-examinations and in his closing'], ['New Salem'], ['Yes'], ['Yes'], ['Amedeo Avogadro'], ['he dedicated himself to the study of physics and mathematics (then called positive philosophy), and'], ['1841'], ['1833'], ['yes'], ['A noble ancient family of Piedmont, Italy.'], ['No'], ['yes'], ['Yes'], ["Avogadro 's number is commonly used to compute the results of chem

In [19]:
# Prompt Style 3 Evaluation
naive_rag.calculateEM(queries.answer)
f1_score = evaluator.compute_f1(naive_rag.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")

EM Score: 19/100 = 0.1900
F1 Score: 0.23487421427901306


In [20]:
# Top-3 Retrieval for 100 queries (Embedding size 512)
naive_rag2 = NaiveRAG('sentence-transformers/distiluse-base-multilingual-cased', 'google/flan-t5-small', 'rag_wikipedia_mini_512.db', 'rag_mini_512')
passage_embeddings2 = naive_rag2.embedding_model.encode(passages['passage'].tolist()) 
print(naive_rag2.embedding_model.get_sentence_embedding_dimension())
query_embeddings2 = naive_rag2.embedding_model.encode(queries['question'].tolist())
print(query_embeddings2)
print(naive_rag2.embedding_model.get_sentence_embedding_dimension())

Embedding model loaded.
Tokenizer loaded.
Schema created.
Seq2Seq model loaded.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Milvus client connected.
Dropping existing collection 'rag_mini_512'...
Collection dropped.
512
[[-0.01053431  0.01283608 -0.04033115 ... -0.02846795  0.05243164
  -0.01668956]
 [-0.0368097  -0.04185484 -0.0282476  ...  0.01714185  0.04274707
  -0.02702508]
 [-0.00173918 -0.04569767  0.02886216 ...  0.01394532  0.01490885
   0.03191647]
 ...
 [-0.05018368  0.01850008  0.02984808 ...  0.0090341  -0.05579572
  -0.0622946 ]
 [-0.08829523 -0.05138754  0.0374678  ... -0.00082947 -0.045458
  -0.03530771]
 [-0.06496639 -0.00691883 -0.06037014 ...  0.02557908 -0.04327128
  -0.02709984]]
512


In [21]:
# Redefine id_ to ensure it is available
id_ = passages.index.tolist()
passage = passages['passage'].tolist()
embedding = passage_embeddings2.tolist()
naive_rag2.create_dataBase(passages, 'passage', passage_embeddings2, 512)

# Entity count expected to be 6400 due to new rag_mini_512 db
naive_rag2.sanityCheck('rag_mini_512')

Columns defined.
Fields added to schema.


Collection created.
Data inserted successfully.
Index created successfully.
Collection loaded into memory
Entity count: 3200
Collection schema: {'collection_name': 'rag_mini_512', 'auto_id': False, 'num_shards': 0, 'description': '', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 512}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


In [22]:
# Prompt Style 1 - Instruction Prompt
system_prompt = f"You are a concise and accurate assistant. Answer the question based on the provided context."
naive_rag2.queries_list = []  # Reset queries_list before new searches
naive_rag2.contexts_list = [] 

# Top-3 Retrieval with Generation for all queries
count = 0
for row in queries.question:
   naive_rag2.search(row, 5, passage, system_prompt)
   count += 1
   if count == 100:
      break

print(naive_rag2.queries_list)

[['yes'], ['yes'], ['no'], ['18 months'], ['February 12, 1809 â April 15, 1865'], ['The legal tender act of 1862'], ['Grace Bedell'], ['1789'], ['yes'], ['yes'], ['no'], ['yes'], ['Perry County'], ['#He was the first President to coin an internationally recognized trademark, although not deliberately.'], ['Lincoln'], ['yes'], ['Michael Korda'], ['to preserve the Union'], ['yes'], ['yes'], ['Lincoln was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['16'], ['1846'], ['23-year'], ['the eight judicial district'], ['yes'], ['no'], ['Amedeo Avogadro'], ['a liceo'], ['1884'], ['During the Second World War'], ['no'], ['yes'], ['no'], ['yes'], ['No, he was a king.'], ["Avogadro 's number is commonly used to compute the results of chemical reactions"], ['Tesla stated in 1925 that::'], ['no'], ['The completion of the Aswan High Dam in 1971 and the resultant Lake Nasser have altered'], ['Anders Celsius'], ['a sailor'], ['1928'], ['no'], ['yes'], ['yes

In [23]:
# Prompt Style 1 Evaluation
naive_rag2.calculateEM(queries.answer)
f1_score = evaluator.compute_f1(naive_rag2.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")

EM Score: 18/100 = 0.1800
F1 Score: 0.23100906678460859


In [24]:
# Prompt Style 2 - Persona Prompt
system_prompt = f"You are a concise history teacher."

naive_rag2.queries_list = []  # Reset queries_list before new searches
naive_rag2.contexts_list = [] 

# Top-3 Retrieval with Generation for all queries
count = 0
for row in queries.question:
   naive_rag2.search(row, 5, passage, system_prompt)
   count += 1
   if count == 100:
      break

print(naive_rag2.queries_list)

[['yes'], ['yes'], ['no'], ['18 months'], ['February 12, 1809 â April 15, 1865'], ['the legal rights of the public'], ['Grace Bedell'], ['1789'], ['yes'], ['yes'], ['no'], ['in a dispute with a shareholder, James A. Barret'], ['Perry County'], ['#He was the first President to coin an internationally recognized trademark, although not deliberately.'], ['a sailor'], ['yes'], ['Michael Korda'], ['to weaken the rebellion by destroying the economic base of its leadership class'], ['yes'], ['yes'], ['Lincoln was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['sixteen'], ['1846'], ['23-year'], ['the eight judicial district'], ['yes'], ['no'], ['Amedeo Avogadro'], ['john d. scott'], ['1881'], ['he was born in 1891'], ['no'], ['yes'], ['no'], ['yes'], ['He was a king of the Italian scientist.'], ["Avogadro 's number is commonly used to compute the results of chemical reactions"], ['Tesla stated in 1925 that::'], ['no'], ['The completion of the Aswan

In [25]:
# Prompt Style 2 Evaluation
naive_rag2.calculateEM(queries.answer)
f1_score = evaluator.compute_f1(naive_rag2.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")

EM Score: 18/100 = 0.1800
F1 Score: 0.22788798394158302


In [26]:
# Prompt Style 3 - CoT
system_prompt = f"Read the question first. Then retrieve the context from the database. Finally, answer the question using the retrieved context."
naive_rag2.queries_list = []  # Reset the queries list
naive_rag2.contexts_list = [] 


# Top-3 Retrieval with Generation for all queries
count = 0
for row in queries.question:
   naive_rag2.search(row, 5, passage, system_prompt)
   count += 1
   if count == 100:
      break

print(naive_rag2.queries_list)

[['no'], ['Yes'], ['no'], ['18 months'], ['February 12, 1809 â April 15, 1865'], ['The Legal Tender Act of 1862'], ['Grace Bedell'], ['1789'], ['yes'], ['Yes'], ['no'], ['In one prominent 1851 case, he represented the Alton & Sangamon Railroad in'], ['Perry County'], ['#He was the first President to coin an internationally recognized trademark, although not deliberately.'], [':'], ['yes'], ['Michael Korda'], ['He made it clear that the North was fighting the war to preserve the Union, not to abolish slavery'], ['yes'], ['yes'], ['Lincoln was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['16'], ['1846'], ['23-year'], ['the eight judicial district'], ['Yes'], ['no'], ['Amedeo Avogadro'], ['he dedicated himself to the study of physics and mathematics (then called positive philosophy), and'], ['1881'], ['He was born in the year of his birth.'], ['no'], ['A noble ancient family of Piedmont, Italy.'], ['No'], ['yes'], ['Yes'], ["Avogadro 's numb

In [27]:
# Prompt Style 3 Evaluation
naive_rag2.calculateEM(queries.answer)
f1_score = evaluator.compute_f1(naive_rag2.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")

EM Score: 17/100 = 0.1700
F1 Score: 0.2245609239974565


# Top 10 Retrieval with Embedding Sizes 384 and 512

In [28]:
# Top-10 Retrieval with Generation for 100 queries (Embedding size 384)
system_prompt = f"You are a concise and accurate assistant. Answer the question based on the provided context."
naive_rag.queries_list = []  # Reset queries_list before new searches
naive_rag.contexts_list = [] 

count = 0
for row in queries.question:
   naive_rag.search(row, 10, passage, system_prompt)
   count += 1
   if count == 100:
      break

print(naive_rag.queries_list)

[['yes'], ['yes'], ['no'], ['18 months'], ['1832'], ['United States Note'], ['Grace Bedell'], ['1789'], ['yes'], ['yes'], ['yes'], ['yes'], ['Springfield'], ['1861'], ['Abraham Lincoln'], ['yes'], ['Ambrose Burnside'], ['freed slaves in territories not under Union control'], ['yes'], ['yes'], ['Lincoln was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['18'], ['1846'], ['1834'], ['New Salem'], ['yes'], ['yes'], ['Amedeo Avogadro'], ['a liceo'], ['1841'], ['1833'], ['yes'], ['yes'], ['no'], ['yes'], ['No, he was a king.'], ["Avogadro 's number is commonly used to compute the results of chemical reactions"], ['Tesla stated in 1925 that::'], ['no'], ['He pressed for internal improvements and increased shipbuilding and foreign trade'], ['Anders Celsius'], ['a sailor'], ['When Avogadro announced'], ['no'], ['no'], ['yes'], ['yes'], ['yes'], ['Anders Celsius'], ['named after him'], ['The Celsius crater on the Moon is named after him.'], ['0'], ['

In [29]:
# Prompt Style 1 Evaluation
naive_rag.calculateEM(queries.answer)
f1_score = evaluator.compute_f1(naive_rag.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")

EM Score: 21/100 = 0.2100
F1 Score: 0.2262351346280139


In [30]:
# Prompt Style 2 - Persona Prompt
system_prompt = f"You are a concise history teacher."
naive_rag.queries_list = []  # Reset queries_list before new searches
naive_rag.contexts_list = [] 

# Top-3 Retrieval with Generation for 100 queries
count = 0
for row in queries.question:
   naive_rag.search(row, 10, passage, system_prompt)
   count += 1
   if count == 100:
      break

print(naive_rag.queries_list)

[['yes'], ['yes'], ['no'], ['18 months'], ['1832'], ['United States Note'], ['Grace Bedell'], ['1789'], ['yes'], ['yes'], ['yes'], ['yes'], ['Springfield'], ['1861'], ['Abraham Lincoln'], ['yes'], ['Ambrose Burnside'], ['freed slaves in territories not under Union control'], ['yes'], ['yes'], ['Lincoln was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['1816'], ['1846'], ['one of the most respected and successful lawyers in Illinois and grew steadily more prosperous.'], ['New Salem'], ['yes'], ['yes'], ['Amedeo Avogadro'], ['john d. scott'], ['1841'], ['1833'], ['yes'], ['yes'], ['no'], ['yes'], ['He was a king of the Italian scientist.'], ["Avogadro 's number is commonly used to compute the results of chemical reactions"], ['Tesla stated in 1925 that::'], ['no'], ['He also wanted to enhance and improve the commercial marine'], ['Anders Celsius'], ['a sailor'], ['1896'], ['no'], ['no'], ['yes'], ['yes'], ['yes'], ['Anders Celsius'], ['named

In [31]:
# Prompt Style 2 Evaluation
naive_rag.calculateEM(queries.answer)
f1_score = evaluator.compute_f1(naive_rag.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")

EM Score: 20/100 = 0.2000
F1 Score: 0.22693044432504045


In [32]:
# Prompt Style 3 - CoT
system_prompt = f"Read the question first. Then retrieve the context from the database. Finally, answer the question using the retrieved context."
naive_rag.queries_list = []  # Reset the queries list
naive_rag.contexts_list = [] 

# Top-3 Retrieval with Generation for 100 queries
count = 0
for row in queries.question:
   naive_rag.search(row, 10,passage, system_prompt)
   count += 1
   if count == 100:
      break

print(naive_rag.queries_list)

[['no'], ['Yes'], ['no'], ['18 months'], ['1832'], ['the United States Note'], ['Grace Bedell'], ['1789'], ['yes'], ['no'], ['yes'], ['He believed that this would attract steamboat traffic, which would allow the sparsely populated'], ['Springfield, Illinois'], ['In 1891 Lincoln was appointed President of the United States.'], ['Abraham Lincoln'], ['yes'], ['Ambrose Burnside'], ['freed slaves in territories not under Union control'], ['yes'], ['yes'], ['Lincoln was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['1816'], ['1846'], ['a reputation as a formidable adversary during cross-examinations and in his closing'], ['New Salem'], ['Yes'], ['Yes'], ['Amedeo Avogadro'], ['he dedicated himself to the study of physics and mathematics (then called positive philosophy), and'], ['1841'], ['1833'], ['yes'], ['A noble ancient family of Piedmont, Italy.'], ['No'], ['yes'], ['Yes'], ["Avogadro 's number is commonly used to compute the results of chem

In [33]:
# Prompt Style 3 Evaluation
naive_rag.calculateEM(queries.answer)
f1_score = evaluator.compute_f1(naive_rag.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")

EM Score: 19/100 = 0.1900
F1 Score: 0.22713924656784565


In [34]:
# Top-10 Retrieval for 100 queries (Embedding size 512)
# Prompt Style 1 - Instruction Prompt
system_prompt = f"You are a concise and accurate assistant. Answer the question based on the provided context."
naive_rag2.queries_list = []  # Reset queries_list before new searches
naive_rag2.contexts_list = [] 

# Top-3 Retrieval with Generation for all queries
count = 0
for row in queries.question:
   naive_rag2.search(row, 10, passage, system_prompt)
   count += 1
   if count == 100:
      break

print(naive_rag2.queries_list)

[['yes'], ['yes'], ['no'], ['18 months'], ['February 12, 1809 â April 15, 1865'], ['The legal tender act of 1862'], ['Grace Bedell'], ['1789'], ['yes'], ['yes'], ['no'], ['yes'], ['Perry County'], ['#He was the first President to coin an internationally recognized trademark, although not deliberately.'], ['Lincoln'], ['yes'], ['Michael Korda'], ['to preserve the Union'], ['yes'], ['yes'], ['Lincoln was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['16'], ['1846'], ['23-year'], ['the eight judicial district'], ['yes'], ['no'], ['Amedeo Avogadro'], ['a liceo'], ['1884'], ['During the Second World War'], ['no'], ['yes'], ['no'], ['yes'], ['No, he was a king.'], ["Avogadro 's number is commonly used to compute the results of chemical reactions"], ['Tesla stated in 1925 that::'], ['no'], ['The completion of the Aswan High Dam in 1971 and the resultant Lake Nasser have altered'], ['Anders Celsius'], ['a sailor'], ['1928'], ['no'], ['yes'], ['yes

In [35]:
# Prompt Style 1 Evaluation
naive_rag2.calculateEM(queries.answer)
f1_score = evaluator.compute_f1(naive_rag2.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")

EM Score: 18/100 = 0.1800
F1 Score: 0.22565301081787148


In [36]:
# Prompt Style 2 - Persona Prompt
system_prompt = f"You are a concise history teacher."

naive_rag2.queries_list = []  # Reset queries_list before new searches
naive_rag2.contexts_list = [] 

# Top-3 Retrieval with Generation for all queries
count = 0
for row in queries.question:
   naive_rag2.search(row, 10, passage, system_prompt)
   count += 1
   if count == 100:
      break

print(naive_rag2.queries_list)

[['yes'], ['yes'], ['no'], ['18 months'], ['February 12, 1809 â April 15, 1865'], ['the legal rights of the public'], ['Grace Bedell'], ['1789'], ['yes'], ['yes'], ['no'], ['in a dispute with a shareholder, James A. Barret'], ['Perry County'], ['#He was the first President to coin an internationally recognized trademark, although not deliberately.'], ['a sailor'], ['yes'], ['Michael Korda'], ['to weaken the rebellion by destroying the economic base of its leadership class'], ['yes'], ['yes'], ['Lincoln was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['sixteen'], ['1846'], ['23-year'], ['the eight judicial district'], ['yes'], ['no'], ['Amedeo Avogadro'], ['john d. scott'], ['1881'], ['he was born in 1891'], ['no'], ['yes'], ['no'], ['yes'], ['He was a king of the Italian scientist.'], ["Avogadro 's number is commonly used to compute the results of chemical reactions"], ['Tesla stated in 1925 that::'], ['no'], ['The completion of the Aswan

In [37]:
# Prompt Style 2 Evaluation
naive_rag2.calculateEM(queries.answer)
f1_score = evaluator.compute_f1(naive_rag2.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")

EM Score: 18/100 = 0.1800
F1 Score: 0.22425211033376669


In [38]:
# Prompt Style 3 - CoT
system_prompt = f"Read the question first. Then retrieve the context from the database. Finally, answer the question using the retrieved context."
naive_rag2.queries_list = []  # Reset the queries list
naive_rag2.contexts_list = [] 

# Top-3 Retrieval with Generation for all queries
count = 0
for row in queries.question:
   naive_rag2.search(row, 10, passage, system_prompt)
   count += 1
   if count == 100:
      break

print(naive_rag2.queries_list)

[['no'], ['Yes'], ['no'], ['18 months'], ['February 12, 1809 â April 15, 1865'], ['The Legal Tender Act of 1862'], ['Grace Bedell'], ['1789'], ['yes'], ['Yes'], ['no'], ['In one prominent 1851 case, he represented the Alton & Sangamon Railroad in'], ['Perry County'], ['#He was the first President to coin an internationally recognized trademark, although not deliberately.'], [':'], ['yes'], ['Michael Korda'], ['He made it clear that the North was fighting the war to preserve the Union, not to abolish slavery'], ['yes'], ['yes'], ['Lincoln was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['16'], ['1846'], ['23-year'], ['the eight judicial district'], ['Yes'], ['no'], ['Amedeo Avogadro'], ['he dedicated himself to the study of physics and mathematics (then called positive philosophy), and'], ['1881'], ['He was born in the year of his birth.'], ['no'], ['A noble ancient family of Piedmont, Italy.'], ['No'], ['yes'], ['Yes'], ["Avogadro 's numb

In [39]:
# Prompt Style 3 Evaluation
naive_rag2.calculateEM(queries.answer)
f1_score = evaluator.compute_f1(naive_rag2.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")

EM Score: 17/100 = 0.1700
F1 Score: 0.22249826594114522


# Advanced Evaluation using RAGAs

In [40]:
# Claude code helped provide the structure for the data and for the evaluation code below. See Appendix S.
data = {
    "question": queries.question[:100].tolist() ,                     # Question
    "answer": naive_rag.flatten_answer ,                       # Generated Answer
    "contexts": naive_rag.contexts_list ,                     # Context you pass in. You can just use top-1 here
    "reference": [truth for truth in queries.answer[:100].tolist()]                  # Reference Answer in the dataset (Human annotated)
}

# Convert dict to dataset
dataset = Dataset.from_dict(data)

In [41]:
# Pass the dataset above to the evaluate method in RAGAs
# Your code here
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
import os

load_dotenv()

gemini_llm = LangchainLLMWrapper(ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0))
gemini_embeddings = LangchainEmbeddingsWrapper(GoogleGenerativeAIEmbeddings(model="models/embedding-001"))



# Evaluate using Gemini
results = evaluate(
    dataset,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_recall,
        context_precision,
    ],
    llm=gemini_llm,
    embeddings=gemini_embeddings,
)

print("RAGAs Evaluation Results:")
print(results)



E0000 00:00:1759542404.521797  805789 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1759542404.551809  805789 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1759542404.556713  805789 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
  gemini_embeddings = LangchainEmbeddingsWrapper(GoogleGenerativeAIEmbeddings(model="models/embedding-001"))
Evaluating:   0%|          | 0/400 [00:00<?, ?it/s]E0000 00:00:1759542404.974720  805789 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
Exception raised in Job[1]: IndexError(list index out of range)
Exception raised in Job[9]: IndexError(list index out of range)
Evaluating:   0%|          | 1/400 [00:01<07:45,  1.17s/it]Exception raised in Job[5]: IndexError(list index out of range)
Exception raised in Job[13]: IndexError(list 

RAGAs Evaluation Results:
{'faithfulness': 0.4823, 'answer_relevancy': 0.7384, 'context_recall': 0.4100, 'context_precision': 0.3700}
