In [1]:
# Claude suggested I add in the system path to help find the py module. See Appendix K. 
import sys
sys.path.append('/Users/shoyou100/Visual-Code-Studio-workspace/assignment2-rag/src')
from langchain_google_genai import ChatGoogleGenerativeAI
import pandas as pd
from naive_rag import NaiveRAG  
from evaluation import Evaluation
from enhanced_rag import EnhancedRAG
from datasets import Dataset

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)


  from .autonotebook import tqdm as notebook_tqdm


# Read Passages from the Datasets and Drop rows if they are NA or empty

In [2]:
passages = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/passages.parquet/part.0.parquet")

print(passages.shape)
passages.head()

(3200, 1)


Unnamed: 0_level_0,passage
id,Unnamed: 1_level_1
0,"Uruguay (official full name in ; pron. , Eas..."
1,"It is bordered by Brazil to the north, by Arge..."
2,Montevideo was founded by the Spanish in the e...
3,The economy is largely based in agriculture (m...
4,"According to Transparency International, Urugu..."


In [3]:
queries = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/test.parquet/part.0.parquet")
print(queries.shape)
queries.head()

(918, 2)


Unnamed: 0_level_0,question,answer
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Was Abraham Lincoln the sixteenth President of...,yes
2,Did Lincoln sign the National Banking Act of 1...,yes
4,Did his mother die of pneumonia?,no
6,How many long was Lincoln's formal education?,18 months
8,When did Lincoln begin his political career?,1832


In [4]:
# Claude recommended I incorporate chunking using LangChain. See Appendix M.
advanced_rag = EnhancedRAG(200,25)

chunk_metadata = []  # Track which passage each chunk came from

for idx, passage_text in enumerate(passages['passage']):
    chunks = advanced_rag.splitTexts(passage_text)
    advanced_rag.addChunks(chunks)
    
    # Keep track of source passage for each chunk
    for chunk in chunks:
        chunk_metadata.append({
            'passage_id': passages['id'][idx] if 'id' in passages else idx,
            'chunk_text': chunk
        })

In [5]:
print(f"Original passages: {len(passages)}")
print(f"Total chunks: {advanced_rag.getChunkLength()}")

Original passages: 3200
Total chunks: 8476


In [6]:
naive_rag = NaiveRAG('all-MiniLM-L6-v2', 'google/flan-t5-small', 'rag_wikipedia_mini.db', 'rag_mini')

passage_embeddings = naive_rag.embedding_model.encode(advanced_rag.getChunks()) 
print(naive_rag.embedding_model.get_sentence_embedding_dimension())
query_embeddings = naive_rag.embedding_model.encode(queries['question'].tolist())
print(query_embeddings)
print(naive_rag.embedding_model.get_sentence_embedding_dimension())

Embedding model loaded.
Tokenizer loaded.
Schema created.
Seq2Seq model loaded.
Milvus client connected.
Dropping existing collection 'rag_mini'...
Collection dropped.
384
[[-0.00375673  0.03796374 -0.03182048 ... -0.04027529  0.00659006
   0.03112675]
 [-0.03231501 -0.0094521  -0.10944731 ... -0.01333902 -0.0162666
  -0.01005705]
 [ 0.00882224  0.0258874  -0.01894972 ... -0.03792767  0.07239034
   0.00896565]
 ...
 [-0.00906186 -0.04410971 -0.11035763 ... -0.03789582  0.03106767
   0.06000853]
 [ 0.02661895  0.05995833 -0.08694381 ...  0.0077695   0.01058571
  -0.01678987]
 [-0.00681225  0.05897506 -0.05983922 ... -0.01872636 -0.01558293
   0.01479083]]
384


# Create Milvus Client and Insert your Embeddings to your DB
- Make sure you define a schema for your collection (Points will be deducted if you fail to define a proper schema with ids, passage text, embedding)

In [7]:
# Claude recommended I convert the chunks to a dataframe and use that for database creation. See Appendix N.
chunks_df = pd.DataFrame(chunk_metadata)
chunks_df.head()

Unnamed: 0,passage_id,chunk_text
0,0,"Uruguay (official full name in ; pron. , Eas..."
1,0,1.7 million live in the capital Montevideo and...
2,1,"It is bordered by Brazil to the north, by Arge..."
3,1,Atlantic Ocean to the southeast. It is the sec...
4,2,Montevideo was founded by the Spanish in the e...


In [8]:
# Redefine id_ to ensure it is available
id_ = chunks_df.index.tolist()
passage = chunks_df['chunk_text'].tolist()
embedding = passage_embeddings.tolist()
naive_rag.create_dataBase(chunks_df, 'chunk_text', passage_embeddings, 384)

Columns defined.
Fields added to schema.
Collection created.
Data inserted successfully.
Index created successfully.
Collection loaded into memory


In [9]:
naive_rag.sanityCheck('rag_mini')

Entity count: 8476
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': '', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


# Steps to Fetch Results
- Read the Question Dataset
- Clean the Question Dataset if necessary (Drop Questions with NaN etc.)
- Convert Each Query to a Vector Embedding (Use the same embedding model you used to embed your document)
- Try for a Single Question First
- Load Collection into Memory after creating Index for Search on your embedding field (This is an essential step before you can search in your db)
- Search and Fetch Top N Results

**Develop your Prompt**

# Generate Responses for 100 queries in the Dataset.
# Top-1 Passage Selection

In [10]:
# Prompt Style 1 - Instruction Prompt
system_prompt = f"You are a concise and accurate assistant. Answer the question based on the provided context."
naive_rag.queries_list = []  # Reset queries_list before new searches
naive_rag.contexts_list = [] 
# Top-1 Retrieval with Generation for all queries
count = 0
for row in queries.question:
   naive_rag.search(row,3, passage, system_prompt, use_enhanced=True, enhancedRag=advanced_rag)
   count += 1
   if count == 100:
      break

print(naive_rag.queries_list)

[['yes'], ['yes'], ['no'], ['18 months'], ['1832'], ['United:'], ['Grace Bedell'], ['1789'], ['yes'], ['yes'], ['yes'], ['yes'], ['Hardin'], ['1861'], ['Abraham Lincoln'], ['yes'], ['John C. Pemberton'], ['Because Lincoln was a rebel.'], ['yes'], ['yes'], ['yes'], ['18'], ['1846'], ['23-year'], ["The Farmer's Almanac"], ['yes'], ['no'], ['Lorenzo Romano Amedeo Carlo Avogadro'], ['a priest'], ['1821'], ['twenty years'], ['yes'], ['yes'], ['yes'], ['yes'], ['King Victor Emmanuel III was there to pay homage to Avogadro.'], ['It allows chemists to determine the exact amounts of substances:'], ['The theory was never published'], ['no'], ['Mississippi, and severed the rail line to Vicksburg'], ['Carolus Linnaeus'], ['Vercelli'], ['1693'], ['yes'], ['no'], ['yes'], ['yes'], ['yes'], ['Anders Celsius'], ['named after him'], ['The Celsius crater on the Moon is named after him.'], ['No'], ['Celsius was born in Uppsala in Sweden.'], ['no'], ['He died in 1774'], ['the persecution of Port-Royal had

# Finding out the Basic QA Metrics (F1 score, EM score)

In [11]:
naive_rag.calculateEM(queries.answer)

EM Score: 25/100 = 0.2500


0.25

In [12]:
# F1 Score Calculation
evaluator = Evaluation()
f1_score = evaluator.compute_f1(naive_rag.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")

F1 Score: 0.31292990342990346


# Persona Styles 2 & 3 - Top-1

In [13]:
# Prompt Style 2 - Persona Prompt
system_prompt = f"You are a concise history teacher."

naive_rag.queries_list = []  # Reset the queries list
naive_rag.contexts_list = [] 

# Top-1 Retrieval with Generation for all queries
count = 0
for row in queries.question:
   naive_rag.search(row, 3, passage, system_prompt, use_enhanced=True, enhancedRag=advanced_rag)
   count += 1
   if count == 100:
      break

print(naive_rag.queries_list)


[['yes'], ['yes'], ['no'], ['18 months'], ['1832'], ['United:'], ['Grace Bedell'], ['1789'], ['yes'], ['yes'], ['yes'], ['yes'], ['Hardin'], ['1861'], ['Abraham Lincoln'], ['Lincoln was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['John C. Pemberton'], ['to protect rebels'], ['yes'], ['yes'], ['Lincoln was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['1816'], ['1846'], ['23-year'], ['the sands'], ['yes'], ['no'], ['Lorenzo Romano Amedeo Carlo Avogadro'], ['john d. scott'], ['1821'], ['twenty years'], ['yes'], ['yes'], ['yes'], ['yes'], ['King Victor Emmanuel III was there to pay homage to Avogadro.'], ['It allows chemists to determine the exact amounts of substances:'], ['The theory was never published'], ['no'], ['Mississippi, and severed the rail line to Vicksburg'], ['Carolus Linnaeus'], ['Vercelli'], ['1693'], ['no'], ['no'], ['yes'], ['yes'], ['yes'], ['Anders Celsius'], ['named after h

In [14]:
# Prompt Style 2 Evaluation
naive_rag.calculateEM(queries.answer)
f1_score = evaluator.compute_f1(naive_rag.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")


EM Score: 24/100 = 0.2400
F1 Score: 0.3050965700965701


In [15]:
# Prompt Style 3 - CoT
system_prompt = f"Read the question first. Then retrieve the context from the database. Finally, answer the question using the retrieved context."
naive_rag.queries_list = []  # Reset the queries list
naive_rag.contexts_list = [] 

# Top-1 Retrieval with Generation for all queries
count = 0
for row in queries.question:
   naive_rag.search(row, 3, passage, system_prompt, use_enhanced=True, enhancedRag=advanced_rag)
   count += 1
   if count == 100:
      break

print(naive_rag.queries_list)

[['yes'], ['yes'], ['no'], ['18 months'], ['1832'], ['provided government grants for agricultural universities in each state'], ['Grace Bedell'], ['1789'], ['yes'], ['no'], ['yes'], ['Yes'], ['Hardin'], ['In 1891 Lincoln was appointed President of the United States.'], ['Abraham Lincoln'], ['He was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['John C. Pemberton'], ['To protect rebels'], ['yes'], ['yes'], ['He was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['1816'], ['1846'], ['23-year'], ["Lincoln used a Farmers' Almanac in the northwestern part of the country"], ['Yes'], ['no'], ['Lorenzo Romano Amedeo Carlo Avogadro, Count'], ['ordained'], ['1821'], ['for another twenty years'], ['yes'], ['He is most noted for his contributions to the theory of molarity and molecular:'], ['Yes'], ['yes'], ['King Victor Emmanuel III was there to pay homage to Avogadro.'], ['It allows chemists to determine 

In [16]:
# Prompt Style 3 Evaluation
naive_rag.calculateEM(queries.answer)
f1_score = evaluator.compute_f1(naive_rag.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")

EM Score: 26/100 = 0.2600
F1 Score: 0.3081832141500334


# Top-10 Passage Selection

In [17]:
# Prompt Style 1 - Instruction Prompt
system_prompt = f"You are a concise and accurate assistant. Answer the question based on the provided context."
naive_rag.queries_list = []  # Reset queries_list before new searches
naive_rag.contexts_list = [] 
# Top-1 Retrieval with Generation for all queries
count = 0
for row in queries.question:
   naive_rag.search(row,10, passage, system_prompt, use_enhanced=True, enhancedRag=advanced_rag)
   count += 1
   if count == 100:
      break

print(naive_rag.queries_list)

[['yes'], ['yes'], ['yes'], ['18 months'], ['1832'], ['United'], ['Grace Bedell'], ['1851'], ['yes'], ['yes'], ['yes'], ['yes'], ['Hardin'], ['1861'], ['Abraham Lincoln'], ['yes'], ['John C. Pemberton'], ['Because Lincoln was a rebel.'], ['yes'], ['yes'], ['yes'], ['18'], ['1846'], ['23-year'], ["Lincoln's Almanac"], ['yes'], ['no'], ['Lorenzo Romano Amedeo Carlo Avogadro'], ['Abraham Lincoln'], ['1821'], ['twenty years'], ['yes'], ['yes'], ['no'], ['yes'], ['No'], ["Avogadro 's number is commonly used to compute the results of chemical reactions"], ['The theory was never published'], ['Yes'], ['Mississippi, and severed the rail line to Vicksburg'], ['Carolus Linnaeus'], ['Roosevelt'], ['1693'], ['yes'], ['no'], ['yes'], ['no'], ['yes'], ['Celsius'], ['Volta'], ['The observatory of Anders Celsius, from a contemporary engraving.'], ['yes'], ['No'], ['no'], ['Middle East'], ['the persecution of Port-Royal had ceased'], ['yes'], ['Yes'], ['yes'], ['head, the thorax, and the abdomen'], ['a

In [18]:
# Prompt Style 1 Evaluation
naive_rag.calculateEM(queries.answer)
f1_score = evaluator.compute_f1(naive_rag.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")

EM Score: 20/100 = 0.2000
F1 Score: 0.29570093042604484


In [19]:
# Prompt Style 2 - Persona Prompt
system_prompt = f"You are a concise history teacher."

naive_rag.queries_list = []  # Reset the queries list
naive_rag.contexts_list = [] 

# Top-1 Retrieval with Generation for all queries
count = 0
for row in queries.question:
   naive_rag.search(row, 10, passage, system_prompt, use_enhanced=True, enhancedRag=advanced_rag)
   count += 1
   if count == 100:
      break

print(naive_rag.queries_list)

# Prompt Style 2 Evaluation
naive_rag.calculateEM(queries.answer)
f1_score = evaluator.compute_f1(naive_rag.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")


[['yes'], ['yes'], ['yes'], ['18 months'], ['1832'], ['United'], ['Grace Bedell'], ['1851'], ['yes'], ['yes'], ['yes'], ['yes'], ['Hardin'], ['1861'], ['Abraham Lincoln'], ['Lincoln was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['John C. Pemberton'], ['to protect rebels'], ['yes'], ['yes'], ['Lincoln was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['1816'], ['1846'], ['23-year'], ["Lincoln's Almanac"], ['yes'], ['no'], ['Lorenzo Romano Amedeo Carlo Avogadro'], ['Abraham Lincoln'], ['1821'], ['twenty years'], ['yes'], ['no'], ['no'], ['yes'], ['Avogadro submitted his poem to a French journal.'], ["Avogadro 's number is commonly used to compute the results of chemical reactions"], ['The theory was never published'], ['yes'], ['Mississippi, and severed the rail line to Vicksburg'], ['Carolus Linnaeus'], ['Roosevelt'], ['1693'], ['no'], ['no'], ['yes'], ['no'], ['yes'], ['Celsius'], ['Volta'],

In [20]:
# Prompt Style 3 - CoT
system_prompt = f"Read the question first. Then retrieve the context from the database. Finally, answer the question using the retrieved context."
naive_rag.queries_list = []  # Reset the queries list
naive_rag.contexts_list = [] 

# Top-1 Retrieval with Generation for all queries
count = 0
for row in queries.question:
   naive_rag.search(row, 10, passage, system_prompt, use_enhanced=True, enhancedRag=advanced_rag)
   count += 1
   if count == 100:
      break

print(naive_rag.queries_list)

# Prompt Style 3 Evaluation
naive_rag.calculateEM(queries.answer)
f1_score = evaluator.compute_f1(naive_rag.flatten_answer, queries.answer.tolist())
print(f"F1 Score: {f1_score}")

[['yes'], ['yes'], ['yes'], ['18 months'], ['1832'], ['provided government grants for agricultural universities in each state'], ['Grace Bedell'], ['1851'], ['yes'], ['no'], ['yes'], ['Yes'], ['Hardin'], ['In 1891 Lincoln was appointed President of the United States.'], ['Abraham Lincoln'], ['He was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['John C. Pemberton'], ['To protect rebels'], ['yes'], ['yes'], ['He was eventually chosen as the Republican candidate for the 1860 election for several reasons.'], ['1816'], ['1846'], ['23-year'], ["Lincoln used a Farmers' Almanac in the northwestern part of the country"], ['Yes'], ['no'], ['Lorenzo Romano Amedeo Carlo Avogadro, Count'], ['Abraham Lincoln'], ['1821'], ['for another twenty years'], ['yes'], ['no'], ['no'], ['yes'], ['Avogadro submitted his poem to a French journal.'], ["Avogadro 's number is commonly used to compute the results of chemical reactions"], ['The theory was never publishe

# Advanced Evaluation using RAGAs

In [21]:
# Claude code helped provide the structure for the data and for the evaluation code below. See Appendix S.
data = {
    "question": queries.question[:100].tolist() ,                     # Question
    "answer": naive_rag.flatten_answer ,                       # Generated Answer
    "contexts": naive_rag.contexts_list ,                     # Context you pass in. You can just use top-1 here
    "reference": [truth for truth in queries.answer[:100].tolist()]                  # Reference Answer in the dataset (Human annotated)
}

# Convert dict to dataset
dataset = Dataset.from_dict(data)

In [22]:
# Pass the dataset above to the evaluate method in RAGAs
# Your code here
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
import os

load_dotenv()

gemini_llm = LangchainLLMWrapper(ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0))
gemini_embeddings = LangchainEmbeddingsWrapper(GoogleGenerativeAIEmbeddings(model="models/embedding-001"))



# Evaluate using Gemini
results = evaluate(
    dataset,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_recall,
        context_precision,
    ],
    llm=gemini_llm,
    embeddings=gemini_embeddings,
)

print("RAGAs Evaluation Results:")
print(results)



E0000 00:00:1759543245.259053  820528 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1759543245.273760  820528 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
  gemini_embeddings = LangchainEmbeddingsWrapper(GoogleGenerativeAIEmbeddings(model="models/embedding-001"))
E0000 00:00:1759543245.276608  820528 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
Evaluating:   0%|          | 0/400 [00:00<?, ?it/s]E0000 00:00:1759543245.631088  820528 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
Exception raised in Job[5]: IndexError(list index out of range)
Evaluating:   0%|          | 1/400 [00:03<22:38,  3.40s/it]Exception raised in Job[1]: IndexError(list index out of range)
Exception raised in Job[13]: IndexError(list index out of range)
Exception raised in Job[9]: IndexError(list 

RAGAs Evaluation Results:
{'faithfulness': 0.5483, 'answer_relevancy': 0.6813, 'context_recall': 0.5600, 'context_precision': 0.5100}
