In [1]:
# !pip install sentence_transformers
# !pip install "pymilvus[milvus_lite]"
# !pip install evaluate
# !pip install langchain_huggingface

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
from tqdm import tqdm
from pymilvus import MilvusClient, FieldSchema, CollectionSchema, DataType
import transformers, torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
import ast
import evaluate as hf_evaluate
from datasets import Dataset

from langchain_huggingface import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from transformers import pipeline

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

## Step1: Index

In [5]:
df = pd.read_csv("/Users/connie/Desktop/Fall 2025/LLM/Assignment2/data/processed/rag_mini_wiki.csv")

In [6]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode all passages into vector embeddings
embeddings = embedding_model.encode(
    df["passages"].tolist(),
    batch_size=64,
    convert_to_numpy=True
)

print("Embeddings shape:", embeddings.shape) 

Embeddings shape: (3200, 384)


### Create Milvus Client

In [8]:
# Initialize Milvus client
client = MilvusClient("rag_wikipedia_mini.db")

In [9]:
def rebuild_milvus_index(client, df, embedding_model, model_name_str):
    """
    Re-encodes data, rebuilds the Milvus collection, and creates the index
    for the given embedding model to support Step 4 experimentation.
    """
    
    print(f"Rebuilding Milvus Index for {model_name_str} ---")
    
    # 1. Encode all passages with the CURRENT embedding model
    embeddings = embedding_model.encode(df["passages"].tolist(), batch_size=64)
    vector_dim = embeddings.shape[1]
    
    # 2. Define the Schema using the NEW dimension
    id = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=False)
    passage = FieldSchema(name="passage", dtype=DataType.VARCHAR, max_length=3000) 
    embedding = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=vector_dim) 
    schema = CollectionSchema(fields=[id, passage, embedding], description="Experiment passages")

    # 3. Drop and Recreate Collection
    if client.has_collection("rag_mini"):
        client.drop_collection("rag_mini")
        
    client.create_collection(collection_name="rag_mini", schema=schema, consistency_level="Strong")
    
    # 4. Prepare and Insert New Data
    rag_data = [{"id": i, "passage": df.iloc[i]["passages"], "embedding": embeddings[i].tolist()} for i in range(len(df))]
    client.insert(collection_name="rag_mini", data=rag_data)
    
    # 5. Create Index
    index_params = MilvusClient.prepare_index_params()
    index_params.add_index(field_name="embedding", index_type="IVF_FLAT", metric_type="COSINE", params={"nlist": 128})
    client.create_index("rag_mini", index_params=index_params)
    
    # 6. Load Collection
    client.load_collection("rag_mini")
    
    # --- REQUIRED PRINTOUTS ---
    print("Entity count:", client.get_collection_stats("rag_mini")["row_count"])
    print("Collection schema:", client.describe_collection("rag_mini"))
    print("Milvus rebuilt and ready for search.")
    
    return embedding_model # Return the initialized model

In [10]:
EMBEDDING_MODELS = {
    "dim384": "all-MiniLM-L6-v2", 
    "dim768": "all-mpnet-base-v2"
}

In [11]:
BASE_MODEL_NAME = EMBEDDING_MODELS['dim384']
BASE_EMBEDDING_MODEL = SentenceTransformer(BASE_MODEL_NAME)

rebuild_milvus_index(client, df, BASE_EMBEDDING_MODEL, BASE_MODEL_NAME)

Rebuilding Milvus Index for all-MiniLM-L6-v2 ---
Entity count: 3200
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'Experiment passages', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 3000}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': False}
Milvus rebuilt and ready for search.


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

### Prepare QA Dataset

In [13]:
qa_df = pd.read_csv("/Users/connie/Desktop/Fall 2025/LLM/Assignment2/data/processed/rag_mini_wiki_qa.csv")

In [14]:
qa_df.columns

Index(['test'], dtype='object')

In [15]:
qa_df.head()

Unnamed: 0,test
0,{'question': 'Was Abraham Lincoln the sixteent...
1,{'question': 'Did Lincoln sign the National Ba...
2,{'question': 'Did his mother die of pneumonia?...
3,"{'question': ""How many long was Lincoln's form..."
4,{'question': 'When did Lincoln begin his polit...


In [16]:
qa_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   test    918 non-null    object
dtypes: object(1)
memory usage: 7.3+ KB


In [17]:
type(qa_df.iloc[0]["test"])

str

In [18]:
# Turn string back to dict type
qa_df["test"] = qa_df["test"].apply(ast.literal_eval)

qa_df["question"] = qa_df["test"].apply(lambda x: x["question"])
qa_df["answer"] = qa_df["test"].apply(lambda x: x["answer"])
qa_df = qa_df.drop(columns=["test"])

In [19]:
qa_df.head()

Unnamed: 0,question,answer
0,Was Abraham Lincoln the sixteenth President of...,yes
1,Did Lincoln sign the National Banking Act of 1...,yes
2,Did his mother die of pneumonia?,no
3,How many long was Lincoln's formal education?,18 months
4,When did Lincoln begin his political career?,1832


In [20]:
qa_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  918 non-null    object
 1   answer    918 non-null    object
dtypes: object(2)
memory usage: 14.5+ KB


In [21]:
# === Stratified sampling: 40 short / 40 medium / 40 long ===
qa_df["q_len"] = qa_df["question"].apply(lambda x: len(str(x).split()))
bins = np.percentile(qa_df["q_len"], [33, 66])
qa_df["q_bin"] = pd.cut(
    qa_df["q_len"],
    bins=[-1, bins[0], bins[1], float("inf")],
    labels=["short", "medium", "long"]
)

In [22]:
subset_df = qa_df.groupby("q_bin", group_keys=False).apply(
    lambda x: x.sample(min(40, len(x)), random_state=42)
).reset_index(drop=True)


  subset_df = qa_df.groupby("q_bin", group_keys=False).apply(
  subset_df = qa_df.groupby("q_bin", group_keys=False).apply(


In [23]:
print(f"\n--- Using {len(subset_df)} stratified queries for ALL columns ---")
print(subset_df["q_bin"].value_counts())


--- Using 120 stratified queries for ALL columns ---
q_bin
short     40
medium    40
long      40
Name: count, dtype: int64


In [24]:
# IMPORTANT: From here on, we ONLY use subset_df instead of qa_df
qa_df = subset_df  
qa_df.head()

Unnamed: 0,question,answer,q_len,q_bin
0,What did James Monroe make in 1817?,two long tours,7,short
1,Are Gray Wolves native to North America?,Yes,7,short
2,Is English the official language?,yes,5,short
3,How long is the elephant's gestation period?,22 months,7,short
4,Are diving ducks heavier tha dabbling ducks?,Yes,7,short


## STEP 2: RETRIEVAL

### Single Query Example

In [27]:
# Get the first question for a demonstration
query = qa_df.iloc[0]["question"]

# convert query into an embedding vector
query_embedding = embedding_model.encode(query)
print(query_embedding.shape)

(384,)


In [28]:
# Convert to list so it can be passed into Milvus
query_embedding_list = query_embedding.tolist()

In [29]:
# Search the database with the query embedding
output_ = client.search(
    collection_name="rag_mini",
    data=[query_embedding_list],
    anns_field="embedding",
    search_params={"metric_type": "COSINE", "params": {"nprobe": 10}},  # search params
    limit=3,   # retrieve Top-3 passages
    output_fields=["id", "passage"]
)

print(output_)

data: [[{'id': 2520, 'distance': 0.6385066509246826, 'entity': {'id': 2520, 'passage': '{\'passage\': "Monroe had racked up many debts during his years of public life. As a result, he was forced to sell off his Highland Plantation (now called Ash Lawn-Highland; it is owned by his alma mater, the College of William and Mary, which has opened it to the public). Throughout his life, he was not financially solvent, and his wife\'s poor health made matters worse.  For these reasons, he and his wife lived in Oak Hill until Elizabeth\'s death on September 23, 1830.", \'id\': 2521}'}}, {'id': 2506, 'distance': 0.6370433568954468, 'entity': {'id': 2506, 'passage': "{'passage': 'The Presidentâ\\x80\\x99s parents, father Spence Monroe (ca. 1727 1774), a woodworker and tobacco farmer, and mother Elizabeth Jones Monroe had significant land holdings but little money. Like his parents, he was a slaveholder. Born in Westmoreland County, Virginia, Monroe went to school at Campbelltown Academy and then 

## STEP 3: GENERATION

In [31]:
# Take the top-1 passage as the final context for the LLM
context = output_[0][0]['entity']['passage']

system_prompt = "You are a helpful assistant. Use the context to answer the question. Answer with yes or no, then explain briefly."

# Construct the RAG Prompt
prompt = f"""{system_prompt} \n Context: {context}: \n Question: {query} """
print(prompt)

You are a helpful assistant. Use the context to answer the question. Answer with yes or no, then explain briefly. 
 Context: {'passage': "Monroe had racked up many debts during his years of public life. As a result, he was forced to sell off his Highland Plantation (now called Ash Lawn-Highland; it is owned by his alma mater, the College of William and Mary, which has opened it to the public). Throughout his life, he was not financially solvent, and his wife's poor health made matters worse.  For these reasons, he and his wife lived in Oak Hill until Elizabeth's death on September 23, 1830.", 'id': 2521}: 
 Question: What did James Monroe make in 1817? 


### RAG Response for a Single Query

In [33]:
# Load the LLM Model
generator = pipeline("text2text-generation", model="google/flan-t5-base")

# Generate answer for a single query
single_output = generator(prompt, max_new_tokens=100, do_sample=False)

# Decode and extract answer
answer_single = single_output[0]["generated_text"]
print("Generated Answer:", answer_single)

Device set to use mps:0


Generated Answer: no


### BATCH RAG: Retrieve and Generate Responses for all Queries

In [35]:
# --- Prompting Strategies Definitions ---

# 1. Naive Prompt (Baseline, similar to your original, but simplified for clarity)
PROMPT_NAIVE = """
You are a helpful assistant. Use the context provided below to answer the question. Only use information found in the context.

Context: {context}
Question: {question}
"""

# 2. Chain-of-Thought (CoT) Prompt (Forces step-by-step reasoning)
# This format ensures the model provides reasoning before the final answer, which is often more accurate.
PROMPT_COT = """
You are an expert analytical assistant.
INSTRUCTIONS: First, critically analyze the context and the question. Generate a detailed, step-by-step reasoning process to determine the answer. 
Finally, provide the concise answer on a new line, labeled "Final Answer:". 
Your full response must include the reasoning steps.

Context: {context}
Question: {question}
"""

# 3. Persona Prompting (Assigns a specific role to influence tone/focus)
# Here, we ask the model to be a "Concise Historian" for factual RAG.
PROMPT_PERSONA = """
You are a historical expert known for concise and authoritative answers. 
Your primary goal is to provide a brief, factual answer based ONLY on the provided context. 
If the context does not contain the answer, state "Information not available in the context."

Context: {context}
Question: {question}
"""

In [36]:
# --- Experimental configuration ---
STRATEGIES_TO_TEST = {
    "naive": PROMPT_NAIVE,
    "cot": PROMPT_COT,
    "persona": PROMPT_PERSONA
}

RETRIEVAL_K_VALUES = [1, 3, 5] 

In [37]:
def run_embedding_experiment(embedding_model, embedding_model_name, qa_df, client, generator):
    """
    Runs the K x Prompting grid search for a single, already indexed embedding model.
    """
    print(f"\n--- Starting Experiment for Embedding Model: {embedding_model_name} ---")
    
    qa_df_copy = qa_df.copy() 
    questions_list = qa_df_copy["question"].tolist()
    contexts_storage = {}
    
    # --- Nested Loops: K x Prompting Strategy ---
    for K in RETRIEVAL_K_VALUES:
        for strategy_name, system_prompt_template in STRATEGIES_TO_TEST.items():
            
            print(f"\n-> Running K={K}, Strategy={strategy_name} with {embedding_model_name}")
            generated_answers = []
            
            for i in tqdm(range(len(qa_df_copy)), desc=f"K={K}, {strategy_name}"):
                q = qa_df_copy.iloc[i]["question"]
                
                # Retrieval Step: Embed the query with the current model
                # Uses the model object passed directly: embedding_model.encode(q)
                q_emb = embedding_model.encode(q).tolist() # CORRECT
                
                results = client.search(
                    collection_name="rag_mini",
                    data=[q_emb],
                    anns_field="embedding",
                    search_params={"metric_type": "COSINE", "params": {"nprobe": 10}}, 
                    limit=K,  # Use the K value from the outer loop
                    output_fields=["passage", "id"]
                )

                # Context Combination
                passages = [hit["entity"]["passage"] for hit in results[0]]
                ctx = "\n---\n".join(passages)
                
                # Store K=1 context for the RAGAS baseline (only store the baseline model's K=1 context)
                if K == 1 and embedding_model_name == EMBEDDING_MODELS['dim384']:
                    contexts_storage[i] = ctx
                
                # Generation
                prompt = system_prompt_template.format(context=ctx, question=q)
                out = generator(prompt, max_new_tokens=200, do_sample=False)
                ans = out[0]["generated_text"]
                generated_answers.append(ans)
            
            # Store results
            qa_df_copy[f"gen_{strategy_name}_k{K}_{embedding_model_name}"] = generated_answers
    
    return qa_df_copy, contexts_storage

In [38]:
final_experiment_results = pd.DataFrame()
final_contexts = None

for dim_key, model_name in EMBEDDING_MODELS.items():
    
    # 1. Initialize the specific embedding model
    current_embedding_model = SentenceTransformer(model_name)
    
    # 2. Rebuild the Milvus index for this specific model's vectors
    rebuild_milvus_index(client, df, current_embedding_model, model_name)
    
    # 3. Run the experiment using the model we just indexed with
    df_results, contexts_storage = run_embedding_experiment(
        current_embedding_model,        # Pass the initialized model
        model_name,                     # Pass the model name string
        qa_df, 
        client, 
        generator
    )
    
    # 4. Merge results (rest of the logic is correct)
    if final_experiment_results.empty:
        final_experiment_results = df_results
        final_contexts = contexts_storage
    else:
        for col in [c for c in df_results.columns if c.startswith('gen_')]:
            final_experiment_results[col] = df_results[col]

Rebuilding Milvus Index for all-MiniLM-L6-v2 ---
Entity count: 3200
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'Experiment passages', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 3000}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': False}
Milvus rebuilt and ready for search.

--- Starting Experiment for Embedding Model: all-MiniLM-L6-v2 ---

-> Running K=1, Strategy=naive with all-MiniLM-L6-v2


K=1, naive: 100%|█████████████████████████████| 120/120 [00:55<00:00,  2.16it/s]



-> Running K=1, Strategy=cot with all-MiniLM-L6-v2


K=1, cot: 100%|███████████████████████████████| 120/120 [01:57<00:00,  1.02it/s]



-> Running K=1, Strategy=persona with all-MiniLM-L6-v2


K=1, persona: 100%|███████████████████████████| 120/120 [00:47<00:00,  2.53it/s]



-> Running K=3, Strategy=naive with all-MiniLM-L6-v2


K=3, naive:   2%|▌                              | 2/120 [00:01<01:58,  1.00s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (538 > 512). Running this sequence through the model will result in indexing errors
K=3, naive: 100%|█████████████████████████████| 120/120 [01:22<00:00,  1.46it/s]



-> Running K=3, Strategy=cot with all-MiniLM-L6-v2


K=3, cot: 100%|███████████████████████████████| 120/120 [02:38<00:00,  1.32s/it]



-> Running K=3, Strategy=persona with all-MiniLM-L6-v2


K=3, persona: 100%|███████████████████████████| 120/120 [01:33<00:00,  1.28it/s]



-> Running K=5, Strategy=naive with all-MiniLM-L6-v2


K=5, naive: 100%|█████████████████████████████| 120/120 [02:11<00:00,  1.09s/it]



-> Running K=5, Strategy=cot with all-MiniLM-L6-v2


K=5, cot: 100%|███████████████████████████████| 120/120 [03:36<00:00,  1.80s/it]



-> Running K=5, Strategy=persona with all-MiniLM-L6-v2


K=5, persona: 100%|███████████████████████████| 120/120 [01:53<00:00,  1.06it/s]


Rebuilding Milvus Index for all-mpnet-base-v2 ---
Entity count: 3200
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'Experiment passages', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 3000}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 768}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': False}
Milvus rebuilt and ready for search.

--- Starting Experiment for Embedding Model: all-mpnet-base-v2 ---

-> Running K=1, Strategy=naive with all-mpnet-base-v2


K=1, naive: 100%|█████████████████████████████| 120/120 [01:20<00:00,  1.49it/s]



-> Running K=1, Strategy=cot with all-mpnet-base-v2


K=1, cot: 100%|███████████████████████████████| 120/120 [01:48<00:00,  1.11it/s]



-> Running K=1, Strategy=persona with all-mpnet-base-v2


K=1, persona: 100%|███████████████████████████| 120/120 [00:44<00:00,  2.72it/s]



-> Running K=3, Strategy=naive with all-mpnet-base-v2


K=3, naive: 100%|█████████████████████████████| 120/120 [01:37<00:00,  1.23it/s]



-> Running K=3, Strategy=cot with all-mpnet-base-v2


K=3, cot: 100%|███████████████████████████████| 120/120 [02:45<00:00,  1.38s/it]



-> Running K=3, Strategy=persona with all-mpnet-base-v2


K=3, persona: 100%|███████████████████████████| 120/120 [01:10<00:00,  1.69it/s]



-> Running K=5, Strategy=naive with all-mpnet-base-v2


K=5, naive: 100%|█████████████████████████████| 120/120 [01:46<00:00,  1.13it/s]



-> Running K=5, Strategy=cot with all-mpnet-base-v2


K=5, cot: 100%|███████████████████████████████| 120/120 [03:04<00:00,  1.53s/it]



-> Running K=5, Strategy=persona with all-mpnet-base-v2


K=5, persona: 100%|███████████████████████████| 120/120 [02:07<00:00,  1.06s/it]


### Final Data Preparation for Evaluation

In [71]:
final_experiment_results["contexts"] = [final_contexts[i] for i in range(len(qa_df))] 

# Save the full DataFrame containing ALL experimental results
os.makedirs("../data/evaluation", exist_ok=True)
final_experiment_results.to_csv("../data/evaluation/all_experiment_generations.csv", index=False)
print("\nBatch generation complete. All experimental results saved.")


Batch generation complete. All experimental results saved.
