In [1]:
import os
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from pymilvus import MilvusClient



In [2]:
# 1. Path settings
# INPUT_PATH  : CSV of experiment generations read by the next cell.
# OUTPUT_PATH : CSV written at the end, with the retrieved-contexts column added.
# DB_PATH     : local Milvus database file, resolved relative to the current
#               working directory.
INPUT_PATH  = "../data/evaluation/all_experiment_generations.csv"
OUTPUT_PATH = "../data/evaluation/all_experiment_generations_for_ragas.csv"
DB_PATH     = "rag_wikipedia_mini.db"

# Fail fast on every input this notebook reads, not just the CSV.
if not os.path.exists(INPUT_PATH):
    raise FileNotFoundError(f"Input file not found: {INPUT_PATH}")

# Opening a local Milvus file that does not exist creates a fresh, empty
# database instead of failing, so a wrong working directory would only surface
# later as a missing-collection error during retrieval. Catch it here instead.
if not os.path.exists(DB_PATH):
    raise FileNotFoundError(f"Milvus DB file not found: {os.path.abspath(DB_PATH)}")

print("Reading file:", INPUT_PATH)


Reading file: ../data/evaluation/all_experiment_generations.csv


In [3]:
# 2. Load CSV
# Read the experiment generations into `df`; later cells add a column to this
# frame and write it back out.
df = pd.read_csv(INPUT_PATH)

# Show only the first ten column names to keep the output short.
column_preview = list(df.columns[:10])
row_count = df.shape[0]

print("Original columns:", column_preview, "...")
print("Total rows:", row_count)

Original columns: ['question', 'answer', 'q_len', 'q_bin', 'gen_naive_k1_all-MiniLM-L6-v2', 'gen_cot_k1_all-MiniLM-L6-v2', 'gen_persona_k1_all-MiniLM-L6-v2', 'gen_naive_k3_all-MiniLM-L6-v2', 'gen_cot_k3_all-MiniLM-L6-v2', 'gen_persona_k3_all-MiniLM-L6-v2'] ...
Total rows: 120


In [4]:
# 3. Initialize Milvus + Embedding
# Bind the model name once so the loaded model and the log line cannot drift.
model_name = "all-mpnet-base-v2"
embedding_model = SentenceTransformer(model_name)

# Open the local Milvus database file configured in the path settings.
client = MilvusClient(DB_PATH)
db_location = os.path.abspath(DB_PATH)

print("Embedding model:", model_name)
print("Milvus DB:", db_location)

Embedding model: all-mpnet-base-v2
Milvus DB: /Users/connie/Desktop/Fall 2025/LLM/Assignment2/src/rag_wikipedia_mini.db


In [5]:
# 4. Retrieval: Persona + mpnet + k=5
def _retrieve_context(question):
    """Return the top-5 passages for one question as a single joined string.

    The question is embedded with `embedding_model`, the "rag_mini" collection
    is searched on its "embedding" field with the COSINE metric, and the
    "passage" field of each hit is joined with a "\n---\n" separator.
    """
    query_vector = embedding_model.encode(question).tolist()
    # One query vector is sent, so only the first result set is relevant.
    hits = client.search(
        collection_name="rag_mini",
        data=[query_vector],
        anns_field="embedding",
        search_params={"metric_type": "COSINE", "params": {"nprobe": 10}},
        limit=5,
        output_fields=["passage"],
    )[0]
    return "\n---\n".join(hit["entity"]["passage"] for hit in hits)


# Questions are cast to str before encoding; one context string is produced per
# row, in row order.
questions = df["question"].astype(str).tolist()
contexts_best = [
    _retrieve_context(question)
    for question in tqdm(questions, desc="Retrieving k=5 contexts")
]

print("Retrieval finished. Total contexts generated:", len(contexts_best))

I0000 00:00:1759174708.599419 11182255 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers
Retrieving k=5 contexts: 100%|████████████████| 120/120 [00:07<00:00, 15.89it/s]

Retrieval finished. Total contexts generated: 120





In [6]:
# 5. Add new column to DataFrame
# The column name records the configuration used for retrieval; the save cell
# reports it, so the name `col_name` is kept.
col_name = "contexts_persona_k5_all-mpnet-base-v2"

# One context string per row, aligned on the frame's own index.
context_column = pd.Series(contexts_best, index=df.index)
df[col_name] = context_column

In [13]:
# 6. Save file
# Resolve the destination once for the log line, then write the augmented frame
# without the index column.
saved_to = os.path.abspath(OUTPUT_PATH)
df.to_csv(OUTPUT_PATH, index=False)

print(f"\nFile saved to: {saved_to}")
print("New column added:", col_name)


File saved to: /Users/connie/Desktop/Fall 2025/LLM/Assignment2/data/evaluation/all_experiment_generations_for_ragas.csv
New column added: contexts_persona_k5_all-mpnet-base-v2
