In [3]:
# 🧼 Clean Start: Enron Governance RAG Recall Evaluation Notebook
# ----------------------------------
# This notebook walks through a clean, structured pipeline:
# - Chunking Enron emails
# - Embedding text
# - Indexing with FAISS
# - Evaluating recall against ground truth
# - Saving intermediate files for reuse

# ✅ Step 1: Setup & Imports
import os
import re
import json
import pickle
import numpy as np
from tqdm import tqdm

email_dir = "buy-r"
chunk_size = 512
chunk_overlap = 128
chunks = []
metadata = []

In [4]:
from google.colab import files
uploaded = files.upload()

Saving buy-r.zip to buy-r.zip
Saving ground-truth.json to ground-truth.json


In [5]:
import zipfile

with zipfile.ZipFile("buy-r.zip", 'r') as zip_ref:
    zip_ref.extractall("buy-r")

In [6]:

for root, _, files in os.walk(email_dir):
    for file in files:
        full_path = os.path.join(root, file)
        # Fix trailing dot issue
        filename = os.path.splitext(file)[0].rstrip(".")
        folder = os.path.relpath(root, start=".")
        folder_rel = os.path.relpath(root, start=email_dir)
        rel_path = f"{folder_rel}/{filename}"

        try:
            with open(full_path, 'r', errors='ignore') as f:
                content = f.read()
                body = re.split(r"\n\s*\n", content, maxsplit=1)
                if len(body) < 2:
                    continue
                body_text = body[1]

                for i in range(0, len(body_text), chunk_size):
                    chunk = body_text[i:i+chunk_size]
                    if chunk.strip():
                        chunks.append(chunk)
                        metadata.append({
                            "doc_id": rel_path,
                            "offset": i
                        })
        except Exception as e:
            print(f"Error in {rel_path}: {e}")

# Save for reuse
with open("chunks.pkl", "wb") as f:
    pickle.dump(chunks, f)
with open("metadata.pkl", "wb") as f:
    pickle.dump(metadata, f)

print(f"✅ Done. {len(chunks)} chunks, {len(metadata)} metadata entries.")


✅ Done. 6974 chunks, 6974 metadata entries.


In [7]:
with open("ground-truth.json", "r") as f:
    ground_truth_data = json.load(f)

ground_truth_relevant = {doc["doc_id"] for doc in ground_truth_data if doc["relevant"]}
ground_truth_nonrelevant = {doc["doc_id"] for doc in ground_truth_data if not doc["relevant"]}

print(f"🎯 Loaded {len(ground_truth_relevant)} relevant and {len(ground_truth_nonrelevant)} non-relevant ground truth entries.")

🎯 Loaded 27 relevant and 21 non-relevant ground truth entries.


In [8]:
# ✅ Sanity test #1
for m in metadata:
    if "1076" in m["doc_id"]:
        print(m["doc_id"])

buy-r/inbox/1076
buy-r/inbox/1076


In [9]:
# ✅ Sanity test #2
target = "buy-r/inbox/1076"
matching = [m for m in metadata if m["doc_id"] == target]
print(matching[0] if matching else "❌ Not found")

{'doc_id': 'buy-r/inbox/1076', 'offset': 0}


In [10]:
# ✅ Load model and embed all chunks
!pip install -q sentence-transformers faiss-cpu

from sentence_transformers import SentenceTransformer
import faiss

model = SentenceTransformer("BAAI/bge-base-en-v1.5") # QA-tuned model

print("🔄 Embedding all chunks...")
embeddings = model.encode(chunks, show_progress_bar=True, normalize_embeddings=True)

print("✅ Embedding complete. Shape:", embeddings.shape)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m98.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔄 Embedding all chunks...


Batches:   0%|          | 0/218 [00:00<?, ?it/s]

✅ Embedding complete. Shape: (6974, 768)


In [11]:
# Download a local version of the embeddings file, 23.3 MB

import numpy as np
import json
import zipfile
from google.colab import files

# Step 1: Save embeddings to a .npy file
np.save("embeddings.npy", embeddings)

# Step 2: Save your text chunks to a .jsonl file
with open("chunks.jsonl", "w") as f:
    for chunk in chunks:
        f.write(json.dumps({"text": chunk}) + "\n")

# Step 3: Zip the files
with zipfile.ZipFile("embeddings_bundle.zip", "w") as zipf:
    zipf.write("embeddings.npy")
    zipf.write("chunks.jsonl")

# Step 4: Trigger download to your local machine
files.download("embeddings_bundle.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
# ✅ Build FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

print(f"📚 FAISS index built with {index.ntotal} vectors.")

📚 FAISS index built with 6974 vectors.


In [13]:
# Optional: sample query method to test and measure recall prior to reranking
query = "What concerns were raised internally at Enron about LJM, Raptor transactions, or failures to disclose risk?"

top_k = 100

query_embedding = model.encode([query])
distances, indices = index.search(query_embedding, k=20)

retrieved_doc_ids = set()
for idx in indices[0]:
    doc_id = metadata[idx]["doc_id"]
    retrieved_doc_ids.add(doc_id)

# Re-run the eval
true_positives = retrieved_doc_ids & ground_truth_relevant
false_positives = retrieved_doc_ids & ground_truth_nonrelevant
false_negatives = ground_truth_relevant - retrieved_doc_ids

recall = len(true_positives) / len(ground_truth_relevant) if ground_truth_relevant else 0
precision = len(true_positives) / len(retrieved_doc_ids) if retrieved_doc_ids else 0
f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

print("--- Evaluation ---")
print(f"✅ Recall@{top_k}: {recall:.2f}")
print(f"✅ Precision@{top_k}: {precision:.2f}")
print(f"✅ F1 Score: {f1:.2f}")
print(f"🎯 Relevant documents retrieved: {sorted(true_positives)}")
print(f"🚫 Non-relevant documents retrieved: {sorted(false_positives)}")
print(f"📉 Missed relevant documents: {sorted(false_negatives)}")

--- Evaluation ---
✅ Recall@100: 0.15
✅ Precision@100: 0.20
✅ F1 Score: 0.17
🎯 Relevant documents retrieved: ['buy-r/inbox/1076', 'buy-r/inbox/1117', 'buy-r/inbox/431', 'buy-r/inbox/717']
🚫 Non-relevant documents retrieved: []
📉 Missed relevant documents: ['buy-r/discussion_threads/10', 'buy-r/discussion_threads/122', 'buy-r/inbox/1068', 'buy-r/inbox/1070', 'buy-r/inbox/1071', 'buy-r/inbox/1089', 'buy-r/inbox/1103', 'buy-r/inbox/1136', 'buy-r/inbox/1138', 'buy-r/inbox/142', 'buy-r/inbox/276', 'buy-r/inbox/570', 'buy-r/inbox/677', 'buy-r/inbox/814', 'buy-r/inbox/822', 'buy-r/sent/177', 'buy-r/sent_items/227', 'buy-r/sent_items/236', 'buy-r/sent_items/276', 'buy-r/sent_items/288', 'buy-r/sent_items/322', 'buy-r/sent_items/343', 'buy-r/sent_items/372']


In [14]:
# ✅ Step 8: Rerank FAISS top-k results using BGE-Reranker
!pip install -q transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

reranker_tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-large")
reranker_model = AutoModelForSequenceClassification.from_pretrained("BAAI/bge-reranker-large")
reranker_model.eval()
if torch.cuda.is_available():
    reranker_model.cuda()
    print("🚀 Reranker using GPU")
else:
    print("⚠️ Reranker using CPU")


tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

⚠️ Reranker using CPU


In [None]:
# ✅ Optional: GPT-based Reranker
import openai
from openai import OpenAI
client = OpenAI(api_key=OPENAI-TOKEN)

def gpt_rerank(query, candidates, max_chunks=20):
    results = []
    for i, (chunk, doc_id) in enumerate(candidates[:max_chunks]):
        prompt = f"""
Question: {query}

Context:
{chunk}

Does this context directly help answer the question above? Respond only with Yes or No.
"""
        try:
            response = client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
                max_tokens=10,
            )
            answer = response.choices[0].message.content.strip().lower()
            if "yes" in answer:
                results.append((doc_id, chunk))
        except Exception as e:
            print(f"⚠️ GPT error on chunk {i} ({doc_id}): {e}")
    return results

def rerank_with_gpt(query, top_k_retrieve=100):
    print(f"🔍 GPT final reranking for query: '{query}'")
    query_embedding = model.encode([query], normalize_embeddings=True)
    _, indices = index.search(np.array(query_embedding).astype("float32"), k=top_k_retrieve)

    candidates = [(chunks[i], metadata[i]["doc_id"]) for i in indices[0]]
    gpt_results = gpt_rerank(query, candidates)

    retrieved_set = {doc_id for doc_id, _ in gpt_results}
    true_positives = retrieved_set & ground_truth_relevant
    false_positives = retrieved_set & ground_truth_nonrelevant
    false_negatives = ground_truth_relevant - retrieved_set

    recall = len(true_positives) / len(ground_truth_relevant) if ground_truth_relevant else 0
    precision = len(true_positives) / len(retrieved_set) if retrieved_set else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print("--- GPT RERANKED Evaluation ---")
    print(f"📈 Recall: {recall:.2f}")
    print(f"✅ Precision: {precision:.2f}")
    print(f"🎯 F1 Score: {f1:.2f}")
    print(f"✔️ Relevant documents retrieved: {sorted(true_positives)}")
    print(f"🚫 Non-relevant documents retrieved: {sorted(false_positives)}")
    print(f"📉 Missed relevant documents: {sorted(false_negatives)}")

def rerank_and_evaluate(query, top_k_retrieve=100, top_k_rerank=20):
    print(f"🔍 Reranking for query: '{query}'")
    query_embedding = model.encode([query], normalize_embeddings=True)
    _, indices = index.search(np.array(query_embedding).astype("float32"), k=top_k_retrieve)

    candidate_pairs = []
    candidate_ids = []
    for i in indices[0]:
        candidate_pairs.append((query, chunks[i]))
        candidate_ids.append(metadata[i]["doc_id"])

    batch_size = 32
    scores = []
    for start in range(0, len(candidate_pairs), batch_size):
        end = start + batch_size
        batch_pairs = candidate_pairs[start:end]
        reranker_inputs = reranker_tokenizer.batch_encode_plus(
            batch_pairs, padding=True, truncation=True, max_length=512, return_tensors="pt"
        )
        if torch.cuda.is_available():
            reranker_inputs = {k: v.cuda() for k, v in reranker_inputs.items()}
        with torch.no_grad():
            batch_scores = reranker_model(**reranker_inputs).logits.squeeze(-1)
        scores.extend(batch_scores.detach().cpu().tolist())

    reranked = sorted(zip(candidate_ids, scores), key=lambda x: x[1], reverse=True)
    reranked_doc_ids = [doc_id for doc_id, _ in reranked[:top_k_rerank]]

    retrieved_set = set(reranked_doc_ids)
    true_positives = retrieved_set & ground_truth_relevant
    false_positives = retrieved_set & ground_truth_nonrelevant
    false_negatives = ground_truth_relevant - retrieved_set

    recall = len(true_positives) / len(ground_truth_relevant) if ground_truth_relevant else 0
    precision = len(true_positives) / len(retrieved_set) if retrieved_set else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print("--- RERANKED Evaluation ---")
    print(f"📈 Recall@{top_k_rerank}: {recall:.2f}")
    print(f"✅ Precision@{top_k_rerank}: {precision:.2f}")
    print(f"🎯 F1 Score: {f1:.2f}")
    print(f"✔️ Relevant documents retrieved: {sorted(true_positives)}")
    print(f"🚫 Non-relevant documents retrieved: {sorted(false_positives)}")
    print(f"📉 Missed relevant documents: {sorted(false_negatives)}")
