# =========================================================
# Evaluation Script
# =========================================================
# Compares vector-only, keyword-only, and hybrid search performance
# using Recall@3 across a set of test queries.

# Author: Carlos Lao
# Refactored for clarity, documentation, and structured logging style
# =========================================================


# ---------------------------------------------------------
# Step 1: Import modules
# ---------------------------------------------------------

In [1]:
import pandas as pd
from app.config import logger
from app.retrieval import hybrid_search, keyword_search_fts5, semantic_search

# Print the path of the Python interpreter running this session, as a quick
# check of which environment is active.
import sys
print(sys.executable)

2025-08-25 01:21:15,471 [INFO] Loading faiss with AVX2 support.
2025-08-25 01:21:15,557 [INFO] Successfully loaded faiss with AVX2 support.
2025-08-25 01:23:08,133 [INFO] Use pytorch device_name: cuda:0
2025-08-25 01:23:08,134 [INFO] Load pretrained SentenceTransformer: all-MiniLM-L6-v2


/mnt/c/Projects/MLE Courses/CarlosLao-homework/vllm-env/bin/python


# ---------------------------------------------------------
# Step 2: Define evaluation queries and ground truth
# ---------------------------------------------------------

In [2]:
# Ground truth for the evaluation: each query string maps to the list of
# chunk ID(s) that a correct retrieval is expected to return.
# Every query below lists exactly one relevant chunk.
evaluation_queries = {
    "transformer architecture": ["doc123_chunk1"],
    "language modeling": ["doc123_chunk2"],
    "encoder-decoder": ["doc123_chunk3"],
    "attention mechanism": ["doc123_chunk1"],
    "sequence processing": ["doc123_chunk1"],
    "translation tasks": ["doc123_chunk3"],
    "summarization": ["doc123_chunk3"],
    "self-attention": ["doc123_chunk1"],
    "NLP applications": ["doc123_chunk2"],
    "deep learning": ["doc123_chunk2"]
}

# Record in the log that the ground-truth set has been defined
logger.info("Loaded evaluation queries.")

2025-08-25 01:23:50,109 [INFO] Loaded evaluation queries.


# ---------------------------------------------------------
# Step 3: Define Recall@k function
# ---------------------------------------------------------

In [3]:
def recall_at_k(results, relevant_ids, k=3):
    """
    Compute Recall@k for one ranked result list against its ground truth.

    Recall@k is the fraction of distinct relevant IDs that appear among the
    first ``k`` results.

    Parameters
    ----------
    results : list of tuple
        Ranked results; the first element of each entry is the chunk ID.
    relevant_ids : list of str
        Known relevant chunk IDs.
    k : int
        Number of top results to evaluate.

    Returns
    -------
    float
        Recall score between 0 and 1. Returns 0.0 when ``relevant_ids``
        is empty, since there is nothing to recall.
    """
    # Work on distinct IDs so that a chunk repeated in the ground truth or in
    # the result list is counted once; otherwise the score can leave [0, 1].
    relevant = set(relevant_ids)
    if not relevant:
        # Avoid dividing by zero when no ground truth is available
        return 0.0

    # Distinct chunk IDs found in the top-k results
    retrieved = {r[0] for r in results[:k]}

    # Share of the relevant IDs that were retrieved
    return len(relevant & retrieved) / len(relevant)

# ---------------------------------------------------------
# Step 4: Run evaluation loop
# ---------------------------------------------------------

In [4]:
# Columns holding one Recall@3 score per query, in the order the methods run
score_columns = ("Vector Recall@3", "Keyword Recall@3", "Hybrid Recall@3")

# Summary table: the query text followed by one empty score list per method
summary = {"Query": []}
for column in score_columns:
    summary[column] = []

# Score every query with all three retrieval methods
for query, relevant in evaluation_queries.items():
    logger.info(f"Evaluating query: '{query}'")

    # Retrieve with each method before recording anything for this query
    vec = semantic_search(query)
    kw = keyword_search_fts5(query)
    hyb = hybrid_search(query)

    # Record the query, then its recall under each method
    summary["Query"].append(query)
    for column, hits in zip(score_columns, (vec, kw, hyb)):
        summary[column].append(recall_at_k(hits, relevant))

2025-08-25 01:23:59,947 [INFO] Evaluating query: 'transformer architecture'
2025-08-25 01:23:59,949 [INFO] Semantic search initiated for query: 'transformer architecture'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)
2025-08-25 01:24:00,545 [INFO] Keyword search (FTS5) initiated for query: 'transformer architecture'
2025-08-25 01:24:00,569 [INFO] Hybrid search initiated for query: 'transformer architecture'
2025-08-25 01:24:00,571 [INFO] Semantic search initiated for query: 'transformer architecture'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-25 01:24:00,615 [INFO] Keyword search (FTS5) initiated for query: 'transformer architecture'
2025-08-25 01:24:00,633 [INFO] Hybrid search completed with 3 results.
2025-08-25 01:24:00,636 [INFO] Evaluating query: 'language modeling'
2025-08-25 01:24:00,638 [INFO] Semantic search initiated for query: 'language modeling'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-25 01:24:00,669 [INFO] Keyword search (FTS5) initiated for query: 'language modeling'
2025-08-25 01:24:00,686 [INFO] Hybrid search initiated for query: 'language modeling'
2025-08-25 01:24:00,687 [INFO] Semantic search initiated for query: 'language modeling'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-25 01:24:00,717 [INFO] Keyword search (FTS5) initiated for query: 'language modeling'
2025-08-25 01:24:00,738 [INFO] Hybrid search completed with 3 results.
2025-08-25 01:24:00,738 [INFO] Evaluating query: 'encoder-decoder'
2025-08-25 01:24:00,739 [INFO] Semantic search initiated for query: 'encoder-decoder'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-25 01:24:00,774 [INFO] Keyword search (FTS5) initiated for query: 'encoder-decoder'
2025-08-25 01:24:00,796 [INFO] Hybrid search initiated for query: 'encoder-decoder'
2025-08-25 01:24:00,797 [INFO] Semantic search initiated for query: 'encoder-decoder'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-25 01:24:00,824 [INFO] Keyword search (FTS5) initiated for query: 'encoder-decoder'
2025-08-25 01:24:00,843 [INFO] Hybrid search completed with 3 results.
2025-08-25 01:24:00,844 [INFO] Evaluating query: 'attention mechanism'
2025-08-25 01:24:00,845 [INFO] Semantic search initiated for query: 'attention mechanism'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-25 01:24:00,876 [INFO] Keyword search (FTS5) initiated for query: 'attention mechanism'
2025-08-25 01:24:00,890 [INFO] Hybrid search initiated for query: 'attention mechanism'
2025-08-25 01:24:00,891 [INFO] Semantic search initiated for query: 'attention mechanism'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-25 01:24:00,917 [INFO] Keyword search (FTS5) initiated for query: 'attention mechanism'
2025-08-25 01:24:00,933 [INFO] Hybrid search completed with 3 results.
2025-08-25 01:24:00,934 [INFO] Evaluating query: 'sequence processing'
2025-08-25 01:24:00,934 [INFO] Semantic search initiated for query: 'sequence processing'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-25 01:24:00,974 [INFO] Keyword search (FTS5) initiated for query: 'sequence processing'
2025-08-25 01:24:00,995 [INFO] Hybrid search initiated for query: 'sequence processing'
2025-08-25 01:24:00,996 [INFO] Semantic search initiated for query: 'sequence processing'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-25 01:24:01,022 [INFO] Keyword search (FTS5) initiated for query: 'sequence processing'
2025-08-25 01:24:01,041 [INFO] Hybrid search completed with 3 results.
2025-08-25 01:24:01,042 [INFO] Evaluating query: 'translation tasks'
2025-08-25 01:24:01,043 [INFO] Semantic search initiated for query: 'translation tasks'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-25 01:24:01,072 [INFO] Keyword search (FTS5) initiated for query: 'translation tasks'
2025-08-25 01:24:01,088 [INFO] Hybrid search initiated for query: 'translation tasks'
2025-08-25 01:24:01,089 [INFO] Semantic search initiated for query: 'translation tasks'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-25 01:24:01,117 [INFO] Keyword search (FTS5) initiated for query: 'translation tasks'
2025-08-25 01:24:01,133 [INFO] Hybrid search completed with 3 results.
2025-08-25 01:24:01,133 [INFO] Evaluating query: 'summarization'
2025-08-25 01:24:01,134 [INFO] Semantic search initiated for query: 'summarization'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-25 01:24:01,162 [INFO] Keyword search (FTS5) initiated for query: 'summarization'
2025-08-25 01:24:01,179 [INFO] Hybrid search initiated for query: 'summarization'
2025-08-25 01:24:01,179 [INFO] Semantic search initiated for query: 'summarization'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-25 01:24:01,209 [INFO] Keyword search (FTS5) initiated for query: 'summarization'
2025-08-25 01:24:01,237 [INFO] Hybrid search completed with 3 results.
2025-08-25 01:24:01,238 [INFO] Evaluating query: 'self-attention'
2025-08-25 01:24:01,240 [INFO] Semantic search initiated for query: 'self-attention'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-25 01:24:01,266 [INFO] Keyword search (FTS5) initiated for query: 'self-attention'
2025-08-25 01:24:01,287 [INFO] Hybrid search initiated for query: 'self-attention'
2025-08-25 01:24:01,288 [INFO] Semantic search initiated for query: 'self-attention'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-25 01:24:01,311 [INFO] Keyword search (FTS5) initiated for query: 'self-attention'
2025-08-25 01:24:01,330 [INFO] Hybrid search completed with 3 results.
2025-08-25 01:24:01,331 [INFO] Evaluating query: 'NLP applications'
2025-08-25 01:24:01,331 [INFO] Semantic search initiated for query: 'NLP applications'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-25 01:24:01,364 [INFO] Keyword search (FTS5) initiated for query: 'NLP applications'
2025-08-25 01:24:01,378 [INFO] Hybrid search initiated for query: 'NLP applications'
2025-08-25 01:24:01,379 [INFO] Semantic search initiated for query: 'NLP applications'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-25 01:24:01,409 [INFO] Keyword search (FTS5) initiated for query: 'NLP applications'
2025-08-25 01:24:01,428 [INFO] Hybrid search completed with 3 results.
2025-08-25 01:24:01,429 [INFO] Evaluating query: 'deep learning'
2025-08-25 01:24:01,429 [INFO] Semantic search initiated for query: 'deep learning'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-25 01:24:01,460 [INFO] Keyword search (FTS5) initiated for query: 'deep learning'
2025-08-25 01:24:01,484 [INFO] Hybrid search initiated for query: 'deep learning'
2025-08-25 01:24:01,485 [INFO] Semantic search initiated for query: 'deep learning'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-25 01:24:01,511 [INFO] Keyword search (FTS5) initiated for query: 'deep learning'
2025-08-25 01:24:01,529 [INFO] Hybrid search completed with 3 results.


# ---------------------------------------------------------
# Step 5: Display results
# ---------------------------------------------------------

In [9]:
# Build a DataFrame from the collected per-query scores
df = pd.DataFrame(summary)

# Score columns, used for best-method selection, formatting and highlighting
numeric_cols = ["Vector Recall@3", "Keyword Recall@3", "Hybrid Recall@3"]

# Name the column with the highest Recall@3 for each query.
# NOTE: on ties idxmax reports the first of the tied columns in this order.
df["Best Method"] = df[numeric_cols].idxmax(axis=1)

# Log that evaluation is complete
logger.info("Evaluation complete. Displaying results.")

# Render a styled table; fall back to plain printing if styling fails
try:
    # Two-decimal formatting for the score columns only
    formatter = dict.fromkeys(numeric_cols, "{:.2f}")
    formatted = df.style.format(formatter)
    # Highlight the best score within each row
    styled = formatted.highlight_max(subset=numeric_cols, axis=1)
    display(styled)
except Exception as e:
    logger.warning(f"Styling failed: {e}")
    print(df)

2025-08-25 01:28:45,514 [INFO] Evaluation complete. Displaying results.


Unnamed: 0,Query,Vector Recall@3,Keyword Recall@3,Hybrid Recall@3,Best Method
0,transformer architecture,1.0,0.0,1.0,Vector Recall@3
1,language modeling,1.0,1.0,1.0,Vector Recall@3
2,encoder-decoder,1.0,1.0,1.0,Vector Recall@3
3,attention mechanism,1.0,0.0,1.0,Vector Recall@3
4,sequence processing,1.0,0.0,1.0,Vector Recall@3
5,translation tasks,1.0,0.0,1.0,Vector Recall@3
6,summarization,1.0,1.0,1.0,Vector Recall@3
7,self-attention,1.0,1.0,1.0,Vector Recall@3
8,NLP applications,1.0,0.0,1.0,Vector Recall@3
9,deep learning,1.0,0.0,1.0,Vector Recall@3


# ---------------------------------------------------------
# Step 6: Print average scores
# ---------------------------------------------------------

In [10]:
# Print average recall scores across all queries
print("Average Recall@3:")
print("Vector-only:", round(df["Vector Recall@3"].mean(), 3))
print("Keyword-only:", round(df["Keyword Recall@3"].mean(), 3))
print("Hybrid:", round(df["Hybrid Recall@3"].mean(), 3))

Average Recall@3:
Vector-only: 1.0
Keyword-only: 0.4
Hybrid: 1.0
