# MS MARCO v2.1 + Redis Vector Search

Query examples against the ingested MS MARCO passages:
- **Vector Search** - Semantic similarity using embeddings
- **Full-Text Search** - Traditional keyword search
- **Hybrid Search** - Vector search combined with text or tag filters

**⚠️ Choose your embedding mode below based on how you ingested the data.**

In [1]:
import os
from datasets import load_dataset
from redisvl.index import SearchIndex
from redisvl.query import VectorQuery, FilterQuery
from redisvl.query.filter import Tag, Text
from redisvl.schema import IndexSchema
import redis
import numpy as np

REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379")

  from .autonotebook import tqdm as notebook_tqdm


---
## ⚙️ Configuration: Choose Embedding Mode

**Run ONE of the two cells below** based on how you ingested the data:
- `--use-cohere` → Run the Cohere cell
- `--use-hf` → Run the HuggingFace cell

In [2]:
# ============================================================
# OPTION A: COHERE EMBEDDINGS (if you used --use-cohere)
# ============================================================

# Mode marker for this session: on the Cohere path, query embeddings are read
# from the dataset rather than computed locally.
EMBEDDING_MODE = "cohere"
# Vector width; later used as the "dims" attribute of the index schema.
VECTOR_DIMS = 1024

# Load queries with pre-computed Cohere embeddings
queries_ds = load_dataset("Cohere/msmarco-v2.1-embed-english-v3", "queries")
# Keep only the first 20 rows of the "test" split as the working sample.
sample_queries = queries_ds['test'].select(range(20))

def get_query_embedding(query_idx: int) -> list:
    """Return the ``emb`` value stored with the sample query at ``query_idx``."""
    record = sample_queries[query_idx]
    return record['emb']

def get_query_text(query_idx: int) -> str:
    """Return the ``text`` field of the sample query stored at ``query_idx``."""
    record = sample_queries[query_idx]
    return record['text']

# Report the active mode and sample size, then preview the first five queries.
print(
    f"✅ Using COHERE embeddings ({VECTOR_DIMS}-dim)",
    f"   Loaded {len(sample_queries)} sample queries",
    "\nSample queries:",
    sep="\n",
)
for i in range(5):
    print(f"   [{i}] {get_query_text(i)}")

✅ Using COHERE embeddings (1024-dim)
   Loaded 20 sample queries

Sample queries:
   [0] what is produced by muscle
   [1] who recorded be my baby
   [2] who said no one can make you feel inferior
   [3] what is ptf
   [4] At about what age do adults normally begin to lose bone mass?


In [None]:
# ============================================================
# OPTION B: HUGGINGFACE EMBEDDINGS (if you used --use-hf)
# ============================================================

# Mode marker for this session: on the HuggingFace path, query embeddings are
# computed locally from the query text.
EMBEDDING_MODE = "huggingface"
HF_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Must match ingest model

from sentence_transformers import SentenceTransformer
# Instantiate the encoder once; it is reused for every query embedding.
hf_model = SentenceTransformer(HF_MODEL)
# Take the vector width from the model itself instead of hard-coding it.
VECTOR_DIMS = hf_model.get_sentence_embedding_dimension()

# Load queries (text only, we'll embed them)
queries_ds = load_dataset("Cohere/msmarco-v2.1-embed-english-v3", "queries")
# Keep only the first 20 rows of the "test" split as the working sample.
sample_queries = queries_ds['test'].select(range(20))

# Memo of query index -> embedding (stored as a plain list); filled lazily,
# the first time each index is requested.
_query_embeddings_cache = {}

def get_query_embedding(query_idx: int) -> list:
    """Return the embedding of the sample query at ``query_idx``.

    The vector is produced by ``hf_model.encode`` on the query text the first
    time an index is requested, converted with ``tolist()``, and afterwards
    served from ``_query_embeddings_cache``.
    """
    try:
        return _query_embeddings_cache[query_idx]
    except KeyError:
        pass
    query_text = sample_queries[query_idx]['text']
    vector = hf_model.encode(query_text).tolist()
    _query_embeddings_cache[query_idx] = vector
    return vector

def get_query_text(query_idx: int) -> str:
    """Return the ``text`` field of the sample query stored at ``query_idx``."""
    record = sample_queries[query_idx]
    return record['text']

# Report the active mode, model and sample size, then preview five queries.
print(
    f"✅ Using HUGGINGFACE embeddings ({VECTOR_DIMS}-dim)",
    f"   Model: {HF_MODEL}",
    f"   Loaded {len(sample_queries)} sample queries",
    "\nSample queries:",
    sep="\n",
)
for i in range(5):
    print(f"   [{i}] {get_query_text(i)}")

---
## Connect to Redis Index

In [3]:
# Schema must match what was used during ingestion
schema = {
    "index": {
        "name": "msmarco",
        "prefix": "doc",
        "storage_type": "hash",
    },
    "fields": [
        {"name": "docid", "type": "tag", "attrs": {"sortable": True}},
        {"name": "title", "type": "text"},
        {"name": "segment", "type": "text"},
        {"name": "url", "type": "tag", "attrs": {"sortable": True}},
        {
            "name": "text_embedding",
            "type": "vector",
            "attrs": {
                "algorithm": "hnsw",
                # Set by whichever embedding-mode cell was run above.
                "dims": VECTOR_DIMS,
                "distance_metric": "cosine",
                "datatype": "float32",
            },
        },
    ],
}

redis_client = redis.from_url(REDIS_URL)
index = SearchIndex(IndexSchema.from_dict(schema), redis_client)

# This notebook only queries an existing index. Fail early with an actionable
# message rather than letting index.info() surface a raw Redis error.
if not index.exists():
    raise RuntimeError(
        f"Redis index '{schema['index']['name']}' does not exist. "
        "Run the ingestion step first and check REDIS_URL."
    )

info = index.info()
print(f"Connected to index: {schema['index']['name']}")
print(f"Documents indexed: {info.get('num_docs', 'N/A')}")
print(f"Vector dimensions: {VECTOR_DIMS}")

Connected to index: msmarco
Documents indexed: 100000
Vector dimensions: 1024


---
## 1. Vector Search (Semantic)

Find passages with similar meaning using embeddings.

In [4]:
def vector_search(query_embedding: list, num_results: int = 5):
    """Query the ``text_embedding`` field with ``query_embedding``.

    Args:
        query_embedding: Query vector handed to ``VectorQuery``.
        num_results: Number of results requested from the query.

    Returns:
        Whatever ``index.query`` returns for the constructed ``VectorQuery``;
        each hit is asked to carry ``docid``, ``title``, ``segment`` and ``url``.
    """
    wanted_fields = ["docid", "title", "segment", "url"]
    similarity_query = VectorQuery(
        vector=query_embedding,
        vector_field_name="text_embedding",
        return_fields=wanted_fields,
        num_results=num_results,
    )
    hits = index.query(similarity_query)
    return hits

def print_results(results, query_text: str = None):
    """Pretty print vector search results.

    Args:
        results: Sequence of dict-like hits. The keys ``title``, ``segment``,
            ``url`` and ``vector_distance`` are read; any of them may be
            missing.
        query_text: Optional query string echoed above the results.

    Each hit is printed with a score of ``1 - vector_distance`` (the original
    notebook treats this as cosine similarity), its title (first 80
    characters), a single-line snippet of the segment (first 150 characters)
    and its URL.
    """
    if query_text:
        print(f"Query: {query_text}\n")
    print("=" * 70)

    if not results:
        print("No results found.")
        return

    for rank, hit in enumerate(results, 1):
        distance = hit.get('vector_distance')
        if distance is None:
            # A missing distance must not be shown as a perfect 1.000 match.
            print(f"\n[{rank}] Score: n/a")
        else:
            print(f"\n[{rank}] Score: {1 - float(distance):.3f}")

        # Collapse newlines/runs of whitespace so a multi-line segment cannot
        # break the indented layout.
        flat_segment = " ".join(str(hit.get('segment', '')).split())
        snippet = flat_segment[:150]
        if len(flat_segment) > 150:
            # Only signal truncation when something was actually cut off.
            snippet += "..."

        print(f"    Title: {str(hit.get('title', ''))[:80]}")
        print(f"    Segment: {snippet}")
        print(f"    URL: {hit.get('url', '')}")

In [5]:
# Vector search with sample query 0, using vector_search's default result count
query_idx = 0
results = vector_search(get_query_embedding(query_idx))
print_results(results, get_query_text(query_idx))

Query: what is produced by muscle


[1] Score: 0.545
    Title: AnPhystemplate
    Segment: important in producing graded muscle force. Multi-unit muscle - A smooth muscle in which individual muscle fibers contract only when they receive exci...
    URL: http://academics.smcvt.edu/dfacey/animalphysiology/Muscles/AnPhystemplate.htm

[2] Score: 0.531
    Title: AnPhystemplate
    Segment: Single-unit muscle - A smooth muscle in which individual fibers are coupled through gap junctions, allowing excitation to spread through the muscle in...
    URL: http://academics.smcvt.edu/dfacey/animalphysiology/Muscles/AnPhystemplate.htm

[3] Score: 0.529
    Title: AnPhystemplate
    Segment: Intercalated disk - The junctional region between two connected cardiac muscle cells. Kreb’s Cycle- The metabolic cycle responsible for the complete o...
    URL: http://academics.smcvt.edu/dfacey/animalphysiology/Muscles/AnPhystemplate.htm

[4] Score: 0.508
    Title: AnPhystemplate
    Segment: This pattern c

In [7]:
# Try another query: sample query 3, again with the default result count
query_idx = 3
results = vector_search(get_query_embedding(query_idx))
print_results(results, get_query_text(query_idx))

Query: what is ptf


[1] Score: 0.333
    Title: Professional Teaching Knowledge (PTK) Study Plan - American Board | ABCTE
    Segment: http://www.amazon.com/exec/obidos/ASIN/0764524798/ref=nosim/americanboard-20
Your Essay – Start...
    URL: http://abcte.org/professional-teaching-knowledge-study-plan/

[2] Score: 0.310
    Title: Professional Teaching Knowledge (PTK) Study Plan - American Board | ABCTE
    Segment: PTK Study Areas Broken Down: Instructional Delivery
Communicating effectively
Presents clear and focused instruction
Effective questioning techniques
...
    URL: http://abcte.org/professional-teaching-knowledge-study-plan/

[3] Score: 0.301
    Title: Professional Teaching Knowledge (PTK) Study Plan - American Board | ABCTE
    Segment: http://www.amazon.com/exec/obidos/ASIN/0675210046/ref%3Dnosim/americanboard-20
Effective teaching methods: Research based practice: http://www.amazon....
    URL: http://abcte.org/professional-teaching-knowledge-study-plan/

[4] Score: 0.3

In [None]:
# Try more queries: sample queries 5, 7 and 10, three hits each,
# with a '#' banner printed before every result set
for idx in [5, 7, 10]:
    print(f"\n{'#' * 70}\n")
    results = vector_search(get_query_embedding(idx), num_results=3)
    print_results(results, get_query_text(idx))

---
## 2. Full-Text Search (Keyword)

Traditional keyword search using Redis's full-text search capabilities.

In [8]:
def fulltext_search(query_text: str, num_results: int = 5):
    """Run a filter-only query that matches ``query_text`` on ``segment``.

    Args:
        query_text: Text pattern applied to the ``segment`` field via
            ``Text("segment") % query_text``.
        num_results: Number of results requested from the query.

    Returns:
        Whatever ``index.query`` returns for the constructed ``FilterQuery``;
        each hit is asked to carry ``docid``, ``title``, ``segment`` and ``url``.
    """
    wanted_fields = ["docid", "title", "segment", "url"]
    keyword_query = FilterQuery(
        filter_expression=Text("segment") % query_text,
        return_fields=wanted_fields,
        num_results=num_results,
    )
    hits = index.query(keyword_query)
    return hits

def print_fulltext_results(results, query_text: str):
    """Pretty print full-text search results.

    Args:
        results: Sequence of dict-like hits. The keys ``title``, ``segment``
            and ``url`` are read; any of them may be missing.
        query_text: The search string, echoed in the header line.

    Each hit is printed with its title (first 80 characters), a single-line
    snippet of the segment (first 150 characters) and its URL.
    """
    print(f"Full-text search: '{query_text}'\n")
    print("=" * 70)

    if not results:
        print("No results found.")
        return

    for rank, hit in enumerate(results, 1):
        # Collapse newlines/runs of whitespace so a multi-line segment cannot
        # break the indented layout.
        flat_segment = " ".join(str(hit.get('segment', '')).split())
        snippet = flat_segment[:150]
        if len(flat_segment) > 150:
            # Only signal truncation when something was actually cut off.
            snippet += "..."

        print(f"\n[{rank}] {str(hit.get('title', ''))[:80]}")
        print(f"    Segment: {snippet}")
        print(f"    URL: {hit.get('url', '')}")

In [9]:
# Full-text search example: match "muscle protein" against the segment field
search_text = "muscle protein"
results = fulltext_search(search_text)
print_fulltext_results(results, search_text)

Full-text search: 'muscle protein'


[1] Top 10 Ways to Lose 20 Pounds - ABC News
    Segment: Milk and other dairy products can help dieters slim down and beef up, say Canadian researchers. Their study found that heavy people who exercised ever...
    URL: http://abcnews.go.com/Health/Wellness/top-10-ways-lose-20-pounds/story?id=18181846

[2] Protein Rich Diet for Weight Gain, What Foods to Eat to Gain Weight and Build Mu
    Segment: High Protein Foods for Muscle Building? Many fellas believe high protein foods are just for muscle building (Your love for biceps and triceps indeed!!...
    URL: http://accumass.com/blog/protein-rich-diet-for-weight-gain/

[3] Top 10 Ways to Lose 20 Pounds - ABC News
    Segment: 6category: Ways to Lose 20 Poundstitle: Whey to Loseurl: http://abcnews.go.com/Health/Wellness/top-10-ways-lose-20-pounds/story?id=18181846text: To dr...
    URL: http://abcnews.go.com/Health/Wellness/top-10-ways-lose-20-pounds/story?id=18181846

[4] Protein Rich Diet for Weigh

In [None]:
# Another full-text search: match "acceleration speed" against the segment field
search_text = "acceleration speed"
results = fulltext_search(search_text)
print_fulltext_results(results, search_text)

---
## 3. Hybrid Search (Vector + Filters)

Combine semantic vector search with text or metadata filters.

In [None]:
def hybrid_search(
    query_embedding: list,
    text_filter: str = None,
    url_filter: str = None,
    num_results: int = 5,
):
    """Vector query on ``text_embedding`` narrowed by optional filters.

    Args:
        query_embedding: Query vector handed to ``VectorQuery``.
        text_filter: When truthy, adds ``Text("segment") % text_filter``.
        url_filter: When truthy, adds ``Tag("url") == url_filter``.
        num_results: Number of results requested from the query.

    Returns:
        Whatever ``index.query`` returns for the constructed ``VectorQuery``.
        With neither filter given, the query is built with
        ``filter_expression=None``.
    """
    # Grow the expression clause by clause; multiple clauses are AND-ed.
    combined = None
    if text_filter:
        combined = Text("segment") % text_filter
    if url_filter:
        url_clause = Tag("url") == url_filter
        combined = url_clause if combined is None else combined & url_clause

    filtered_query = VectorQuery(
        vector=query_embedding,
        vector_field_name="text_embedding",
        return_fields=["docid", "title", "segment", "url"],
        num_results=num_results,
        filter_expression=combined,
    )
    return index.query(filtered_query)

In [None]:
# Hybrid: Vector search + text filter
query_idx = 0
text_constraint = "protein"

print(f"Query: {get_query_text(query_idx)}")
print(f"Filter: segment must contain '{text_constraint}'\n")
print("=" * 70)

results = hybrid_search(get_query_embedding(query_idx), text_filter=text_constraint)

# The text filter can exclude every candidate; say so instead of printing
# nothing below the separator.
if not results:
    print("No results found.")

for i, r in enumerate(results, 1):
    # Score is shown as 1 - vector_distance, as in the other result printers.
    score = 1 - float(r.get('vector_distance', 0))
    print(f"\n[{i}] Score: {score:.3f}")
    print(f"    Title: {r['title'][:80]}")
    print(f"    Segment: {r['segment'][:150]}...")

In [None]:
# Another hybrid example
query_idx = 5
text_constraint = "energy"

print(f"Query: {get_query_text(query_idx)}")
print(f"Filter: segment must contain '{text_constraint}'\n")
print("=" * 70)

results = hybrid_search(get_query_embedding(query_idx), text_filter=text_constraint)

# The text filter can exclude every candidate; say so instead of printing
# nothing below the separator.
if not results:
    print("No results found.")

for i, r in enumerate(results, 1):
    # Score is shown as 1 - vector_distance, as in the other result printers.
    score = 1 - float(r.get('vector_distance', 0))
    print(f"\n[{i}] Score: {score:.3f}")
    print(f"    Title: {r['title'][:80]}")
    print(f"    Segment: {r['segment'][:150]}...")

---
## 4. Batch Query Evaluation

In [10]:
import time

# Cap at the number of loaded sample queries so the loop can never index past
# the sample, even if the sample size is changed in the configuration cell.
num_queries = min(20, len(sample_queries))
print(f"Running {num_queries} queries...\n")

latencies = []
for i in range(num_queries):
    # Fetch (or compute) the embedding outside the timed region so only the
    # Redis round trip is measured.
    emb = get_query_embedding(i)

    start = time.perf_counter()
    results = vector_search(emb, num_results=5)
    latency = (time.perf_counter() - start) * 1000  # seconds -> milliseconds
    latencies.append(latency)

if not latencies:
    # Nothing was timed; the statistics below would be meaningless.
    print("No queries were run; nothing to report.")
else:
    avg_latency = np.mean(latencies)
    p95_latency = np.percentile(latencies, 95)
    # Sequential throughput implied by the mean latency (ms per query).
    qps = 1000 / avg_latency

    print("Results:")
    print(f"   Queries:     {num_queries}")
    print(f"   Avg latency: {avg_latency:.1f}ms")
    print(f"   P95 latency: {p95_latency:.1f}ms")
    print(f"   QPS:         {qps:.1f} queries/sec")

Running 20 queries...

Results:
   Queries:     20
   Avg latency: 5.8ms
   P95 latency: 10.4ms
   QPS:         171.9 queries/sec
