# MS MARCO + Redis Vector Search

Sample queries against the ingested MS MARCO passages.

In [None]:
Use this dataset instead: https://huggingface.co/datasets/Cohere/msmarco-v2.1-embed-english-v3
In the notebook can you load the queries sample partition of the dataset above ^
Can you add a sample query for full text search and then for hybrid as well?


In [None]:
from datasets import load_dataset

ds = load_dataset(
    "Cohere/msmarco-v2.1-embed-english-v3", 
    "passages",
    columns=["docid", "text", "emb"],  # only these fields
).take(10000)

Downloading data:   0%|          | 0/60 [06:00<?, ?files/s]
Cancellation requested; stopping current tasks.


KeyboardInterrupt: 

In [1]:
import os
from sentence_transformers import SentenceTransformer
from redisvl.index import SearchIndex
from redisvl.query import VectorQuery

REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379")
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Load model and connect to index
model = SentenceTransformer(MODEL_NAME)
index = SearchIndex.from_existing("msmarco", redis_url=REDIS_URL)

print(f"Connected to index with {index.info()['num_docs']} documents")

  from .autonotebook import tqdm as notebook_tqdm


13:19:22 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps
13:19:22 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
Connected to index with 100 documents


In [4]:
def search(query: str, k: int = 5):
    """Search for similar passages."""
    embedding = model.encode(query).tolist()
    
    vq = VectorQuery(
        vector=embedding,
        vector_field_name="text_embedding",
        return_fields=["text", "url", "passage_id"],
        num_results=k,
    )
    
    results = index.query(vq)
    
    print(f"Query: {query}\n")
    print("=" * 60)
    for i, r in enumerate(results, 1):
        score = 1 - float(r.get("vector_distance", 0))  # cosine similarity
        print(f"\n[{i}] Score: {score:.3f}")
        print(f"    {r['text'][:200]}...")
        print(f"    Source: {r['url']}")
    
    return results

## Example Queries

In [5]:
# Factual question
results = search("What is the capital of Australia?")

Batches: 100%|██████████| 1/1 [00:00<00:00,  3.54it/s]

Query: What is the capital of Australia?


[1] Score: 0.599
    Sydney is the capital city of the Australian state of New South Wales, and Australia's largest city. A week in Sydney will help you see many of the sights of Sydney and its surrounds, and understand t...
    Source: http://wikitravel.org/en/One_week_in_Sydney

[2] Score: 0.543
    Sydney, New South Wales, Australia is located in a coastal basin bordered by the Pacific Ocean to the east, the Blue Mountains to the west, the Hawkesbury River to the north and the Woronora Plateau t...
    Source: https://en.wikipedia.org/wiki/Geography_of_Sydney

[3] Score: 0.436
    The Sydney central business district, Sydney harbour and outer suburbs from the West. North Sydney 's commercial district. The extensive area covered by urban Sydney is formally divided into more than...
    Source: https://en.wikipedia.org/wiki/Geography_of_Sydney

[4] Score: 0.424
    Sydney lies on a submergent coastline, where the ocean level has risen to floo




In [None]:
# Technical question
results = search("How does machine learning work?")

In [None]:
# Definition query
results = search("What is results-based accountability?")

## Hybrid Search (Vector + Text Filter)

In [9]:
from redisvl.query.filter import Tag

def search_with_filter(query: str, url_contains: str, k: int = 5):
    """Search with URL filter."""
    embedding = model.encode(query).tolist()
    
    # Filter to specific domain
    url_filter = Tag("url") == f"*{url_contains}*"
    
    vq = VectorQuery(
        vector=embedding,
        vector_field_name="text_embedding",
        return_fields=["text", "url", "passage_id"],
        num_results=k,
        filter_expression=url_filter,
    )
    
    results = index.query(vq)
    
    print(f"Query: {query}")
    print(f"Filter: URL contains '{url_contains}'\n")
    print("=" * 60)
    for i, r in enumerate(results, 1):
        score = 1 - float(r.get("vector_distance", 0))
        print(f"\n[{i}] Score: {score:.3f}")
        print(f"    {r['text'][:200]}...")
        print(f"    Source: {r['url']}")
    
    return results

In [10]:
# Search only Wikipedia sources
results = search_with_filter("central bank", "wikipedia")

Batches: 100%|██████████| 1/1 [00:00<00:00,  5.96it/s]

Query: central bank
Filter: URL contains 'wikipedia'




