In [3]:
%pip install --quiet -U openai redis requests bs4 feedparser numpy

Note: you may need to restart the kernel to use updated packages.


In [None]:
%docker run -d --name redis-stack-server -p 6379:6379 redis/redis-stack-server:latest

In [18]:
import feedparser
import numpy as np
from openai import AzureOpenAI
import redis
from redis.commands.search.field import VectorField, TextField
from redis.commands.search.query import Query
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
import os
from dotenv import load_dotenv

# Load settings from a local .env file (if any) into the process environment.
load_dotenv()

# Redis connection details. Host and port fall back to a local server on the
# default port when the variables are unset or empty, rather than handing
# None to redis.Redis; the port is converted to an integer up front.
redis_host = os.getenv('REDIS_HOST') or 'localhost'
redis_port = int(os.getenv('REDIS_PORT') or 6379)
redis_password = os.getenv('REDIS_PASSWORD')

# Connect to the Redis server, reusing the values read above.
conn = redis.Redis(host=redis_host,
                   port=redis_port,
                   password=redis_password,
                   encoding='utf-8',
                   decode_responses=True)

# Azure OpenAI client used to create the embeddings.
client = AzureOpenAI(
    api_key = os.getenv("AZURE_OPENAI_KEY"),  
    api_version=os.getenv("OPENAI_API_VERSION"),
    azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
)

# Index schema: the post URL as text plus a FLOAT32 HNSW vector field compared
# by cosine distance. DIM must equal the length of the stored embedding vectors.
SCHEMA = [
    TextField("url"),
    VectorField("embedding", "HNSW", {"TYPE": "FLOAT32", "DIM": 1536, "DISTANCE_METRIC": "COSINE"}),
]

# Create the index over every hash whose key starts with "post:".
try:
    conn.ft("posts").create_index(fields=SCHEMA, definition=IndexDefinition(prefix=["post:"], index_type=IndexType.HASH))
except redis.ResponseError as e:
    # Re-running the cell against an existing index is expected and harmless.
    # Any other server reply is a real failure and must not be reported as
    # "already exists", so it is re-raised.
    if "already exists" not in str(e).lower():
        raise
    print("Index already exists")

# URL of the RSS feed to parse
url = 'https://devblogs.microsoft.com/landingpage/'

# Parse the RSS feed with feedparser
feed = feedparser.parse(url)

# get number of entries in feed
entries = len(feed.entries)
print("Number of entries: ", entries)

# Only the first 50 entries are embedded; progress is reported against this
# count rather than the size of the whole feed.
selected_entries = feed.entries[:50]

# Non-transactional pipeline: the HSET commands are buffered client-side and
# sent together by p.execute() instead of one network round trip per entry.
p = conn.pipeline(transaction=False)
for i, entry in enumerate(selected_entries):
    # report progress
    print("Create embedding and save for entry ", i, " of ", len(selected_entries))

    # Entries without a description have nothing to embed, so skip them.
    article = entry.get("description")
    if not article:
        print("Skipping entry ", i, ": no description")
        continue

    embedding = client.embeddings.create(
        input=article,
        model="text-embedding-ada-002"
    )

    # take the embedding vector of the single input (length = 1536)
    vector = embedding.data[0].embedding

    # convert to float32 bytes, matching the FLOAT32 vector field of the index
    vector = np.array(vector).astype(np.float32).tobytes()

    # Create a new hash with the URL and embedding
    post_hash = {
        "url": entry.link,
        "embedding": vector
    }

    # add_document() is deprecated; queue a plain HSET on the pipeline so that
    # p.execute() below actually has commands to send
    p.hset(name=f"post:{i}", mapping=post_hash)

# Flush all queued writes to Redis.
p.execute()

print("Vector upload complete.")

Number of entries:  10
Create embedding and save for entry  0  of  10
Create embedding and save for entry  1  of  10
Create embedding and save for entry  2  of  10
Create embedding and save for entry  3  of  10
Create embedding and save for entry  4  of  10
Create embedding and save for entry  5  of  10
Create embedding and save for entry  6  of  10
Create embedding and save for entry  7  of  10
Create embedding and save for entry  8  of  10
Create embedding and save for entry  9  of  10
Vector upload complete.


In [19]:
def search_vectors(query_vector, client, top_k=5):
    """Run a KNN vector similarity search against the "posts" index.

    Args:
        query_vector: The query embedding, passed to the search as the
            ``$vector`` query parameter.
        client: Object exposing ``ft(index_name).search(query, query_params=...)``;
            the notebook passes its Redis connection here.
        top_k: Number of nearest neighbours to request (default 5).

    Returns:
        The search result returned by ``client.ft("posts").search``, or
        ``None`` if the search call raised (the error is printed).

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    # Coerce to int so that only a number is ever interpolated into the query.
    top_k = int(top_k)
    if top_k < 1:
        raise ValueError("top_k must be a positive integer")

    # Honour top_k in the KNN clause instead of a hard-coded neighbour count.
    base_query = f"*=>[KNN {top_k} @embedding $vector AS vector_score]"
    query = (
        Query(base_query)
        .return_fields("url", "vector_score")
        .sort_by("vector_score")
        # Page size follows top_k so that a request larger than the default
        # page is not silently truncated.
        .paging(0, top_k)
        .dialect(2)
    )

    try:
        results = client.ft("posts").search(query, query_params={"vector": query_vector})
    except Exception as e:
        print("Error calling Redis search: ", e)
        return None

    return results


# Connectivity check before doing any work.
if conn.ping():
    print("Connected to Redis")

# Enter a query
query = "Microsoft"

# Vectorize the query using OpenAI's text-embedding-ada-002 model
print("Vectorizing query...")
embedding = client.embeddings.create(input=query, model="text-embedding-ada-002")
query_vector = embedding.data[0].embedding

# Convert the vector to float32 bytes, the same encoding used for the stored embeddings
query_vector = np.array(query_vector).astype(np.float32).tobytes()

# Perform the similarity search
print("Searching for similar posts...")
results = search_vectors(query_vector, conn)

# search_vectors returns None when the search call failed. A successful search
# can still match nothing, so check the returned documents as well rather than
# relying on the truthiness of the result object.
if results is not None and results.docs:
    print(f"Found {results.total} results:")
    for i, post in enumerate(results.docs):
        # vector_score is the distance reported by the KNN query (the index uses
        # the COSINE metric); 1 - distance is displayed as a similarity score.
        score = 1 - float(post.vector_score)
        print(f"\t{i}. {post.url} (Score: {round(score ,3) })")
else:
    print("No results found")

Connected to Redis
Vectorizing query...
Searching for similar posts...
Found 5 results:
	0. https://devblogs.microsoft.com/identity/eng-connect-jun-24 (Score: 0.825)
	1. https://devblogs.microsoft.com/visualstudio/automatically-install-visual-studio-security-updates-through-microsoft-update (Score: 0.816)
	2. https://devblogs.microsoft.com/directx/step-forward-for-gaming-on-arm-devices-2024 (Score: 0.81)
	3. https://devblogs.microsoft.com/qsharp/evaluating-cat-qubits-for-fault-tolerant-quantum-computing-using-azure-quantum-resource-estimator (Score: 0.796)
	4. https://devblogs.microsoft.com/ise/empowering-collaboration-with-tech-savvy-customer (Score: 0.785)
