## Import necessary modules

In [None]:
import redis, json, sys, os, logging
import numpy as np
from openai import OpenAI
import openai
import pandas as pd
from dotenv import load_dotenv
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query
from redis.commands.search.field import TextField, VectorField
from typing import List
from sshtunnel import SSHTunnelForwarder

In [2]:
# Load OPENAI_API_KEY from a local .env file (if one exists) rather than
# hardcoding the secret in the notebook.
# Note. alternatively you can set a temporary env variable like this:
# os.environ["OPENAI_API_KEY"] = "<your key>"
load_dotenv()

# A blank value counts as "not configured", so test truthiness rather than
# `is not None`.
if os.getenv("OPENAI_API_KEY"):
    openai.api_key = os.getenv("OPENAI_API_KEY")
    print ("OPENAI_API_KEY is ready")
else:
    print ("OPENAI_API_KEY environment variable not found")

OPENAI_API_KEY is ready


### Server Connection

In [3]:
## Direct Connection
# Settings for a Redis server that is reachable without a tunnel.
# NOTE(review): REDIS_HOST is blank in this copy — fill in the real host before running.
REDIS_HOST = ""
REDIS_PORT = 6379
REDIS_PASSWORD = ""  # default for passwordless Redis

# Connect to Redis
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD)
# Round-trip check; the cell displays the PING result.
redis_client.ping()

True

In [10]:
# Close the direct connection before switching to the SSH-tunnelled client.
redis_client.close()

In [11]:
from sshtunnel import SSHTunnelForwarder

In [12]:
# SSH server configuration
# NOTE(review): host, username and key are blank in this copy — supply real values before running.
ssh_host = ''
ssh_username = ''
ssh_private_key = ''

# Redis server configuration on the remote
remote_host = ''
remote_port = 6379

# Local port to forward to
local_port = 6379

# Setting up the SSH tunnel
# Traffic sent to the local bind address is forwarded through ssh_host to
# remote_host:remote_port.
server = SSHTunnelForwarder(
    ssh_address_or_host=(ssh_host, 22),  # Port 22 is the default SSH port
    ssh_username=ssh_username,
    ssh_private_key=ssh_private_key,
    remote_bind_address=(remote_host, remote_port),
    local_bind_address=('', local_port)
)

# The tunnel stays open until server.stop() is called.
server.start()
print("SSH tunnel established")

SSH tunnel established


In [13]:
# Redis client that talks to the locally forwarded port; decode_responses=True
# makes replies come back as str instead of bytes.
client_dev = redis.Redis(host='', port=local_port, decode_responses=True)
# Round-trip check; the cell displays the PING result.
client_dev.ping()

True

In [15]:
# Release the tunnelled client's connection.
client_dev.close()

In [16]:
# Shut down the SSH tunnel started above.
server.stop()

### Redis Functions

In [60]:
# Drop only the search index definition: delete_documents=False leaves the
# indexed documents in Redis, and the success message says so.
try:
    redis_client.ft("").dropindex(delete_documents=False)
    print("Index has been dropped; the indexed documents were kept.")
except Exception as e:
    print(f"An error occurred: {e}")

Index and associated data have been deleted.


In [None]:
# New client with no host/port arguments (redis-py defaults apply); replies are
# decoded to str using latin-1.
client_dev = redis.Redis(encoding='latin-1', decode_responses=True)

# Search for documents
# NOTE(review): the index name and query text are blank in this copy — fill them in before running.
search_results = client_dev.execute_command('FT.SEARCH', '', '@:""', 'LIMIT', 0, 1000)

# Extract document IDs from search results
# Takes every second element starting at index 1, i.e. the reply is treated as
# [total, id, fields, id, fields, ...].
document_ids = [search_results[i] for i in range(1, len(search_results), 2)]

# Remove each matched document from the index named 'index'.
# NOTE(review): the search above targets a different (blank) index name — confirm both should match.
for doc_id in document_ids:
    client_dev.execute_command('FT.DEL', 'index', doc_id)

In [None]:
# Rebind redis_client to a client with default host/port; replies are decoded
# to str using latin-1.
redis_client = redis.Redis(encoding='latin-1', decode_responses=True)

# Search for documents
# Raw FT.SEARCH against the index named 'index', limited to the first 1000 hits.
# NOTE(review): the query text is blank in this copy — fill it in before running.
search_results = redis_client.execute_command('FT.SEARCH', 'index', '@:""', 'LIMIT', 0, 1000)

### keyword-index

In [3]:
def document_embedding(document):
    """Return the OpenAI embedding vector for ``document``.

    ``document`` is converted with ``str()`` and sent as a one-item batch to
    the ``text-embedding-3-small`` model.

    The API key is read from the ``OPENAI_API_KEY`` environment variable, the
    same source the other embedding helpers in this notebook use, instead of
    being hardcoded in the cell.

    Returns:
        The embedding of the single input, as returned by the API
        (``response.data[0].embedding``).
    """
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    response = client.embeddings.create(input=[str(document)], model="text-embedding-3-small")
    return response.data[0].embedding

# Embed one sample string; its length is used below as the vector dimension
# (VECTOR_DIM = len(test_emb)).
test_name = "Detoxification Support"
test_emb = document_embedding(test_name)

In [45]:
# Constants
# Shared settings for the keyword index and its vector fields.
VECTOR_DIM = len(test_emb)  # length of the vectors
VECTOR_NUMBER = 10000 # initial number of vectors
INDEX_NAME = "index"  # name of the search index
# NOTE(review): PREFIX is blank in this copy; index_documents builds keys as "{prefix}:{keyword_id}".
PREFIX = ""  # prefix for the document keys
DISTANCE_METRIC = "COSINE"  # distance metric for the vectors (ex. COSINE, IP, L2)

In [46]:
# Define RediSearch fields for each of the columns in the dataset

# Plain-text columns: one TextField per column name.
category, tag_name, keyword_id, explanation, ingredients = (
    TextField(name=column)
    for column in ("category", "tag_name", "keyword_id", "explanation", "ingredients")
)

# Both embedding columns use the same FLAT / FLOAT32 configuration.
title_vector, content_vector = (
    VectorField(
        vector_column,
        "FLAT",
        {
            "TYPE": "FLOAT32",
            "DIM": VECTOR_DIM,
            "DISTANCE_METRIC": DISTANCE_METRIC,
            "INITIAL_CAP": VECTOR_NUMBER,
        },
    )
    for vector_column in ("title_vector", "content_vector")
)

# Schema passed to create_index, text fields first.
fields = [
    category,
    tag_name,
    keyword_id,
    explanation,
    ingredients,
    title_vector,
    content_vector,
]

In [47]:
# Check if index exists
try:
    client_dev.ft(INDEX_NAME).info()
    print("Index already exists")
except redis.exceptions.ResponseError:
    # Only a server error reply (e.g. unknown index) means "create it".
    # Connection failures and interrupts propagate instead of being swallowed.
    # Create RediSearch Index
    client_dev.ft(INDEX_NAME).create_index(
        fields=fields,
        definition=IndexDefinition(prefix=[PREFIX], index_type=IndexType.HASH),
    )

In [48]:
def index_documents(client, prefix, documents):
    """Write each document to Redis as a hash keyed ``{prefix}:{keyword_id}``.

    Args:
        client: object exposing ``hset(key, mapping=...)`` (a Redis client).
        prefix: key prefix placed before the document's ``keyword_id``.
        documents: iterable of dicts, each holding ``keyword_id``,
            ``title_vector`` and ``content_vector`` (sequences of floats) plus
            any other fields to store.

    The two vector fields are stored as raw float32 bytes. The input dicts are
    left untouched, so the function can safely be run again on the same
    documents.

    Raises:
        KeyError: if a document lacks one of the required keys.
    """
    for doc in documents:
        key = f"{prefix}:{doc['keyword_id']}"

        # Shallow copy: the byte vectors go into the stored record only, so the
        # caller's lists of floats are not overwritten.
        record = dict(doc)
        record["title_vector"] = np.array(doc["title_vector"], dtype=np.float32).tobytes()
        record["content_vector"] = np.array(doc["content_vector"], dtype=np.float32).tobytes()

        client.hset(key, mapping=record)

In [8]:
def search_redis(
    client_dev: redis.Redis,
    user_query: str,
    index_name: str = "index",
    vector_field: str = "title_vector",
    return_fields: list = [ "tag_name", "vector_score"],
    hybrid_fields = "*",
    k: int = 5,
    print_results: bool = True,
) -> List[dict]:
    """Run a KNN vector search for ``user_query`` and return the matching docs.

    Args:
        client_dev: Redis client used to query the index.
        user_query: text to embed with ``text-embedding-3-small``.
        index_name: name of the search index to query.
        vector_field: vector field the KNN clause compares against.
        return_fields: fields to return for each hit. Must include
            ``"vector_score"`` when ``print_results`` is true, because the
            printed score is read from it. The default list is never mutated.
        hybrid_fields: pre-filter placed before the KNN clause (``"*"`` = none).
        k: number of nearest neighbours to request.
        print_results: when true, print one line per hit.

    Returns:
        ``results.docs`` from the search call.
    """
    # Creates embedding vector from user query
    client = openai.OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY")
    )
    embedded_query = client.embeddings.create(
        input=user_query,
        model="text-embedding-3-small",
    ).data[0].embedding

    # Prepare the Query
    base_query = f'{hybrid_fields}=>[KNN {k} @{vector_field} $vector AS vector_score]'
    query = (
        Query(base_query)
         .return_fields(*return_fields)
         .sort_by("vector_score")
         .paging(0, k)
         .dialect(2)
    )
    params_dict = {"vector": np.array(embedded_query).astype(dtype=np.float32).tobytes()}

    # perform vector search
    results = client_dev.ft(index_name).search(query, params_dict)
    if print_results:
        for i, article in enumerate(results.docs):
            # Printed score is 1 - distance.
            # NOTE(review): only a similarity if the index uses COSINE — confirm.
            score = 1 - float(article.vector_score)
            # The document id is printed because every hit has one, whatever
            # return_fields was requested.
            print(f"{i}. {article.id} (Score: {round(score ,3) })")
    return results.docs

In [16]:
# Example query against the "content_vector" field of the index named "index",
# asking for the 5 nearest documents.
# NOTE(review): the query text is blank in this copy — supply one before running.
results = search_redis(
    client_dev,
    "",
    k=5,
    index_name="index",
    vector_field="content_vector",
    return_fields=["source", "document_id", "vector_score"],
    # hybrid_fields=create_hybrid_field("name", "Extract"),
)

0. 283357 (Score: 0.454)
1. 293465 (Score: 0.446)
2. 270078 (Score: 0.445)
3. 295147 (Score: 0.443)
4. 286378 (Score: 0.441)


### Testing Data

In [2]:
import json
import pandas as pd

def load_pqa(file_name,number_rows=1000):
    """Load question/answer pairs from a JSON-lines file into a DataFrame.

    Each non-blank line must be a JSON object with a ``questionText`` string
    and a non-empty ``answers`` list whose first item has ``answerText``.

    Args:
        file_name: path to the ``.jsonl`` file (read as UTF-8).
        number_rows: maximum number of rows to load. ``None`` loads the whole
            file; ``0`` returns an empty frame.

    Returns:
        DataFrame with columns ``question`` and ``answer``, one row per record,
        indexed 0..n-1.

    Raises:
        KeyError / IndexError: if a record lacks the expected fields.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    records = []
    with open(file_name, encoding="utf-8") as f:
        for line in f:
            if number_rows is not None and len(records) >= number_rows:
                break
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            records.append((data['questionText'], data['answers'][0]['answerText']))
    # Build the frame once instead of growing it one row at a time.
    return pd.DataFrame(records, columns=['question', 'answer'])


# Load up to 1000 question/answer rows; despite the name, qa_list is a DataFrame.
qa_list = load_pqa('test-qar_all.jsonl',number_rows=1000)

In [4]:
# Preview the first rows of the loaded question/answer frame.
qa_list.head()

Unnamed: 0,question,answer
0,# of caloriesfor 1 cand yginger ?,Hi...Melissa. Each Natural Ginger Tummy Drops...
1,is the kotobukiya superman a resin statue?,I am unsure I do not see specifics
2,Does it have word ?,It has Office 4.0.1
3,Is this the 2012 version with the mounted boar...,I bought this in 2013 so I assume so. It's ver...
4,"I have a Maine Coon, (beautiful) a shelter cat...","Does she try to""fish"" things out? You could p..."


In [5]:
# Number of rows to embed and index.
NUMBER_PRODUCTS=100
# Dict keyed by row index; each value is that row's {'question': ..., 'answer': ...}.
product_metadata = qa_list.head(NUMBER_PRODUCTS).to_dict(orient='index')

In [6]:
def emb(user_query):
    """Embed ``user_query`` with ``text-embedding-3-small`` and return the vector.

    The API key is taken from the ``OPENAI_API_KEY`` environment variable.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    api_client = openai.OpenAI(api_key=api_key)
    response = api_client.embeddings.create(
        input=user_query,
        model="text-embedding-3-small",
    )
    first_item = response.data[0]
    return first_item.embedding

In [10]:
# Question text of every product, in product_metadata order.
item_keywords = [metadata['question'] for metadata in product_metadata.values()]
# One embedding per question (one API call each), in the same order.
item_keywords_vectors = list(map(emb, item_keywords))

In [22]:
def load_vectors(client, qa_list, vector_dict, vector_field_name):
    """Store each item as a Redis hash ``product:<index>`` with its vector.

    Args:
        client: object exposing ``hset(key, mapping=...)`` (a Redis client).
        qa_list: mapping of index -> metadata dict (e.g. ``product_metadata``).
            Despite the name it is a mapping, not a list.
        vector_dict: container indexable by the same indices, holding each
            item's embedding as a sequence of floats.
        vector_field_name: hash field under which the float32 bytes are stored.

    The metadata passed in is used instead of a notebook global, and it is
    copied before the vector is added, so the caller's data is not modified.
    """
    for index, item_metadata in qa_list.items():
        # Hash key
        key = 'product:' + str(index)

        # Hash values: a copy of the metadata plus the vector as float32 bytes
        record = dict(item_metadata)
        record[vector_field_name] = np.array(vector_dict[index], dtype=np.float32).tobytes()

        # HSET
        client.hset(key, mapping=record)

In [13]:
def create_hnsw_index (create_hnsw_index,vector_field_name,number_of_vectors, vector_dimensions=len(item_keywords_vectors[0]), distance_metric='L2',M=40,EF=200):
    """Create the HNSW vector index for the question/answer hashes.

    Args:
        create_hnsw_index: the Redis client to create the index on. The
            parameter name shadows the function inside this body; it is kept
            for backward compatibility.
        vector_field_name: currently unused. The vector field is always named
            "question_vector".
            NOTE(review): the visible caller passes the index name here —
            confirm the intended meaning before wiring it in.
        number_of_vectors: initial capacity of the vector field.
        vector_dimensions: embedding length. The default is computed once, when
            this cell is executed.
        distance_metric: vector distance metric (default 'L2').
        M: HNSW ``M`` parameter.
        EF: currently unused.
            NOTE(review): unclear whether it was meant as EF_CONSTRUCTION or
            EF_RUNTIME.

    The index name is read from the notebook-level ``INDEX_NAME``.
    """
    # Use the client that was passed in rather than a notebook global.
    client = create_hnsw_index
    client.ft(INDEX_NAME).create_index([
        VectorField(
            "question_vector",
            "HNSW",
            {
                "TYPE": "FLOAT32",
                "DIM": vector_dimensions,
                "DISTANCE_METRIC": distance_metric,
                "INITIAL_CAP": number_of_vectors,
                "M": M,
            },
        ),
        TextField("question"),
        TextField("answer"),
    ])

In [15]:
# Settings for the question/answer vector index.
# Note: this rebinds INDEX_NAME, which an earlier cell set to "index".
INDEX_NAME='indx:pqa_vss'
NUMBER_PRODUCTS=100
ITEM_KEYWORD_EMBEDDING_FIELD='question_vector'

In [20]:
# Create the HNSW index with capacity for NUMBER_PRODUCTS vectors.
# NOTE(review): INDEX_NAME is passed in the slot the function names vector_field_name — confirm intent.
create_hnsw_index(client_dev,INDEX_NAME,NUMBER_PRODUCTS)

In [23]:
# Settings repeated from the earlier cell so this cell can be re-run on its own.
INDEX_NAME='indx:pqa_vss'
ITEM_KEYWORD_EMBEDDING_FIELD='question_vector'
TEXT_EMBEDDING_DIMENSION=len(item_keywords_vectors[0])
NUMBER_PRODUCTS=100

print ('Loading and Indexing + ' +  str(NUMBER_PRODUCTS) + ' products')

#flush all data
#client_dev.flushall()

# Write every product hash, storing its question embedding under ITEM_KEYWORD_EMBEDDING_FIELD.
load_vectors(client_dev,product_metadata,item_keywords_vectors,ITEM_KEYWORD_EMBEDDING_FIELD)

Loading and Indexing + 100 products


In [24]:
# Fetch the index statistics and pull out the four values reported below.
info = client_dev.ft(INDEX_NAME).info()
num_docs, space_usage, num_indexed_vectors, vector_space_usage = (
    info[stat_name]
    for stat_name in ('num_docs', 'space_usage', 'num_indexed_vectors', 'vector_space_usage')
)

summary = f"{num_docs} documents ({space_usage} space used vectors indexed {num_indexed_vectors} vector space usage in {vector_space_usage}"
print(summary)

100 documents (11408192 space used vectors indexed 100 vector space usage in 11356030


In [25]:
# Match-all query against the index, used to confirm documents were indexed.
results = client_dev.ft(INDEX_NAME).search('*')

In [27]:
# Display the documents returned by the last search.
results.docs

[Document {'id': 'product:0', 'payload': None, 'answer': "Hi...Melissa.  Each Natural Ginger Tummy Drops has 10 calories and if you are a diabetic...3 carbs. It's directions say to take a maximum of 8 pieces a day. Remember this is not like eating hard candy, it is a supplement.  So it also says keep away from children so they don't confuse it with hard candy.  I used two while flying partly cause my mouth was dry and I wanted assurance I wouldn't get sick, all I did was let it melt in my mouth.  I drink diet Ginger ale all the time, yet I hate gingerbread cookies, gingersnap type, these did not have a strong flavor to them. I don't know if I mentioned I went on a short cruise, which the crew said it was the roughest the seas had been in 6 months, I ended up giving some to two employees because they were getting seasick and they helped them too.Hope I answered enough for you, if not ask anything about them, I will answer again.  Thanks.  Pat", 'question': '# of caloriesfor 1 cand yging

In [29]:
topK=5
user_query='Does this work with xbox'
#vectorize the query
query_vector = np.array(emb(user_query), dtype=np.float32).tobytes()

# Run the KNN search with the query vector so that `results` holds the topK
# nearest questions. Without this step query_vector is never used, and
# `results` still holds whatever an earlier cell left there.
knn_query = (
    Query(f'*=>[KNN {topK} @{ITEM_KEYWORD_EMBEDDING_FIELD} $vec_param AS vector_score]')
    .sort_by('vector_score')
    .paging(0, topK)
    .return_fields('question', 'answer')
    .dialect(2)  # query dialect 2 is needed for the KNN syntax
)
results = client_dev.ft(INDEX_NAME).search(knn_query, query_params={'vec_param': query_vector})

In [31]:
#Print similar products and questions found
# For each returned document, print its hash key followed by the stored
# question and answer fields.
for product in results.docs:
    print ('***************Product  found ************')
    print ('hash key = '  + product.id)
    print ( 'question = '  + product.question)
    print ('answer = ' + product.answer)

***************Product  found ************
hash key = product:62
question = does it work with the xbox one console?
answer = No it says it specifically on the xbox adapter page on xbox.com that it does not work with the xbox one.
***************Product  found ************
hash key = product:52
question = Does This Work With Time Warner Internet. I needit for Netflix Hulu & Amazon movies
answer = Don't know about time warner I have charter but I don't see how they make a better one them this it great
***************Product  found ************
hash key = product:46
question = does it work with sony mini dvds
answer = Yes
***************Product  found ************
hash key = product:73
question = Will this work on my 2013 GMC 3500HD Duramax diesel crew cab???
answer = I have no idea. The tech support guys at 800-998-6880 are very helpful and will steer you in the right direction.
***************Product  found ************
hash key = product:14
question = Is it compitable with WWindows Vis

In [32]:
# Display the raw search result object (total count plus documents).
results

Result{5 total, docs: [Document {'id': 'product:62', 'payload': None, 'question': 'does it work with the xbox one console?', 'answer': 'No it says it specifically on the xbox adapter page on xbox.com that it does not work with the xbox one.'}, Document {'id': 'product:52', 'payload': None, 'question': 'Does This Work With Time Warner Internet. I needit for Netflix Hulu & Amazon movies', 'answer': "Don't know about time warner I have charter but I don't see how they make a better one them this it great"}, Document {'id': 'product:46', 'payload': None, 'question': 'does it work with sony mini dvds', 'answer': 'Yes'}, Document {'id': 'product:73', 'payload': None, 'question': 'Will this work on my 2013 GMC 3500HD Duramax diesel crew cab???', 'answer': 'I have no idea. The tech support guys at 800-998-6880 are very helpful and will steer you in the right direction.'}, Document {'id': 'product:14', 'payload': None, 'question': 'Is it compitable with WWindows Vista?', 'answer': "If it works 