# Redis with Langchain

In [17]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

True

In [20]:
hugging_face_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
langchain_token = os.getenv("LANGCHAIN_API_KEY")

pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_env = os.getenv("PINECONE_ENV")
pinecone_index_host = os.getenv("PINECONE_INDEX_HOST")


Reddis connection

In [21]:
redis_url = 'redis://default:I9CIq7d75KoXtzJAdjcifZvFr7EmoX3b@redis-18938.c3.eu-west-1-2.ec2.redns.redis-cloud.com:18938'


host = 'redis-18938.c3.eu-west-1-2.ec2.redns.redis-cloud.com'
password = 'I9CIq7d75KoXtzJAdjcifZvFr7EmoX3b'
port = 18938

In [22]:
import redis
from langchain.vectorstores.redis import Redis

r = redis.Redis(
    host=host,
    port=port,
    password=password
)

Ping redis host to see if we are connected

In [23]:
r.ping()

True

Clear Redis db

In [24]:
r.flushdb()
r.flushall()

True

Get Redis keys

In [25]:
r.keys()

[]

Get amazon fashion review dataset

In [26]:
import gzip
import json
import pandas as pd

def get_data(asin):
    data = []
    with gzip.open('../data/AMAZON_FASHION.json.gz') as file:
        for line in file:
            data.append(json.loads(line.strip()))
    df = pd.DataFrame.from_dict(data)
    df = df[df['reviewText'].notna()]
    df = df.loc[df['asin'] == asin].copy()
    return df

Get only a slice of the data

In [27]:
df = get_data('B000KPIHQ4').reset_index()[['overall','asin','reviewText','summary', 'reviewerID']]

In [29]:
df.head()

Unnamed: 0,overall,asin,reviewText,summary,reviewerID
0,3.0,B000KPIHQ4,"Good price, good product. Howver, it is generi...",Orthotics off the rack,A1CIM0XZ3UA926
1,5.0,B000KPIHQ4,My husband rates these insoles a 5 for comfort...,Very comfortable,A1EVVPCWRW5YYZ
2,5.0,B000KPIHQ4,I have worn the Powerstep Pinnacle shoe insole...,... Pinnacle shoe insoles for the past 5 years...,A2P3NZ9H4PANK0
3,1.0,B000KPIHQ4,Very uncomfortable feel like I wasted my money!,Uncomfortable,A2975GY186VV7A
4,5.0,B000KPIHQ4,work perfect,Five Stars,A3U8E58RIKWDAW


In [31]:
missing_count = df.isnull().sum()
print(missing_count)

overall       0
asin          0
reviewText    0
summary       2
reviewerID    0
dtype: int64


Drop missing values

In [32]:
df.dropna(subset=['summary'], inplace=True)

See if any we have missing values

In [33]:
missing_count = df.isnull().sum()
print(missing_count)

overall       0
asin          0
reviewText    0
summary       0
reviewerID    0
dtype: int64


Trunchate the reviews to a maximum length

In [34]:
def trunchate_review(text, max_length=400):
    return text[:max_length]

df['reviewText'] = df.apply(lambda row: trunchate_review(row['reviewText']), axis=1)

df['overall'] = df.apply(lambda row: int(row['overall']), axis=1)


In [35]:
df

Unnamed: 0,overall,asin,reviewText,summary,reviewerID
0,3,B000KPIHQ4,"Good price, good product. Howver, it is generi...",Orthotics off the rack,A1CIM0XZ3UA926
1,5,B000KPIHQ4,My husband rates these insoles a 5 for comfort...,Very comfortable,A1EVVPCWRW5YYZ
2,5,B000KPIHQ4,I have worn the Powerstep Pinnacle shoe insole...,... Pinnacle shoe insoles for the past 5 years...,A2P3NZ9H4PANK0
3,1,B000KPIHQ4,Very uncomfortable feel like I wasted my money!,Uncomfortable,A2975GY186VV7A
4,5,B000KPIHQ4,work perfect,Five Stars,A3U8E58RIKWDAW
...,...,...,...,...,...
4366,5,B000KPIHQ4,My podiatrist recommended these Powerstep Inso...,Superb comfort and support,A7XUE85HN4TX0
4367,3,B000KPIHQ4,They worked well for the first few weeks and t...,Slid around in my shoes a bit,A1KTBKOTBM436K
4368,5,B000KPIHQ4,I think these are the best.,Five Stars,A3S84SEN2Z6DZU
4369,5,B000KPIHQ4,"With plantar fasciitis, my feet hurt all the t...","With plantar fasciitis, my feet hurt all the t...",A3JRCF1I92QLW0


#### Create embeddings

In [36]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings()

  from tqdm.autonotebook import tqdm, trange


Get first 100 reviews and metadata

In [52]:
# we convert the dataframe to a dictionary where the keys are the row indices 
# and the values are dictionaries of column data for each row
reviews=df.head(100).to_dict(orient='index') 

texts=df.head(100)['reviewText'].to_list()
summaries=df.head(100)['summary'].to_list()

# Grab the metadata that will hold only the ovearll review score
metadata=[dict(rating=i) for i in df.head(100)['overall'].tolist()]

# Create the embeddings for the reviews and the summaries
summary_vectors=embeddings.embed_documents(summaries)
vectors=embeddings.embed_documents(texts)

Get one review dict

In [48]:
for value in reviews.values():
    print(value)
    break

{'overall': 3, 'asin': 'B000KPIHQ4', 'reviewText': 'Good price, good product. Howver, it is generic and if you really need orthotics, best to have them individually fitted. These are a good value.', 'summary': 'Orthotics off the rack', 'reviewerID': 'A1CIM0XZ3UA926'}


Get one review text

In [49]:
for t in texts:
    print(t)
    break

Good price, good product. Howver, it is generic and if you really need orthotics, best to have them individually fitted. These are a good value.


Get the metadata for one review

In [53]:
metadata[0]

{'rating': 3}

Make sure they all have the same length

In [50]:
print(len(vectors))
print(len(summary_vectors))
print(len(reviews))
print(len(metadata))

100
100
100
100


Get first review

In [26]:
print(reviews[0])

{'overall': 3, 'asin': 'B000KPIHQ4', 'reviewText': 'Good price, good product. Howver, it is generic and if you really need orthotics, best to have them individually fitted. These are a good value.', 'summary': 'Orthotics off the rack', 'reviewerID': 'A1CIM0XZ3UA926'}


See what indexes we have in Redis

In [54]:
r.execute_command('FT._LIST')

[]

## Add data to Redis using Lanchain

##### We first add the data using Langchain

In [55]:
vstore = Redis.from_texts(texts, embeddings, redis_url=redis_url,  index_name='reviewidx')

vstore_meta = Redis.from_texts(texts, embeddings, redis_url=redis_url, index_name='reviewidx_meta',metadatas=metadata)

Check again the indexes in Redis

In [56]:
r.execute_command('FT._LIST')

[b'reviewidx_meta', b'reviewidx']

Grab data from the index

In [81]:
r.keys()

[b'doc:reviewidx_hash:A3I5AVQW6V2YCW',
 b'doc:reviewidx_json:A1EVVPCWRW5YYZ',
 b'doc:reviewidx:0d6cc837b2184c6381ff980c652d0212',
 b'doc:reviewidx_meta:28bfb7992b4547e0ada12b25b4ac3931',
 b'doc:reviewidx_hash:A2YQ45YFQX57OB',
 b'doc:reviewidx_json:A3U47FH0XDVY7L',
 b'doc:reviewidx_meta:969a2c43d95e4cfba881014371275fb5',
 b'doc:reviewidx_meta:8e5140f8b74d4204923bb8e9fa32ec77',
 b'doc:reviewidx_hash:A2T0L8ZN1VV216',
 b'doc:reviewidx:3a65a66dd4a8446897ec1c1c3e0c272a',
 b'doc:reviewidx:2ca8ff60c9404a4db465fc24bb787a4f',
 b'doc:reviewidx:b21ec28e045547728ed1fbab910a3610',
 b'doc:reviewidx_json:A22JSJ7L4L9PX3',
 b'doc:reviewidx_hash:A1619KUEZRDTEN',
 b'doc:reviewidx_json:A1P5KYI7TE3RRA',
 b'doc:reviewidx:42e68068fd114d0d9c38bc85e5971cbd',
 b'doc:reviewidx:14a6c94f0b444f4d8eb9b3ef20dcbf66',
 b'doc:reviewidx:63131599b7064e5b9343857a3790e8b2',
 b'doc:reviewidx_json:AH44ABIMZREO8',
 b'doc:reviewidx_meta:de49ce3c51c74bd0ad23aae64b476994',
 b'doc:reviewidx:9cca2735260041a09ab9ecbcc6cbab64',
 b'doc

Here we see the content_vector and the content inside an key from the index

In [82]:
r.hgetall('doc:reviewidx:a2f7ef92a0e144ae94a791a29960696d')

{b'content_vector': b'v-\n\xbd\xfd\xe7<\xbd\xc5\xf0\x9f\xbc\x11Y\x00=\xdb\xaeM\xbd\xab\x18N=\xa8\xae\xab\xbb\x9b\xab\x92=\xa3,\xd4<L\xa6T=\x9d\x9b\xa1=\xaf\x9c\xad:X\xd8f=\xc8f\n=\xb2\x9eW=\xdb\x86\x8f<\xfeh\xe6\xbc\x00=\x88<0\x9e\xd1:\xa0D\xed\xbb\xc5T8<\x95\xe1"\xbckx`\xbc\xfe\x93\x14=!\xca`\xbc\xf4_{\xbd\xfe\xe6\x9d=R\xed\xb9<\x1b$\xbe;\xc3#\xc1\xba7\x98\xa2<=>I\xbd\xf3\xe0\x83</\x9d\x91\xbd\x87a\xeb5_\xdb&\xbc\xee\xa1\xdd\xbc\x1c\xeb}\xbc8\xa9\x9c;\x130\xda\xbc\xd0)\xf2<\xc9Z<\xbdE2\xc1\xbc$I\x92<\x19\xd5\x85<\xa7`\x19\xbd\xa0\xfa\t=\xba\xa4\xf5<\x87/\xd2\xbdB|\xac<\xa0\xe2\x1a\xbb9\x94\xd7;sf\x9f\xbc\x85\xd3\xef:\xba\xb8\x8f<\x87\x8e\x88=`\xce\xa7\xbc\x81\x17\x18\xbc\xd7C\xa9=!\x02\xe0<\xc1}\xed\xbcx\xbe\xf2\xbc#w\x14\xbd\xce\xfc\x88\xbd\x07\xca\xb0<\xa66\xb3;\xc8\x88\t=\xf8\xf7\x03\xbd\x00<v<\x82N4\xbc\xd1\x84\xda:\x14|\xf7;x\xb4\xcf<\xee\x17I=_?F=V~\x8f=$Q\x8c\xbc\x19+\x00\xbdc\xdb\x08=\x9f"\xcf\xbcab\x7f=\x19\xd0\x0e=\xaa\x81\xa2\xbc\xc7\xbc\xac:\x9f:\x11<\xbe3\xc0</\r\x84;#[\x

Here we see what we have in the reviewidx_meta index, here besides the vector and the content we also have some metadata properites, the overall rating

In [83]:
r.hgetall('doc:reviewidx_meta:177cdd2eceb641d792eff4446f564e39')

{b'content_vector': b'\xb2:\xce\xbc\x07X\x06\xbdk\xb9\x8a\xbc\x83>><\x92&G\xbd\x07I\r<\x91\xba\x9f<\xee_a=\x03\x00,=\x15M6=\xfc\x08\xbb<2>\xaa;\xe0:\x18:PS\x04=\xfb\xfe"=\xf1\xf1w=\t\\\x97<Q\x91\x83=\xb5\xb3|\xbd\xfd\x9b\x05\xbc\x9b/\x83<\xe6\xb3\xa4;\x05\xb5\'\xbdIOu=\xd6C\xae\xbcO\x9e1\xbdE\xa8\x91=\xaav\x19=e\xd9\x06<\t\x19\x81\xbc\xfd\xe6\x8e\xbb\xe7\xd2k<\xa6\xacd\xbc[\xa13\xbd\x86G\xc55\xe0.\xd2\xbc)\xd7\x08\xbd\t*\x82\xbc\xafO\xcf<pt\xc7\xbc\xd2\x82\x83;}\tU\xbdu\x0c\'\xbc\x17\x0eu\xba\x04?\xba\xbc\x9e\xccH\xbd\xc7\xb7\xd2\xbc\xa6_\x1d=(0\xf0;\x077\x13= \xe2\x91\xbc#\\?<\x94\xe4\x99\xb8\x87\x13\x84=VO&=o"\xaf<\xa9H\xdc\xbcB\x8c\xc4\xbd\x12\xc7v=\x8b\xc8\xf3\xba\xd74\x9a\xbc\x81\xbc\x1d\xbd\xa3h\x1c\xbdR\xd1n\xbd\x07\x01)=\x1bW\'=\xf8\x15\xfb<\xa4\xbb\xde\xbc\xc3@\x19\xbd/\xf5\xfc\xbc\x14\x12,=O\xd0\xcd<:\x19e=\xbf\xf46=\t\x13\x16=\xd6\xd7\x15\xbc\xc9r\x11\xbcQ\xde\x92\xbc1/_\xbc\x89\x81\x98\xbc\x9d\xb45;\xd8\x84\x98=v\x82\xdc\xbc\xf0\xe41\xbc:\x0f4=\xfe\xa4\x1c=2\xb6\xeb\xbcS\x8

##### Here we add data to Redist using redispy

In [57]:
from redis.commands.search.field import VectorField, TextField, NumericField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType

Create schema for Redis and pipeline

In [60]:
pipe = r.pipeline(transaction=False) # to buffer individual commands and execute them as a group

Define the schema - this will have the embeddings added as binaries

In [61]:
schema = (
    TextField("overall"),
    TextField("reviewText"),
    TextField("summary"),
    VectorField("vector", "HNSW", {"TYPE": "FLOAT32", "DIM": 768, "DISTANCE_METRIC": "COSINE"}), # HNSW - if you prefer speed over recall, flat is the oposite, if you prefer quality
)

Define the Index - hash index for the binaries

In [None]:
prefix="doc:reviewidx_hash"

r.ft("reviewidx_hashed").create_index(fields=schema,
                                      definition=IndexDefinition(prefix=[prefix],index_type=IndexType.HASH)
                                     )

Now we should have 3 indexes

In [64]:
r.execute_command('FT._LIST')

[b'reviewidx_hashed', b'reviewidx_meta', b'reviewidx']

#### Upload the data to the Redis Index

We store the data in byte format, each key will consist the prefix : + the reviewer ID of that review

In [65]:
import numpy as np

for i in reviews.keys():
    key = prefix + ':' + reviews[i]['reviewerID']
    record = reviews[i] # We store the review here
    record['vector'] = np.array(vectors[i]).astype(np.float32).tobytes() # we grab the embedding vector of that review and we store it as bytes
    pipe.hset(key, mapping=record)

pipe.execute()

[6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6]

In [69]:
r.keys()

[b'doc:reviewidx_hash:A3I5AVQW6V2YCW',
 b'doc:reviewidx:0d6cc837b2184c6381ff980c652d0212',
 b'doc:reviewidx_meta:28bfb7992b4547e0ada12b25b4ac3931',
 b'doc:reviewidx_hash:A2YQ45YFQX57OB',
 b'doc:reviewidx_meta:969a2c43d95e4cfba881014371275fb5',
 b'doc:reviewidx_meta:8e5140f8b74d4204923bb8e9fa32ec77',
 b'doc:reviewidx_hash:A2T0L8ZN1VV216',
 b'doc:reviewidx:3a65a66dd4a8446897ec1c1c3e0c272a',
 b'doc:reviewidx:2ca8ff60c9404a4db465fc24bb787a4f',
 b'doc:reviewidx:b21ec28e045547728ed1fbab910a3610',
 b'doc:reviewidx_hash:A1619KUEZRDTEN',
 b'doc:reviewidx:42e68068fd114d0d9c38bc85e5971cbd',
 b'doc:reviewidx:14a6c94f0b444f4d8eb9b3ef20dcbf66',
 b'doc:reviewidx:63131599b7064e5b9343857a3790e8b2',
 b'doc:reviewidx_meta:de49ce3c51c74bd0ad23aae64b476994',
 b'doc:reviewidx:9cca2735260041a09ab9ecbcc6cbab64',
 b'doc:reviewidx:c0e4411be54b4e51b6b790ff5967d438',
 b'doc:reviewidx_hash:AH44ABIMZREO8',
 b'doc:reviewidx_hash:A3BSWVQLO70KWJ',
 b'doc:reviewidx_hash:A28ETTPXK0G0SO',
 b'doc:reviewidx_hash:A3VLCZ2RWK

In [70]:
r.hgetall('doc:reviewidx_hash:AH44ABIMZREO8')

{b'reviewText': b"Best innersoles I've tried.",
 b'reviewerID': b'AH44ABIMZREO8',
 b'summary': b'Five Stars',
 b'asin': b'B000KPIHQ4',
 b'vector': b'\xe1\x87\xad<\xbdF]<\xaa\x88\x12\xbc:\xa5\x92<\xb2e\xd5<\xe9\x17Q\xbc\xb3\xcf\xc0\xbc\xad\x12\xb9\xbc\x88\xda\x1e\xbc\xf8>\xe7<\x96O\x91\xbd\x91\xb5\xb4<.!\x12\xbd\xfeZ\xe5\xbc\xb1\xa5\x81<\xc32\xf5;n\xa4\x0b\xbc\xbeU\x0c<\x8e[#\xbd\t\xb1\xc5\xbc\x97i0\xbc\x86\x9f\xbe\xbc;/\xcc\xbck\xaf\x15<m4\xae=\xf8\\\xc0\xbce\xd3\x14\xbbqp\x8e=\x19\xb2!\xbc\x05\\\xe0\xbc\xf2\xf5\xa6\xbc\x85\\\x84\xbc\xed\xd2A\xbd+vc=\x9a\xf9\x026\xe3\x98\xb5<\xf7\x10]\xbd\xd9b+\xbd\xdc\x13\x8d<\x84*\x8d=\xda\x86\xef<_Li\xbd\x1bg5;@c2\xbd#D\xc1\xbbi\x16\xb3\xbc92\xae\xbb\xeee\xad<^?0=\x85\xd6\x1e\xbd%\xc2\xf3\xbc\xfd\xf5V=\xd0\xf3+<\x08\xff\xde<Y\xfe-=\xd7\x83V\xbc\x94W\x84<\x0e\xba\x99\xbd\xff\xf5\x1c=P\xc2\xb2\xbc\x91\\\xa9<U_!\xbd\xe6\xbcJ<\x81S]\xbdh\x03~=o\xf9\x87\xbc\x198\x80\xbc\xb8\xf7\x06<\xb11\x0b\xbd\xf3\x1e\xd0\xbb?a\xf8\xbb|,\x87<!\xed\x8f; \xb2\xa2=\xb3\xb

Store the data in JSON format

We define the schema for a json format index

In [75]:
prefix = "doc:reviewidx_json"

schema = (TextField("$.asin", as_name="asin"),
          TextField("$.reviewText", as_name="reviewText"),
          TextField("$.reviewerID", as_name="reviewerID"),
          TextField("$.summary", as_name="summary"),
          NumericField("$.overall", as_name="overall"),
          VectorField("$.vectors[*]", "HNSW", {"TYPE": "FLOAT32", "DIM": 768, "DISTANCE_METRIC": "COSINE"},
                      as_name="vector")
         )

Create the json index

In [76]:
r.ft("doc:reviewidx_json").create_index(schema, IndexDefinition(prefix=prefix, index_type=IndexType.JSON))

b'OK'

Now we should have another index created, the reviewidx_json 

In [77]:
r.execute_command('FT._LIST')

[b'doc:reviewidx_json', b'reviewidx_hashed', b'reviewidx_meta', b'reviewidx']

Populate the index with data, here we don't transform the data to bytes, we save the vector directly because we are storing these as json objects

In [78]:
for i in reviews.keys():
    key = prefix + ':' + reviews[i]['reviewerID']
    record = reviews[i]
    record['vector'] = vectors[i]
    pipe.json().set(key, '$', record)

pipe.execute()

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

In [79]:
r.keys()

[b'doc:reviewidx_hash:A3I5AVQW6V2YCW',
 b'doc:reviewidx_json:A1EVVPCWRW5YYZ',
 b'doc:reviewidx:0d6cc837b2184c6381ff980c652d0212',
 b'doc:reviewidx_meta:28bfb7992b4547e0ada12b25b4ac3931',
 b'doc:reviewidx_hash:A2YQ45YFQX57OB',
 b'doc:reviewidx_json:A3U47FH0XDVY7L',
 b'doc:reviewidx_meta:969a2c43d95e4cfba881014371275fb5',
 b'doc:reviewidx_meta:8e5140f8b74d4204923bb8e9fa32ec77',
 b'doc:reviewidx_hash:A2T0L8ZN1VV216',
 b'doc:reviewidx:3a65a66dd4a8446897ec1c1c3e0c272a',
 b'doc:reviewidx:2ca8ff60c9404a4db465fc24bb787a4f',
 b'doc:reviewidx:b21ec28e045547728ed1fbab910a3610',
 b'doc:reviewidx_json:A22JSJ7L4L9PX3',
 b'doc:reviewidx_hash:A1619KUEZRDTEN',
 b'doc:reviewidx_json:A1P5KYI7TE3RRA',
 b'doc:reviewidx:42e68068fd114d0d9c38bc85e5971cbd',
 b'doc:reviewidx:14a6c94f0b444f4d8eb9b3ef20dcbf66',
 b'doc:reviewidx:63131599b7064e5b9343857a3790e8b2',
 b'doc:reviewidx_json:AH44ABIMZREO8',
 b'doc:reviewidx_meta:de49ce3c51c74bd0ad23aae64b476994',
 b'doc:reviewidx:9cca2735260041a09ab9ecbcc6cbab64',
 b'doc

In [80]:
r.json().get('doc:reviewidx_json:A2EKPBI3GVZRMQ', "$")

[{'overall': 5,
  'asin': 'B000KPIHQ4',
  'reviewText': 'Since I got my first pair ,the bottoms of my feet are pain free',
  'summary': 'Five Stars',
  'reviewerID': 'A2EKPBI3GVZRMQ',
  'vector': [-0.053508657962083817,
   -0.0032995743677020073,
   -0.0056513408198952675,
   0.04074309766292572,
   -0.03060976415872574,
   0.02517080120742321,
   -0.0162038654088974,
   0.004735235590487719,
   0.03502847999334335,
   0.035858962684869766,
   0.028859933838248253,
   0.02260727994143963,
   -0.025439893826842308,
   -0.012423180043697355,
   -0.001306538819335401,
   0.10798554122447968,
   0.008506828919053078,
   0.0343514047563076,
   -0.007572143338620663,
   -0.011052518151700497,
   -0.0005712485872209072,
   -0.0342794731259346,
   -0.009586895816028118,
   0.012904232367873192,
   -0.011378000490367413,
   -0.06294053047895432,
   0.0536373034119606,
   0.009454289451241491,
   -0.0035373989958316088,
   -0.000722332508303225,
   -0.007769024930894375,
   0.022377848625183105,

Store multiple numeric arrays as vectors, this is useful when we have multiple vector embeddings representing the same object

We will store the vector of the review and the summary of that review, so the VectorField will be an array with two values

In [84]:
prefix = "doc:reviewidx_json_multi"

schema = (TextField("$.asin", as_name="asin"),
          TextField("$.reviewText", as_name="reviewText"),
          TextField("$.reviewerID", as_name="reviewerID"),
          TextField("$.summary", as_name="summary"),
          NumericField("$.overall", as_name="overall"),
          VectorField("$.vectors[*]", "HNSW", {"TYPE": "FLOAT32", "DIM": 768, "DISTANCE_METRIC": "COSINE"},
                      as_name="vector") # the only difference is that we add the star operator for the vector field
         )

In [85]:
r.ft("reviewidx_json_multi").create_index(schema,
                                          definition=IndexDefinition(prefix=[prefix], index_type=IndexType.JSON))

b'OK'

Now we should have one more index created

In [86]:
r.execute_command('FT._LIST')

[b'doc:reviewidx_json',
 b'reviewidx_hashed',
 b'reviewidx_meta',
 b'reviewidx',
 b'reviewidx_json_multi']

Store the data with vectors for both the reviews and the summaries

In [None]:
for i in reviews.keys():
    key = prefix + ':' + reviews[i]['reviewerID']
    record = reviews[i]
    record['vectors'] = [vectors[i], summary_vectors[i]]
    pipe.json().set(key, '$', record)
pipe.execute()

In [88]:
r.execute_command('FT._LIST')

[b'doc:reviewidx_json',
 b'reviewidx_hashed',
 b'reviewidx_meta',
 b'reviewidx',
 b'reviewidx_json_multi']

In [None]:
r.keys()

Read one key for multi json values

In [None]:
from redis.commands.json.path import Path
r.json().get('doc:reviewidx_json_multi:A2VOWDGRV2HV9V', '$')

Read only the Vectors list

In [None]:
from redis.commands.json.path import Path
r.json().get('doc:reviewidx_json_multi:A2VOWDGRV2HV9V', Path('.vectors'))

### Vector similarity search in Redis

In [94]:
from redis.commands.search.query import Query

# Formulate the query logic
query_syntax = "*=>[KNN 5 @vector $vec_param AS vector_score]"
vss_query=Query(query_syntax).return_fields("overall", "vector_score").sort_by("vector_score").dialect(2)

# query string embedded and transfered to parameter dictionary to be used in search 
# The term we use to perfrom a semantic search
query_string = "Very uncomfortable"
embedded_query = np.array(embeddings.embed_documents([query_string])).astype(np.float32).tobytes()
params_dict = {"vec_param": embedded_query}

# VSS with Redis
vss_results = r.ft('reviewidx_json_multi').search(vss_query, query_params=params_dict)

These are the top 5 reviews that match the query string semantically 

In [95]:
vss_results

Result{5 total, docs: [Document {'id': 'doc:reviewidx_json_multi:A2975GY186VV7A', 'payload': None, 'vector_score': '0.150121450424', 'overall': '1'}, Document {'id': 'doc:reviewidx_json_multi:A1PWR1BTKHQ6YI', 'payload': None, 'vector_score': '0.215067088604', 'overall': '3'}, Document {'id': 'doc:reviewidx_json_multi:A1EVVPCWRW5YYZ', 'payload': None, 'vector_score': '0.233033597469', 'overall': '5'}, Document {'id': 'doc:reviewidx_json_multi:AYS2XX2ZG696Q', 'payload': None, 'vector_score': '0.375756263733', 'overall': '4'}, Document {'id': 'doc:reviewidx_json_multi:A3KM4IUYZGECBL', 'payload': None, 'vector_score': '0.473852336407', 'overall': '5'}]}

If we want to filter results - for eg we only want reviews that have an overall score of 3

In [102]:
# Formulate the query logic here we add overall score as a filter
query_syntax = "(@overall:3)=>[KNN 5 @vector $vec_param AS vector_score]"
filtered_vss_query=Query(query_syntax).return_fields("overall", "vector_score").sort_by("vector_score").dialect(2)

# query string embedded and transfered to parameter dictionary to be used in search 
# The term we use to perfrom a semantic search
query_string = "Very uncomfortable"
embedded_query = np.array(embeddings.embed_documents([query_string])).astype(np.float32).tobytes()
params_dict = {"vec_param": embedded_query}

# VSS with Redis
filtered_vss_results = r.ft('reviewidx_hashed').search(filtered_vss_query, query_params=params_dict) # vss - vectory similarity search

In [107]:
filtered_vss_results

Result{5 total, docs: [Document {'id': 'doc:reviewidx_hash:A1PWR1BTKHQ6YI', 'payload': None, 'vector_score': '0.375553369522', 'overall': '3'}, Document {'id': 'doc:reviewidx_hash:A1I2P233YDWG84', 'payload': None, 'vector_score': '0.726189613342', 'overall': '3'}, Document {'id': 'doc:reviewidx_hash:ARSARRLD6HGUW', 'payload': None, 'vector_score': '0.759466290474', 'overall': '3'}, Document {'id': 'doc:reviewidx_hash:A3M37ZYY651PZ7', 'payload': None, 'vector_score': '0.806888639927', 'overall': '3'}, Document {'id': 'doc:reviewidx_hash:A3VLCZ2RWKBJ83', 'payload': None, 'vector_score': '0.854798257351', 'overall': '3'}]}

### Pass the returned data to a LLM - to enhance the context of the LLM with our documents data

We use the Mistral ai because it is free

In [111]:
from langchain.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEndpoint

from langchain.chains import RetrievalQA
from langchain.chains.summarize import load_summarize_chain


repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

# Initialize the HuggingFaceEndpoint
chat_llm = HuggingFaceEndpoint(repo_id=repo_id,
                          temperature=0.2,
                          huggingfacehub_api_token=hugging_face_token)



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\Hori\.cache\huggingface\token
Login successful


#### We do a similarity search useing Langchain Reids

Connect to redis from Langchain using from_existing_index method, we pass the index name the schema of the index, the embeddings and the url

We need to define the schema of the index we want to query

In [132]:
schema = {
    "content": TextField("content"),
    "content_vector": VectorField("content_vector", "HNSW", {"TYPE": "FLOAT32", "DIM": 768, "DISTANCE_METRIC": "COSINE"})
}

vstore = Redis.from_existing_index(index_name='reviewidx', schema=schema, embedding=embeddings,redis_url=redis_url)

Define a chain that allows us to ask questions about the review data, stuff means that we add the docuemnts extracted to the llm to enhance it with the knowledge from  the docs

In [133]:
review_chain = RetrievalQA.from_chain_type(llm=chat_llm, chain_type="stuff", retriever=vstore.as_retriever())

Define the prompt with the question we want to ask the llm that was enahcned with the documents from Redis that match our question

In [134]:
q="""
The reviews you see are for a product called 'Powerstep Pinnacle Orthotic Shoe Insoles'.
What is the overall impression of these reviews? Give most prevalent examples in bullets.
What do you suggest we focus on improving?
"""

result=review_chain.run(q)

score_threshold is deprecated. Use distance_threshold instead.score_threshold should only be used in similarity_search_with_relevance_scores.score_threshold will be removed in a future release.


In [135]:
print(result)



The overall impression of these reviews is that the Powerstep Pinnacle Orthotic Shoe Insoles are highly effective in providing relief from plantar fasciitis and other foot-related issues. Here are some prevalent examples from the reviews:

- Worked for more than three years without any recurrence of foot pain or discomfort.
- Slipped easily into shoes and were comfortable.
- Required almost no care.
- Doctor prescribed and highly recommended.
- Provided good arch support and cushioning.

Based on these reviews, it seems that the Powerstep Pinnacle Orthotic Shoe Insoles are effective in providing relief from foot pain and discomfort, and are comfortable and easy to use. However, some users have mentioned that the edges can fray over time and may need to be replaced.

To improve the product, the manufacturer could consider using more durable materials for the edges of the insoles to increase their longevity. Additionally, they could explore options for customizing the insoles to better

#### How to do vector similarity search using redispy and pass the data to the llm

In [138]:
from langchain_core.documents import Document

We get the documents that match the semantic query

In [139]:
query_syntax = "*=>[KNN 5 @vector $vec_param AS vector_score]"

vss_query=Query(query_syntax).return_fields("overall", "vector_score","reviewText","summary").dialect(2)

query_string="Very uncomfortable"
embedded_query=np.array(embeddings.embed_documents([query_string])).astype(np.float32).tobytes()

params_dict = {"vec_param": embedded_query}

vss_results = r.ft('reviewidx_hashed').search(vss_query, query_params = params_dict)

In [140]:
vss_results

Result{5 total, docs: [Document {'id': 'doc:reviewidx_hash:A2975GY186VV7A', 'payload': None, 'vector_score': '0.365598320961', 'overall': '1', 'reviewText': 'Very uncomfortable feel like I wasted my money!', 'summary': 'Uncomfortable'}, Document {'id': 'doc:reviewidx_hash:AYS2XX2ZG696Q', 'payload': None, 'vector_score': '0.593012928963', 'overall': '4', 'reviewText': 'Comfortable, easy to "install" and I love that these come in "sizes" to match the shoe size, no trimming and misalignment. They\'ve held up well and I\'m running around a hospital all day.', 'summary': 'Comfortable and easy'}, Document {'id': 'doc:reviewidx_hash:A14JFIGG7U8A3D', 'payload': None, 'vector_score': '0.505637288094', 'overall': '5', 'reviewText': "It's great and comfortable", 'summary': 'Five Stars'}, Document {'id': 'doc:reviewidx_hash:A3KM4IUYZGECBL', 'payload': None, 'vector_score': '0.556343197823', 'overall': '5', 'reviewText': 'Very comfortable and well padded.  Only problem, too big for my shoes and boo

In [141]:
docs=[]
for review in vss_results.docs:
    result_string = ''
    result_string += " review score:" + review.overall + ' ' + review.reviewText + ' ' + review.summary
    docs.append(Document(page_content=result_string))

In [142]:
docs

[Document(page_content=' review score:1 Very uncomfortable feel like I wasted my money! Uncomfortable'),
 Document(page_content=' review score:4 Comfortable, easy to "install" and I love that these come in "sizes" to match the shoe size, no trimming and misalignment. They\'ve held up well and I\'m running around a hospital all day. Comfortable and easy'),
 Document(page_content=" review score:5 It's great and comfortable Five Stars"),
 Document(page_content=' review score:5 Very comfortable and well padded.  Only problem, too big for my shoes and boots.  Fit nicely in my sneakers. Very comfortable and well padded'),
 Document(page_content=' review score:3 Kind of hard. Not comfortable at all. Not comfortable at all')]

Here we use load_summarize_chain

In [143]:
prompt_template_summary = """
Write a summary of the reviews:

{text}

The summary should be about five lines long
"""
PROMPT = PromptTemplate(template=prompt_template_summary, input_variables=["text"])
chain = load_summarize_chain(chat_llm, chain_type="stuff", prompt=PROMPT)
summary=chain.run(docs)

In [144]:
print(summary)


These insoles received mixed reviews, with some customers praising their comfort and ease of installation, while others found them to be uncomfortable and ill-fitting. The insoles come in various sizes to match shoe sizes, reducing the need for trimming and misalignment. Some users reported that they held up well, even during long hospital shifts. However, others found them to be too large for their shoes and boots, and not comfortable at all. Overall, the insoles received an average rating, with some customers expressing disappointment and others expressing satisfaction.
