# Redis with Langchain

Load .env files and import some libraries

In [2]:
import os
import json
import gzip
import pandas as pd
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

True

Load the api keys

In [3]:
hugging_face_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
langchain_token = os.getenv("LANGCHAIN_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_env = os.getenv("PINECONE_ENV")
pinecone_index_host = os.getenv("PINECONE_INDEX_HOST")

Read the data

In [4]:
data_file_path = '../data/AMAZON_FASHION.json.gz'
metadata_file_path = '../data/meta_AMAZON_FASHION.json.gz'

def load_data(file_path):
    data = []
    with gzip.open(file_path) as file:
        for line in file:
            data.append(json.loads(line.strip()))
    return data            

def load_metadata(file_path):
    metadata = []
    with gzip.open(file_path) as file:
        for line in file:
            metadata.append(json.loads(line.strip()))
    return metadata

data = load_data(data_file_path)
metadata = load_metadata(metadata_file_path)

Convert the data to a padnas dataframe

In [5]:
df = pd.DataFrame.from_dict(data)
df = df[df['reviewText'].notna()]

df_meta = pd.DataFrame.from_dict(metadata)

Trunchate reviews so we don't process reviews that are too long

In [9]:
def trunchate_review(text, max_text_length=400):
    return text[:max_text_length]

df['truncated'] = df.apply(lambda row: trunchate_review(row['reviewText']), axis=1)

Find a product that has a lot of reviews - asin --> product id

In [10]:
df.groupby('asin').count().sort_values('overall')

Unnamed: 0_level_0,overall,verified,reviewTime,reviewerID,reviewerName,reviewText,summary,unixReviewTime,vote,style,image,truncated
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0764443682,1,1,1,1,1,1,1,1,0,0,0,1
B019YM0O28,1,1,1,1,1,1,1,1,0,0,0,1
B019YME2N0,1,1,1,1,1,1,1,1,0,0,0,1
B019YMNS08,1,1,1,1,1,1,1,1,0,0,0,1
B00S7N99AY,1,1,1,1,1,1,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
B000PHANNM,2566,2566,2566,2566,2566,2566,2563,2566,85,2563,112,2566
B00RLSCLJM,3633,3633,3633,3633,3633,3633,3632,3633,225,3538,210,3633
B00I0VHS10,3884,3884,3884,3884,3884,3884,3880,3884,128,3872,107,3884
B000KPIHQ4,4371,4371,4371,4371,4370,4371,4369,4371,193,3346,38,4371


Extract the product with a lot of reviews

In [11]:
df_meta[df_meta.asin=='B000KPIHQ4'].values

array([['Powerstep Pinnacle Orthotic Shoe Insoles', nan,
        list(['Shipping Information:\n                    \nView shipping rates and policies']),
        '154inClothing,Shoesamp;Jewelry(', '5 star', 'B000KPIHQ4',
        list(['https://images-na.ssl-images-amazon.com/images/I/414VFpnmvjL._US40_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/51yLLxuD5%2BL._US40_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/51NJmYTkeiL._US40_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/41VRUCCVKEL._US40_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/51b-GUTXm0L._US40_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/41mORzqQTwL._US40_.jpg', 'https://images-na.ssl-images-amazon.com/images/I/61RHVYCqQcL._US40_.jpg']),
        list(['https://images-na.ssl-images-amazon.com/images/I/414VFpnmvjL.jpg', 'https://images-na.ssl-images-amazon.com/images/I/51yLLxuD5%2BL.jpg', 'https://images-na.ssl-images-amazon.com/images/I/51NJmYTkeiL.jpg', 'https://images-

#### Create the embedding vectors for the review

We will use only a slice of te data

In [12]:
df = df.loc[df['asin'] == 'B000KPIHQ4'].copy()

We will use the Hugging Face embeddings

In [13]:
from langchain.embeddings import HuggingFaceEmbeddings
from tqdm import tqdm

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


In [14]:
df['embeddings'] = df.apply(lambda row: embeddings.embed_query(row['truncated']), axis=1)

Check the embedding dimensions

In [15]:
first_embedding = df['embeddings'].iloc[0]
embeddings_dimension = len(first_embedding)

print(embeddings_dimension)

384


Embedding dimension is 384

### Upload the data to Pinecone Vector Store

In [23]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=pinecone_api_key)

Create the index

In [24]:
import time

index_name = "amazon-data-index"  # change if desired

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=embeddings_dimension,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

We transfrom the trunchated reviews text column into a list of reviews

In [25]:
texts = df['truncated'].tolist()

In [26]:
texts

['Good price, good product. Howver, it is generic and if you really need orthotics, best to have them individually fitted. These are a good value.',
 "My husband rates these insoles a 5 for comfort. He hasn't noticed any improvment as far as leg or foot pain and has wore them consistantly since Christmas. The owner of the Red Wing store where we get his work boots highly recommended them and he was right. They make heavy, steel toed work boots more bearable. Can't say they will cure or even help with orthopedic problems though. Guess you need to",
 'I have worn the Powerstep Pinnacle shoe insoles for the past 5 years and love them.  They are so comfortable and since I have been wearing them have had no foot pain or other discomfort.',
 'Very uncomfortable feel like I wasted my money!',
 'work perfect',
 'Comfortable, easy to "install" and I love that these come in "sizes" to match the shoe size, no trimming and misalignment. They\'ve held up well and I\'m running around a hospital all 

Upload the reviews with the built in from_texts methods using the hugging face embeddings

In [27]:
from langchain_pinecone import PineconeVectorStore

vstore = PineconeVectorStore.from_texts(texts, embeddings, index_name=index_name)

We do a vector similarity search now

In [28]:
query = 'the quality is good'
result = vstore.similarity_search(query)

In [29]:
result

[Document(page_content='Quality is everything!'),
 Document(page_content='Good Price, Good Quality'),
 Document(page_content='Acceptable quality. No issues'),
 Document(page_content='Good quality, just not right for my feet as I thought.')]

Make the LLM access the data from the vector store and feed it to the language model

We use a hugging face model because is free

In [30]:
from langchain.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEndpoint

from langchain.chains import RetrievalQA
from langchain.chains.summarize import load_summarize_chain


repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

# Initialize the HuggingFaceEndpoint
chat_llm = HuggingFaceEndpoint(repo_id=repo_id,
                          temperature=0.1,
                          huggingfacehub_api_token=hugging_face_token)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\Hori\.cache\huggingface\token
Login successful


Now we define a chain using RetreivalQA that takes the language model, the vector sotre and a chain type, we use 'stuff' because we take the related data from the vector store and we use that in the prompt and we use that as a context and we pass it to the language model

In [31]:
review_chain = RetrievalQA.from_chain_type(llm=chat_llm, chain_type="stuff", retriever=vstore.as_retriever())

Now we define the query

In [32]:
q="""
The reviews you see are for a product called 'Powerstep Pinnacle Orthotic Shoe Insoles'.
What is the overall impression of these reviews? Give most prevalent examples in bullets. 
What do you suggest we focus on improving?
"""

result=review_chain.run(q)
print(result)
     

  warn_deprecated(




Overall Impression:
- The Powerstep Pinnacle Orthotic Shoe Insoles receive high praise from reviewers for their excellent support and affordability.
- They are recommended for individuals with plantar fasciitis and those seeking an affordable alternative to custom orthotics.
- Reviewers have purchased these insoles multiple times due to their durability and effectiveness.

Areas for Improvement:
- Some reviewers suggest that the Powerstep Pinnacle insoles may not be as comfortable or compliant as other insoles for certain individuals.
- It is recommended to try both the Powerstep Pinnacle and Powerstep Original insoles to determine which one is best for individual needs.
- There is a need for more color options and sizes to cater to a wider range of customers.

Additional Insight:
- The Powerstep Pinnacle Orthotic Shoe Insoles are a popular choice among consumers due to their affordability, durability, and effectiveness in providing support for various foot conditions.
- However, it 

#### Filtered Vector similarity search

We want to filter all the reviews of a given rating that match a specific theme so we need to filter on metadata to do that

In [33]:
df=df.rename(columns={'embeddings' : 'values', 'reviewerID' : 'id'})
df['metadata'] = df.apply(lambda row: dict(rating = row['overall']), axis=1)

In [34]:
data = df[['metadata', 'values', 'id']].to_dict(orient='records') # one dict for uploading to vstore
data_local = df[['metadata', 'values', 'reviewText', 'id']].to_dict(orient='records') # one for local to extracting the actual reviews

Create the new index for filtered data

In [35]:
filtered_index_name = "amazon-filtered-data-index"  # change if desired

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if filtered_index_name not in existing_indexes:
    pc.create_index(
        name=filtered_index_name,
        dimension=embeddings_dimension,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(filtered_index_name).status["ready"]:
        time.sleep(1)

filtered_index = pc.Index(filtered_index_name)

Upload the data in batches of 50 using the upsert pinecone method

In [36]:
from tqdm.auto import tqdm

for i in tqdm(range(0, len(data), 50)):
    j = i + 50
    if j > len(data):
        j = len(data)
    batch = data[i: j]
    filtered_index.upsert(vectors=batch)

  0%|          | 0/88 [00:00<?, ?it/s]

We will search for a specific query and add a filter of rating eq 4.0

First I will ask a question and embeed it

In [39]:
xq = embeddings.embed_query("will buy again")
print(xq)

[-0.08148110657930374, -0.04554428905248642, -0.009914864785969257, 0.009966453537344933, -0.05037052184343338, -0.04346311092376709, 0.03173936530947685, 0.006288859993219376, 0.02704247459769249, -0.04681912064552307, 0.024790704250335693, 0.04334556311368942, -0.027310164645314217, 0.010995028540492058, -0.0043119341135025024, 0.03811481595039368, -0.037653759121894836, 0.049283698201179504, -0.03616303578019142, -0.0896780788898468, -0.0859656035900116, -0.03157638758420944, -0.024117620661854744, -0.009150802157819271, -0.04708412289619446, 0.03158921003341675, -0.03256243094801903, 0.059578511863946915, 0.022172559052705765, -0.0842202752828598, -0.022154709324240685, 0.04300077259540558, -0.002154940739274025, 0.022822272032499313, 0.08162523061037064, -0.04305995628237724, 0.0434039905667305, -0.055803753435611725, 0.0027836172375828028, -0.014202188700437546, -0.027747441083192825, -0.025026042014360428, -0.06913257390260696, -0.02093113400042057, 0.040932971984148026, 0.09919

Use the similarity search with filters

In [44]:
conditions = {"rating": {"$eq": 4.0}}
results = filtered_index.query(vector=[xq], top_k=10, filter=conditions)

print(results)

{'matches': [{'id': 'ANZQMUPAH06T7', 'score': 0.391356736, 'values': []},
             {'id': 'A2FZD6U4J0X5SJ', 'score': 0.365257353, 'values': []},
             {'id': 'A14ZBZ2XOFA6Y6', 'score': 0.354094744, 'values': []},
             {'id': 'A2VMXOLHVQXU33', 'score': 0.346238375, 'values': []},
             {'id': 'A20MK465OYJ4L5', 'score': 0.346086144, 'values': []},
             {'id': 'A3UK6YCKJ13G83', 'score': 0.305876106, 'values': []},
             {'id': 'A1OHEETPLSNZA5', 'score': 0.297214985, 'values': []},
             {'id': 'A3N0RDRGB9DNQN', 'score': 0.294205248, 'values': []},
             {'id': 'A2ECI8FJ8GYO77', 'score': 0.289532155, 'values': []},
             {'id': 'A1RCFWUHLGHMS4', 'score': 0.289532155, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}


To get the actual review texts we use the local version of the data. We wrap the qyery in a python function that executes a vector similarity search and loops over the local data and extracts all the matching reviews from the local data

In [95]:
# Get the rating from id

get_detailed_data_for_id = {
    x['id']: {
        'id': x['id'],
        'rating': x['metadata']['rating'],
        'review': x['reviewText'],
    } for x in data_local}

In [96]:
# Python function that retrieves reviews matching query and specific rate
def review_and_rating(query, rating, index, top_k=10):
    query = embeddings.embed_query(query)
    results = index.query(vector=[query], top_k=top_k, filter={'rating': {'$eq': rating}})
    id_list = [match['id'] for match in results['matches']]
    details=[]
    for i in id_list:
        details.append(get_detailed_data_for_id[i])
    return pd.DataFrame(details)

In [98]:
review_and_rating("Will buy again", 5.0, filtered_index, 20)

Unnamed: 0,id,rating,review
0,A3S0KCW0QMN2WH,5.0,Would buy again
1,A3PXET0DIW2AFA,5.0,"Good service, will buy again"
2,AF2J5J39XFNQR,5.0,Excellent! Would definitely buy again!
3,A393FUY8Z8QQH0,5.0,"Great Item, I will be it buying again!"
4,A1PE9V3LOUIZ20,5.0,Love them Will be ordering again
5,A14PB7YDEKHNQQ,5.0,Excellent product. Will order again.
6,A4YAKZFSR5JD1,5.0,Happy with purchase
7,A24IDIH2PAOJVJ,5.0,"Awesome, bought 2 more."
8,A2J0HNPQV0TFOQ,5.0,Excellent fit ! Will buy again!
9,A3NWW50UOKECGO,5.0,Happy with my purchase
