# Introducing multiple tools to the agent

### Rebuild Qdrant collections combining 2 datasets: items metadata + reviews.

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PayloadSchemaType, PointStruct, MatchAny, FieldCondition, Filter, Prefetch, FusionQuery

import pandas as pd
import openai
import json
import tiktoken

### Data: *items* dataset

Read

In [None]:
# Read the items metadata; lines=True parses the file as JSON Lines (one JSON record per line)
df_items = pd.read_json("../data/meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl", lines=True)

In [None]:
# Sample 50 items; the fixed random_state makes the same rows come back on every run
df_items_sample = df_items.sample(n=50, random_state=25)

Preprocess

In [None]:
# New column: title and features merged into a single text string per item
def preprocess_items_data(row):
    """Return the row's title followed by its features, separated by single spaces.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) holding a 'title'
            value and an iterable of feature strings under 'features'.

    Returns:
        The title, one space, then all features joined by spaces.
    """
    title = row["title"]
    features_text = " ".join(row["features"])
    return "{} {}".format(title, features_text)

# New column: Extract the first large image
def extract_first_large_image(row):
    """Return the 'large' URL of the row's first image, or '' when there is none.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) whose 'images' value
            is expected to be a list of dicts.

    Returns:
        The 'large' entry of the first image dict. An empty string is returned
        when that dict has no 'large' key, when the image list is empty, or
        when 'images' is not indexable (e.g. None or NaN for a missing value).
    """
    try:
        first_image = row["images"][0]
    except (IndexError, TypeError):
        # Items without any images must not abort the whole DataFrame.apply run
        return ""
    return first_image.get("large", "")

In [None]:
# Add the derived columns by applying each helper row-wise (axis=1)
df_items_sample["preprocessed_data"] = df_items_sample.apply(preprocess_items_data, axis=1)
df_items_sample["first_large_image"] = df_items_sample.apply(extract_first_large_image, axis=1)
# Preview the first two rows to check the new columns
df_items_sample.head(2)

### Data: *reviews* data

Read

In [None]:
# Read the reviews dataset; lines=True parses the file as JSON Lines (one JSON record per line)
df_reviews = pd.read_json("../data/Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl", lines=True)
# Number of reviews loaded
len(df_reviews)

In [None]:
# Keep only the reviews whose parent_asin (the item id) appears in the items sample.
# .copy() makes the result an independent DataFrame, so the columns assigned to it
# later in the notebook do not trigger pandas' SettingWithCopyWarning.
df_reviews_sample = df_reviews[df_reviews['parent_asin'].isin(df_items_sample['parent_asin'])].copy()
print(df_reviews_sample.shape)
df_reviews_sample.head(2)

Preprocess

In [None]:
# Merge the review title and body into the single string that gets embedded
def preprocess_reviews_data(row):
    """Return the review's title and text joined by one space.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with 'title' and
            'text' values.
    """
    title = row["title"]
    body = row["text"]
    return "{} {}".format(title, body)

# Token count of the preprocessed text, using the tokenizer tiktoken maps to the model
def token_count(row, model="text-embedding-3-small"):
    """Return how many tokens row["preprocessed_data"] encodes to.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a
            'preprocessed_data' string.
        model: Model name passed to tiktoken.encoding_for_model to select
            the encoding.
    """
    tokenizer = tiktoken.encoding_for_model(model)
    tokens = tokenizer.encode(row["preprocessed_data"])
    return len(tokens)

In [None]:
# Build the text to embed for every review, then record its token count
df_reviews_sample["preprocessed_data"] = df_reviews_sample.apply(preprocess_reviews_data, axis=1)
df_reviews_sample["preprocessed_data_token_count"] = df_reviews_sample.apply(token_count, axis=1)
print(df_reviews_sample.shape)

# Drop reviews that are too long for embedding (token length limit, for the specific model):
# only rows with fewer than 8192 tokens are kept
fits_token_limit = df_reviews_sample["preprocessed_data_token_count"].lt(8192)
df_reviews_sample = df_reviews_sample[fits_token_limit]
print(df_reviews_sample.shape)
df_reviews_sample.head(2)


### New Qdrant collection (items)

In [None]:
# Client for the Qdrant server expected at http://localhost:6333; shared by all cells below
qdrant_client = QdrantClient(url="http://localhost:6333")

In [None]:
# Create the collection that stores the item vectors.
# Passing vectors_config defines the vector field; per the original notes, the vector index is
# built implicitly from this configuration, whereas payload indexes (text, parent_asin) require
# explicit create_payload_index calls.
# size=1536 has to equal the length of the embedding vectors upserted into this collection.
qdrant_client.create_collection(
    collection_name="Amazon-items-collection-02-items",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE), # cosine distance; vector (HNSW) indexing is enabled implicitly
)

In [None]:
 # Create a text payload index on the "text" field (the collection holds no data yet).
# The hybrid search further down filters on this field with a text match.
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-02-items",
    field_name="text",
    field_schema=PayloadSchemaType.TEXT # Qdrant schema type for free text
)

In [None]:
# Create a keyword payload index on the id field (parent_asin); the collection holds no data yet.
# Indexing the id makes it faster to retrieve items by id.
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-02-items",
    field_name="parent_asin",
    field_schema=PayloadSchemaType.KEYWORD # For categorical text (tags, ids, etc.); only supports exact matches
)

### New Qdrant collection (reviews)

In [None]:
# Create the collection that stores the review vectors, configured like the items
# collection: 1536-dimensional vectors compared by cosine distance
qdrant_client.create_collection(
    collection_name="Amazon-items-collection-02-reviews",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

In [None]:
# Keyword payload index on parent_asin, the field the reviews search filters on
# (exact match against a list of item ids)
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-02-reviews",
    field_name="parent_asin",
    field_schema=PayloadSchemaType.KEYWORD
)

### Embed & add to Qdrant collection

Functions

In [None]:
# Embed a single text string (old function); one API call per text, so slow for many texts
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector of a single text.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the OpenAI embeddings API.

    Returns:
        The embedding of the first (and only) item in the API response.
    """
    response = openai.embeddings.create(model=model, input=[text])
    first_item = response.data[0]
    return first_item.embedding

In [None]:
# Embed in batches: several text strings per API call (faster than one call per text)
def get_embeddings_batch(text_list, model="text-embedding-3-small", batch_size=100):
    """Return one embedding per input text, requesting them in batches.

    Args:
        text_list: Sequence of strings to embed.
        model: Name of the embedding model passed to the OpenAI embeddings API.
        batch_size: Maximum number of texts sent in a single request.

    Returns:
        A list of embeddings in the same order as text_list. An empty input
        returns an empty list without calling the API.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    total = len(text_list)
    if total == 0:
        # Nothing to embed; do not send an empty input to the API
        return []

    # Progress is only reported when more than one request is needed
    report_progress = total > batch_size

    all_embeddings = []
    for start in range(0, total, batch_size):
        batch = text_list[start:start + batch_size]
        response = openai.embeddings.create(input=batch, model=model)
        all_embeddings.extend(item.embedding for item in response.data)
        if report_progress:
            # Clamp to total so a final partial batch is not over-reported
            print(f"Processed {min(start + batch_size, total)} of {total}")

    return all_embeddings

Items: Embed the text data and add additional fields to the payload of each vector (items)

In [None]:
# Columns kept for the items: the text to embed plus the fields stored in the payload
item_columns = [
    "preprocessed_data",
    "first_large_image",
    "rating_number",
    "price",
    "average_rating",
    "parent_asin",
]
# One dict per item (orient="records")
data_to_embed_items = df_items_sample[item_columns].to_dict(orient="records")

# Inspect the first record
data_to_embed_items[0]

In [None]:
# Only the preprocessed text column is embedded; every other field goes into the payload
text_to_embed_items = [record["preprocessed_data"] for record in data_to_embed_items]
embeddings_items = get_embeddings_batch(text_to_embed_items)

In [None]:
# Build one point per item: a generated sequential id (starting at 1), the embedding
# as the vector, and the item's fields as the payload
pointstructs = [
    PointStruct(
        id=point_id,  # generated id
        vector=item_vector,
        payload={
            "text": item["preprocessed_data"],
            "first_large_image": item["first_large_image"],
            "average_rating": item["average_rating"],
            "rating_number": item["rating_number"],
            "price": item["price"],
            "parent_asin": item["parent_asin"],  # real product id
        },
    )
    for point_id, (item_vector, item) in enumerate(
        zip(embeddings_items, data_to_embed_items), start=1
    )
]

In [None]:
# Add the item embeddings and payloads to the items collection in a single request;
# wait=True is passed so the call waits for the operation to complete
qdrant_client.upsert(
    collection_name="Amazon-items-collection-02-items",
    wait=True,
    points=pointstructs
)

Reviews: Embed the text data and add additional fields to the payload of each vector (reviews)

In [None]:
# Columns kept for the reviews: the text to embed and the id of the reviewed item
review_columns = ["preprocessed_data", "parent_asin"]
data_to_embed_reviews = df_reviews_sample[review_columns].to_dict(orient="records")

# Embed the review texts in batches
text_to_embed_reviews = [review["preprocessed_data"] for review in data_to_embed_reviews]
embeddings_reviews = get_embeddings_batch(text_to_embed_reviews)
# Number of embeddings returned
len(embeddings_reviews)

In [None]:
# Build one point per review: a generated sequential id (starting at 1), the embedding
# as the vector, and the review text plus the reviewed item's id as the payload
pointstructs = [
    PointStruct(
        id=point_id,
        vector=review_vector,
        payload={
            "text": review["preprocessed_data"],
            "parent_asin": review["parent_asin"],
        },
    )
    for point_id, (review_vector, review) in enumerate(
        zip(embeddings_reviews, data_to_embed_reviews), start=1
    )
]

In [None]:
# Upload the review points to Qdrant in chunks of batch_size_qdrant
batch_size_qdrant = 100
total_points = len(pointstructs)
for start in range(0, total_points, batch_size_qdrant):
    qdrant_client.upsert(
        collection_name="Amazon-items-collection-02-reviews",
        wait=True,
        points=pointstructs[start:start + batch_size_qdrant],
    )
    # Clamp to the total so the final partial batch is not over-reported
    print(f"Processed {min(start + batch_size_qdrant, total_points)} of {total_points}")

### Items: Hybrid search (vector search + keyword text match, merged using RRF)

In [None]:
from qdrant_client.models import Prefetch, Filter, FieldCondition, MatchText, FusionQuery

def retrieve_data(query, k=5):
    """Run a hybrid search over the items collection and return the fused result.

    Two candidate lists of up to 20 points each are prefetched - one by
    vector similarity to the embedded query, one by a text match of the raw
    query against the "text" payload field - and merged with RRF fusion.

    Args:
        query: Search text; it is embedded for the vector branch and used
            verbatim for the text-match branch.
        k: Maximum number of fused results to return.

    Returns:
        The response object of qdrant_client.query_points.
    """
    # Branch 1: vector search with the query embedding
    vector_prefetch = Prefetch(query=get_embedding(query), limit=20)

    # Branch 2: points whose "text" payload matches the query text
    text_condition = FieldCondition(key="text", match=MatchText(text=query))
    text_prefetch = Prefetch(filter=Filter(must=[text_condition]), limit=20)

    return qdrant_client.query_points(
        collection_name="Amazon-items-collection-02-items",
        prefetch=[vector_prefetch, text_prefetch],
        query=FusionQuery(fusion="rrf"),  # merges the results of both prefetches
        limit=k,
    )

In [None]:
# Retrieve from the items collection with the hybrid search (default k=5)
result = retrieve_data("earphones")
# Show the returned points
result.points

In [None]:
# Collect the product ids (parent_asin) of the points retrieved from the items collection
parent_asins = [point.payload["parent_asin"] for point in result.points]
parent_asins

### Reviews: Vector search within filtered records

A function to run a search against reviews on a prefiltered set of product IDs

In [None]:
def retrieve_prefiltered_reviews_data(query, parent_asins, k=5):
    """Vector-search the reviews collection, restricted to the given product ids.

    Args:
        query: Search text; it is embedded and used for the vector search.
        parent_asins: List of product ids; only reviews whose "parent_asin"
            payload equals one of them are considered.
        k: Maximum number of results to return.

    Returns:
        The response object of qdrant_client.query_points.
    """
    query_vector = get_embedding(query)

    # Exact-match filter on the id narrows the search space to the selected products
    asin_condition = FieldCondition(key="parent_asin", match=MatchAny(any=parent_asins))

    # Vector search within the filtered subset; up to 20 candidates
    reviews_prefetch = Prefetch(
        query=query_vector,
        filter=Filter(must=[asin_condition]),
        limit=20,
    )

    return qdrant_client.query_points(
        collection_name="Amazon-items-collection-02-reviews",
        prefetch=[reviews_prefetch],
        query=FusionQuery(fusion="rrf"),  # not really needed: there is only one prefetch to merge
        limit=k,
    )

In [None]:
# Search the reviews of five specific products (hard-coded parent_asin values)
# for text similar to "bad quality"
reviews = retrieve_prefiltered_reviews_data(
    "bad quality", 
    ['B09NLTDHQ6', 'B0C6KBJMHP', 'B098K6N6TX', 'B0B1DM4Y5C', 'B09Q5W9HPQ'], 
    k=5)
# Show the returned points
reviews.points