In [39]:
import openai
import fastembed
import pandas as pd

from qdrant_client import QdrantClient, models
from qdrant_client.models import VectorParams, PayloadSchemaType, Distance, SparseVectorParams, PointStruct, Document, Prefetch, FusionQuery



### Create a Qdrant collection for hybrid search

In [40]:
# Client for the Qdrant instance at http://localhost:6333; the cells below reuse it.
qdrant_client = QdrantClient(url="http://localhost:6333")

In [None]:
# Create the collection with two named vector spaces per point:
#   - "text-embedding-3-small": dense vector, 1536 dimensions, cosine distance
#   - "bm25": sparse vector configured with the IDF modifier
# NOTE(review): this cell has no execution count, and re-running it against an
# already existing collection presumably fails — confirm before Restart & Run All.
qdrant_client.create_collection(
    collection_name="Amazon-items-collection-01-hybrid-search",
    vectors_config={"text-embedding-3-small": VectorParams(size=1536, distance=Distance.COSINE)},
    sparse_vectors_config={"bm25": SparseVectorParams(modifier=models.Modifier.IDF)}
)

### Create a payload index on `parent_asin`

In [43]:
# Build a keyword-type payload index on the "parent_asin" field of the collection.
# NOTE(review): presumably intended for filtering/lookup by product ID — nothing
# in the visible cells filters on this field yet.
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-01-hybrid-search",
    field_name="parent_asin",
    field_schema=PayloadSchemaType.KEYWORD
)

UpdateResult(operation_id=7, status=<UpdateStatus.COMPLETED: 'completed'>)

### Create embeddings

In [58]:
def get_embedding(text, model="text-embedding-3-small"):
    """Embed a single piece of text (used for the user query at retrieval time).

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to ``openai.embeddings.create``.

    Returns:
        The ``embedding`` attribute of the first item in the API response.
    """
    api_response = openai.embeddings.create(input=[text], model=model)
    first_item = api_response.data[0]
    return first_item.embedding

In [44]:
def get_embeddings_batch(text_list, model="text-embedding-3-small", batch_size=100):
    """Embed a list of texts, sending at most ``batch_size`` texts per request.

    Args:
        text_list: Texts to embed.
        model: Name of the embedding model passed to ``openai.embeddings.create``.
        batch_size: Maximum number of texts per API request; must be >= 1.

    Returns:
        A list with one embedding per response item, accumulated batch by
        batch (assumed to follow the input order). Returns ``[]`` for empty
        input without calling the API.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    total = len(text_list)
    if total == 0:
        # Nothing to embed: skip the request instead of sending an empty input.
        return []

    # A call that fits into one request stays silent; only multi-batch runs
    # report progress.
    show_progress = total > batch_size

    all_embeddings = []
    for start in range(0, total, batch_size):
        batch = text_list[start:start + batch_size]
        response = openai.embeddings.create(input=batch, model=model)
        all_embeddings.extend(item.embedding for item in response.data)
        if show_progress:
            # Report texts actually sent so far, so a short final batch is not
            # over-counted (e.g. "Processed 600 of 550").
            print(f"Processed {min(start + batch_size, total)} of {total}")

    return all_embeddings

### Read & apply preprocessing steps to 500 products

In [45]:
# Load the sampled product metadata (JSON Lines: one record per line) into a DataFrame.
df_items = pd.read_json("../../data/meta_Clothing_Stores_and_Jewelry_2022_2023_with_category_ratings_100_sample_500.jsonl", lines=True)

In [46]:
def _join_text(value):
    """Return ``value`` as a single space-separated string.

    Strings are returned unchanged, other iterables are joined with single
    spaces, and ``None`` or non-iterable values become ``""``.
    """
    if value is None:
        return ""
    if isinstance(value, str):
        # Already text (e.g. the flattened "details" column): joining a string
        # would insert a space between every character.
        return value
    try:
        return " ".join(str(item) for item in value)
    except TypeError:
        # Not iterable, e.g. a missing value loaded as NaN.
        return ""


def preprocess_description(row):
    """Build the text that represents one product for embedding and BM25.

    Concatenates, separated by single spaces and in this order: ``title``,
    ``features``, ``description``, ``details`` and ``categories``.

    Args:
        row: Mapping-like product record (e.g. a DataFrame row) providing the
            five fields above. The list-valued fields may also be plain
            strings, which are used as-is.

    Returns:
        The combined description string.
    """
    joined_fields = ("features", "description", "details", "categories")
    parts = [str(row["title"])]
    parts.extend(_join_text(row[field]) for field in joined_fields)
    return " ".join(parts)

In [47]:
def extract_large_image(row):
    """Return the "large" URL of the product's first image.

    Args:
        row: Mapping-like product record with an ``images`` entry, expected to
            be a list of dicts.

    Returns:
        The first image's ``"large"`` value as a string, or ``""`` when the
        image list is empty or missing, the first entry is not a dict, or it
        has no usable ``"large"`` value.
    """
    try:
        first_image = row["images"][0]
    except (IndexError, KeyError, TypeError):
        # Empty list, missing column, or a non-subscriptable value such as NaN/None.
        return ""

    if not isinstance(first_image, dict):
        return ""

    large_url = first_image.get("large", "")
    # Avoid storing the literal text "None" when the key exists but is null.
    return str(large_url) if large_url else ""

In [48]:
def extract_video_url(row):
    """Return the URL of the product's first video, or '' when there is none.

    Args:
        row: Mapping-like product record with a ``videos`` entry.

    Returns:
        The ``'url'`` value of the first video ('' if that key is absent), or
        '' when ``videos`` is falsy or empty.
    """
    videos = row['videos']
    # Guard clause: nothing to extract from a missing or empty video list.
    if not videos or len(videos) == 0:
        return ''
    first_video = videos[0]
    return first_video.get('url', '')

In [49]:
def preprocess_details(row):
    """Flatten a details dict into one "key: value. key: value" string.

    Args:
        row: The value of a product's ``details`` field.

    Returns:
        The dict's items rendered as "key: value" and joined with ". ";
        "" when ``row`` is not a dict.
    """
    if isinstance(row, dict):
        rendered_items = (f"{name}: {content}" for name, content in row.items())
        return ". ".join(rendered_items)
    return ""


In [50]:
# Order matters: "details" is flattened to a string first, and the next line
# builds "description" from the row's current "details" value.
# NOTE: "details" and "description" are overwritten in place, so this cell is
# not idempotent — on a second run preprocess_details receives strings and
# returns "". Reload df_items before re-running.
df_items['details'] = df_items['details'].apply(preprocess_details)
df_items["description"] = df_items.apply(preprocess_description,axis=1)
# Derived columns: first image's "large" URL and first video's URL.
df_items["image"] = df_items.apply(extract_large_image, axis=1)
df_items["video"] = df_items.apply(extract_video_url, axis=1)


In [51]:
# Sanity check: (rows, columns) of the frame after adding the derived columns.
df_items.shape

(500, 18)

### Data for embedding

In [52]:
# One dict per product with only the selected columns; each dict is later stored
# as that point's payload.
data_to_embed = df_items[["parent_asin","description","price", "average_rating", "rating_number", "image", "video", "store"]].to_dict(orient="records")

In [53]:
# The combined description string of each product is the text that gets embedded.
text_to_embed = [data["description"] for data in data_to_embed]

### PointStructs (tie together the dense vector, the sparse vector, and the payload) & upsert to Qdrant

In [54]:
# Dense embeddings for every description, requested in batches (calls the OpenAI API).
embeddings = get_embeddings_batch(text_to_embed)

Processed 100 of 500
Processed 200 of 500
Processed 300 of 500
Processed 400 of 500
Processed 500 of 500


In [55]:
# Build one PointStruct per product: the dense embedding and a BM25 document
# (for the sparse vector) go under their named vector slots, and the product
# record becomes the payload. Point IDs are 1-based positions in the input.
# strict=True makes a length mismatch between embeddings and records raise
# instead of silently dropping the trailing products.
pointstructs = []
for point_id, (embedding, data) in enumerate(
    zip(embeddings, data_to_embed, strict=True), start=1
):
    pointstructs.append(
        PointStruct(
            id=point_id,
            vector={
                "text-embedding-3-small": embedding,
                "bm25": Document(
                    text=data["description"],
                    model="qdrant/bm25"
                )
            },
            payload=data
        )
    )

In [56]:
# Write all points to the hybrid-search collection in a single upsert call.
qdrant_client.upsert(
    collection_name="Amazon-items-collection-01-hybrid-search",
    points=pointstructs
)

UpdateResult(operation_id=8, status=<UpdateStatus.COMPLETED: 'completed'>)

### Hybrid Retrieval

In [62]:
def retrieve_data(
    query,
    qdrant_client,
    top_k=5,
    collection_name="Amazon-items-collection-01-hybrid-search",
    prefetch_limit=20,
):
    """Run a hybrid (dense + BM25) search and collect the matching products.

    The query is embedded with ``get_embedding`` for the dense branch and
    passed as a ``qdrant/bm25`` document for the sparse branch; both candidate
    lists are combined with reciprocal rank fusion ("rrf").

    Args:
        query: The user's search text.
        qdrant_client: Client exposing ``query_points``.
        top_k: Number of fused results to return.
        collection_name: Collection to search.
        prefetch_limit: Candidates fetched per branch before fusion; raised to
            ``top_k`` when smaller so each branch can supply enough candidates.

    Returns:
        A dict of four parallel lists, one entry per returned point:
        ``retrieved_context_ids`` (payload "parent_asin"),
        ``retrieved_context`` (payload "description"),
        ``retrieved_context_ratings`` (payload "average_rating") and
        ``similarity_scores`` (the score reported for the fused result).
        Missing payload fields yield ``None``.
    """
    query_embedding = get_embedding(query)
    candidate_limit = max(prefetch_limit, top_k)

    dense_branch = Prefetch(
        query=query_embedding,
        using="text-embedding-3-small",
        limit=candidate_limit,
    )
    sparse_branch = Prefetch(
        query=Document(
            text=query,
            model="qdrant/bm25"
        ),
        using="bm25",
        limit=candidate_limit,
    )

    results = qdrant_client.query_points(
        collection_name=collection_name,
        prefetch=[dense_branch, sparse_branch],
        query=FusionQuery(fusion="rrf"),
        limit=top_k
    )

    retrieved_context_ids = []
    retrieved_context = []
    retrieved_context_ratings = []
    similarity_scores = []

    for result in results.points:
        payload = result.payload or {}
        retrieved_context_ids.append(payload.get("parent_asin"))
        retrieved_context.append(payload.get("description"))
        # The indexed payload stores the rating under "average_rating"; fall
        # back to "rating" in case a payload uses that key instead.
        retrieved_context_ratings.append(
            payload.get("average_rating", payload.get("rating"))
        )
        similarity_scores.append(result.score)

    return {
        "retrieved_context_ids": retrieved_context_ids,
        "retrieved_context": retrieved_context,
        "retrieved_context_ratings": retrieved_context_ratings,
        "similarity_scores": similarity_scores
    }


In [63]:
# Example query against the indexed collection; the returned dict is shown as the cell output.
retrieve_data("Can I get a pair of jeans?", qdrant_client, top_k=5)

{'retrieved_context_ids': ['B0C4HFLLBY',
  'B0BVVL6QF7',
  'B0BC4BLVV3',
  'B09R4WN6HN',
  'B0C57GCZTJ'],
 'retrieved_context': ['Oprah Favorite Jeans, Seamed Front Wide Leg Jeans Elastic Waist, 2023 New Wide Leg Jeans for Women Polyester,Spandex,Cotton Imported Pull On closure 【Seamed Front Wide Leg Jeans】 you can wear it comfortably all day long. 【Hidden Tummy Control Design】Set the vibe by pairing them with your choice of jacket, tee, or tank. 【No Button & No Zipper&Perfect Pockets】With a pull-on design, our denim moves and stretches with your body. 【Dye Download 】 Jeans Color May Transfer to Other Lighter Materials or Surfaces. 【Premium Fabric】62% Cotton, 28% Polyester, 10% Spandex. Specification: Name: Seamed Front Wide Leg Jeans Color: Blue, Black  Package Includes: 1 Piece * Seamed Front Wide Leg Jeans  Notes: 1. Due to manual measurements, please allow slight measurement deviations. 2. Due to the different display and lighting effects, the actual color of the item may be slight