### Hybrid Vector datasource for Advanced Retrieval

In [6]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PayloadSchemaType, PointStruct, SparseVectorParams, Document, Prefetch, FusionQuery
from qdrant_client.http.models import models
import pandas as pd
import openai
import fastembed

### create hybrid collection

In [None]:
qdrant_client = QdrantClient(url="http://localhost:6333")

### create collection
collection_name = "amazon_items-collection-hybrid-02"

qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config = {
        "text-embedding-3-small": VectorParams(size=1536, distance=Distance.COSINE)
    },
    sparse_vectors_config = {
        "bm25": SparseVectorParams(modifier=models.Modifier.IDF)
    }
)

In [None]:
qdrant_client.create_payload_index(
    collection_name="amazon_items-collection-hybrid-02",
    field_name="parent_asin",
    field_schema=PayloadSchemaType.KEYWORD
)

In [None]:
def get_embeddings_batch(text_list, model="text-embedding-3-small", batch_size=100):
    """
    Get embeddings for a list of text using a specified model.
    
    Args:
        text_list (list): List of text strings to embed.
    """
    
    if len(text_list) <= batch_size:
        response = openai.embeddings.create(
            model=model,
            input=text_list
        )
        return [item.embedding for item in response.data]
    
    all_embeddings = []
    counter = 1
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i+batch_size]
        response = openai.embeddings.create(
            model=model,
            input=batch
        )
        all_embeddings.extend([item.embedding for item in response.data])
        print(f"Processed {counter * batch_size} out of {len(text_list)}")
        counter += 1
    
    return all_embeddings

In [None]:
df_items = pd.read_json("../../data/meta_Electronics_2022_onwards_with_ratings_100_sample_1000.jsonl", lines=True)

In [None]:
df_items.head()
len(df_items)

In [None]:
def combine_description_and_title(row):
    return f"{row['title']} {row['description']}"
df_items['description'] = df_items.apply(combine_description_and_title, axis=1)



In [None]:
def extract_first_large_image(row):
    return row['images'][0].get("large", "")
df_items['image'] = df_items.apply(extract_first_large_image, axis=1)

In [None]:
data_to_embed = df_items[['description', 'image', "rating_number", "price", "average_rating", "parent_asin"]].to_dict(orient="records")

In [None]:
text_to_embed = [item['description'] for item in data_to_embed]

In [None]:
embeddings = get_embeddings_batch(text_to_embed)

In [None]:
len(embeddings)

In [None]:
pointsstructs = []
i = 1
for embedding, data in zip(embeddings, data_to_embed):
    pointsstructs.append(
        PointStruct(
            id=i,
            vector= { 
                     "text-embedding-3-small": embedding,
                     "bm25": Document(text=data["description"], model="Qdrant/bm25")
            },
            payload=data
    ))
    i += 1

In [None]:
pointsstructs[0].vector.get("bm25")

### Load into new hybrid collection

In [None]:
qdrant_client = QdrantClient(url="http://localhost:6333")

qdrant_client.upsert(
    collection_name="amazon_items-collection-hybrid-02",
    points=pointsstructs[0:500],
    wait=True
)

In [None]:
qdrant_client.upsert(
    collection_name="amazon_items-collection-hybrid-02",
    points=pointsstructs[500:],
    wait=True
)

### Perform hybrid-search

In [None]:
from qdrant_client import QdrantClient
import openai
import instructor
instructor_prompt = instructor.from_openai(openai.OpenAI())

def create_embeddings(text, model="text-embedding-3-small"):
   
    response = openai.embeddings.create(
        model=model,
        input=text
    )
        
    return response.data[0].embedding

def retrieve_embedding_data(qd_client: QdrantClient, query, collection_name, k=5):
    querry_embeddings = create_embeddings(query)
    response = qd_client.query_points(
        collection_name=collection_name,
        prefetch=[Prefetch(
            query=querry_embeddings,
            using="text-embedding-3-small",
            limit=20),
            Prefetch(
                query=Document(text=query, model="Qdrant/bm25"),
                using="bm25",
                limit=20)
            ],
        query=querry_embeddings,
        limit=k,
    )
    retrieved_context_ids = []
    retrieved_context = []
    retrieved_scores = []
    retrieved_context_ratings = []
    
    for point in response.points:
        retrieved_context_ids.append(point.payload["parent_asin"])
        retrieved_context.append(point.payload["description"])
        retrieved_scores.append(point.score)
        retrieved_context_ratings.append(point.payload["average_rating"])

    # return dictionary of retrieved data
    return {
        "context_ids": retrieved_context_ids,
        "context": retrieved_context,
        "scores": retrieved_scores,
        "context_ratings": retrieved_context_ratings
    }

In [7]:
qdrant_client = QdrantClient(url="http://localhost:6333")
sample_query = "top laptop under 1000$"
retrieved_data = retrieve_embedding_data(qdrant_client, sample_query, "amazon_items-collection-hybrid-02", 5)
retrieved_data


UnexpectedResponse: Unexpected Response: 400 (Bad Request)
Raw response content:
b'{"status":{"error":"Wrong input: Not existing vector name error: "},"time":0.0746945}'