### Read the data

In [None]:
# Basic data preparation
import pandas as pd

df_items = pd.read_json(
    "../data/meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl", 
    lines=True
)

def preprocess_data(row):
    return f"{row['title']} {' '.join(row['features'])}"
    
df_items["preprocessed_data"] = df_items.apply(preprocess_data, axis=1)

In [10]:
df_items.images[0]

[{'thumb': 'https://m.media-amazon.com/images/I/51G07yWoOBL._SX38_SY50_CR,0,0,38,50_.jpg',
  'large': 'https://m.media-amazon.com/images/I/51G07yWoOBL.jpg',
  'variant': 'MAIN',
  'hi_res': 'https://m.media-amazon.com/images/I/611AVJMH+JL._SL1200_.jpg'},
 {'thumb': 'https://m.media-amazon.com/images/I/41c+40oKkKL._SX38_SY50_CR,0,0,38,50_.jpg',
  'large': 'https://m.media-amazon.com/images/I/41c+40oKkKL.jpg',
  'variant': 'PT01',
  'hi_res': 'https://m.media-amazon.com/images/I/61ihhPW7VCL._SL1200_.jpg'},
 {'thumb': 'https://m.media-amazon.com/images/I/51y1YnwiUZL._SX38_SY50_CR,0,0,38,50_.jpg',
  'large': 'https://m.media-amazon.com/images/I/51y1YnwiUZL.jpg',
  'variant': 'PT02',
  'hi_res': 'https://m.media-amazon.com/images/I/61UkcVETKcL._SL1200_.jpg'},
 {'thumb': 'https://m.media-amazon.com/images/I/41Nvr++q39L._SX38_SY50_CR,0,0,38,50_.jpg',
  'large': 'https://m.media-amazon.com/images/I/41Nvr++q39L.jpg',
  'variant': 'PT03',
  'hi_res': 'https://m.media-amazon.com/images/I/611IpDcR

In [11]:
# Add additional data to the collection
def extract_first_large_image(row):
    return row['images'][0].get('large', None)

df_items["first_large_image"] = df_items.apply(extract_first_large_image, axis=1)

In [12]:
df_items.head(2)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author,preprocessed_data,first_large_image
0,Industrial & Scientific,"RAVODOI USB C Cable, [2Pack/3.3ft+6.6ft] USB T...",4.4,119,[【Fast Charging Cord】These USB C cables provid...,[],,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Type-C Charger Cable ', 'url': 'ht...",RAVODOI,"[Electronics, Computers & Accessories, Compute...","{'Brand': 'RAVODOI', 'Connector Type': 'USB Ty...",B09R4Y2HKY,,,,"RAVODOI USB C Cable, [2Pack/3.3ft+6.6ft] USB T...",https://m.media-amazon.com/images/I/51G07yWoOB...
1,All Electronics,"SNESH-2 Pack USB-C Female to USB Male Adapter,...",4.5,352,[🔹(Light & compact) Easy to carry and light we...,[],4.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'USB Male & Female Adapter', 'url':...",SNESH,"[Electronics, Computers & Accessories, Compute...",{'Package Dimensions': '3.54 x 2.4 x 0.35 inch...,B09JV5FM2S,,,,"SNESH-2 Pack USB-C Female to USB Male Adapter,...",https://m.media-amazon.com/images/I/41bOA5-ogW...


In [17]:
df_sample = df_items.sample(n=50, random_state=25)

### Upload to Qdrant

In [24]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PayloadSchemaType

qdrant_client = QdrantClient(url="http://localhost:6333")

In [14]:
# qdrant_client.delete_collection(collection_name="Amazon-items-collection-01")
qdrant_client.create_collection(
    collection_name="Amazon-items-collection-02-hybrid",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

True

In [16]:
# Add index to the collection
# We will be adding index on this field, so we could apply exact search on it

qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-02-hybrid",
    field_name="text",
    field_type=PayloadSchemaType.TEXT, # we tell it that this is a text field
)

  qdrant_client.create_payload_index(


UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [18]:
#prepare data to embedd

# Columns we will want to write to db; instead of list we will use a dictionary so it would write nicely
data_to_embed = df_sample[[
    "preprocessed_data", "first_large_image", 
    "rating_number","price", "average_rating"
    ]].to_dict(orient="records")

data_to_embed

[{'preprocessed_data': 'Bluetooth Car Adapter, LDNIO Bluetooth FM Transmitter for Car, 43W PD&QC 3.0 Three USB Port Car Bluetooth Adapter with LED Display, Hands-Free Calling, and AUX Input for All Smartphones Audio Player Fast Charging Type C Multi Ports: USB Type C Durable Fast Connect & Clear Bluetooth Sound 3 in 1 Value',
  'first_large_image': 'https://m.media-amazon.com/images/I/41gqwN41WpL._AC_.jpg',
  'rating_number': 110,
  'price': 21.0,
  'average_rating': 3.8},
 {'preprocessed_data': 'Bluetooth Multi-Device Keyboard, Dual Channel Universal Rechargeable Wireless Keyboard with Integrated Stand for iPad Smartphone Tablet MacBook iOS Windows Android Devices - Pink 【Easy-Switch to 2 Devices】 Simply press the FN+BT1/BT2 to switch typing between 2 connected Bluetooth devices, work with your smartphone and tablet on the slot stablely. 【Type Anywhere in Comfort】0.46in thick and 11.34in long, the compact Bluetooth keyboard is small enough to tuck into your briefcase and light enough 

In [None]:
import openai
from qdrant_client.models import PointStruct

def get_embedding(text, model="text-embedding-3-small"):
    response = openai.embeddings.create(
        input=[text],
        model=model,
    )
    return response.data[0].embedding


# Create poinstructs
pointstructs = []
for i, data in enumerate(data_to_embed):
    embedding = get_embedding(data["preprocessed_data"]) # embed the column
    pointstructs.append(
        PointStruct(
            id=i,
            vector=embedding,
            payload={
                "text": data["preprocessed_data"], # we can run context search on embedding (vector) + we indexed so we can use exact search on it
                "first_large_image": data["first_large_image"],
                "rating_number": data["rating_number"],
                "price": data["price"],
                "average_rating": data["average_rating"],
            },
        )
    )   
pointstructs

[PointStruct(id=0, vector=[-0.02746845968067646, -0.025803081691265106, -0.03116930089890957, -0.028064705431461334, -0.027900224551558495, -0.0626264438033104, -0.006533011328428984, 0.025947002694010735, 0.009673585183918476, -0.061187226325273514, 0.02886655554175377, 0.019059328362345695, -0.06266756355762482, 0.017404230311512947, 0.03593927249312401, -0.023418094962835312, -0.06599832326173782, -0.009457702748477459, 0.03423277288675308, 0.055759329348802567, 0.025761961936950684, 0.027201177552342415, 0.06381893903017044, 0.012623977847397327, 0.008167549036443233, -0.025556359440088272, -0.03756352886557579, -0.018432240933179855, -0.023685378953814507, -0.010650196112692356, 0.010295532643795013, -0.009904888458549976, 0.022965770214796066, -0.0005801838124170899, -0.04478016868233681, 0.0060601262375712395, 0.03567199036478996, -0.011750168167054653, 0.007761484477669001, -0.04815204441547394, 0.01923408918082714, -0.007586722727864981, 0.06912347674369812, -0.012572577223181

In [20]:
# Write to db
qdrant_client.upsert(
    collection_name="Amazon-items-collection-02-hybrid",
    wait=True,
    points=pointstructs,
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

### Search

In [21]:
from qdrant_client.models import Prefetch, Filter, FieldCondition, MatchText, FusionQuery

def retrieve_data(query, k=5):
    
    query_embedding = get_embedding(query)
    
    results = qdrant_client.query_points(
        collection_name="Amazon-items-collection-02-hybrid",
        prefetch = [ # will return no more than 20 items from each prefetch (40 in total)
            Prefetch(
                query = query_embedding,
                limit = 20), # regular similarity search (dense, similarity can be calculated)
            Prefetch(
                filter = Filter(must = [FieldCondition(key = "text",match = MatchText(text=query))]), # exact keyword search (can be another method, but indexing needs to be matching)
                limit = 20), # sparse, keyword search (many 0s, may not find 20, can be less than 20 items)
        ],
        query=FusionQuery(fusion='rrf'), # rrf = reranker; 
        limit = 5, # will rerank and return top 5 items
    )
    
    return results



In [23]:
retrieve_data("earphones").points

# Scores are reranker scores (can not be compared to interim scores)

[ScoredPoint(id=2, version=2, score=1.0, payload={'text': 'Active Noise Cancelling Wireless Earbuds, Bluetooth in-Ear Headphones Built-in 4 Mic ENC Call, Deep Bass Ear Buds,IPX6 Waterproof Stereo Earphones for iPhone,Samsung,Laptop (Blue) ♪【ANC Noise Cancelling&Game Mode】: These premium noise-canceling earbuds adopts professional audio active noise canceling technology, and the binaural microphone blocks environmental noise and enhances the voice experience. M48 wireless earbuds are built to provide the exceptional mobile gaming audio experience with an ultra-low 45ms latency.Taps 5 times to enter game mode,through quick sound effect and ideal gaming audio you never miss any game sound effects. ♪【HiFi Stereo Sound & Superior Clear Call 】: Heaphones equipped with a high-sensitivity diaphragm and dual 13mm drivers to restore audio authenticity and voice dynamics, crystal clarity and deep resonance bass characteristics enable you to enjoy immersive sound quality. Call noise canceling brin

### Structured outputs

In [30]:
# pydantic models for output model (json schema)
# instructor to wrap llms calls and ensure output structure
import instructor
from pydantic import BaseModel
from openai import OpenAI
# import os

In [31]:
# Pydantic models for output model (json schema)

class RAGGenerationResponse(BaseModel):
    answer: str


In [33]:
# Run llms call using instructor

client = instructor.from_openai(OpenAI())

prompt = """
You are a helpful assistant.
Return an answer to the question.
Question: What is your name?
"""

response, raw_response = client.chat.completions.create_with_completion(
    model = "gpt-4.1",
    response_model = RAGGenerationResponse,
    messages=[{"role": "user", "content": prompt}],
    temperature = 0.5,
)


In [37]:
response

RAGGenerationResponse(answer='My name is ChatGPT, and I am an AI language assistant created by OpenAI.')

In [38]:
raw_response

ChatCompletion(id='chatcmpl-BrtJ1HDU0aITkZRazzPmaiIXcbfUd', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_TxUdsdrSSXBZxyiu2SGTStqS', function=Function(arguments='{"answer":"My name is ChatGPT, and I am an AI language assistant created by OpenAI."}', name='RAGGenerationResponse'), type='function')]))], created=1752183679, model='gpt-4.1-2025-04-14', object='chat.completion', service_tier='default', system_fingerprint=None, usage=CompletionUsage(completion_tokens=22, prompt_tokens=92, total_tokens=114, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=None, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=None), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))