### Read the data

In [None]:
# Basic data preparation
import pandas as pd

# Load the item metadata; lines=True because the file holds one JSON record per line.
df_items = pd.read_json(
    path_or_buf="../data/meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl",
    lines=True,
)

def preprocess_data(row):
    """Build the text that is embedded and indexed for one item.

    The result is the item title, a single space, and then the feature
    strings joined with single spaces.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a ``title`` entry
            and a ``features`` entry holding a sequence of strings.

    Returns:
        The combined text. A missing ``features`` value (``None`` or NaN) is
        treated as "no features" instead of raising ``TypeError``.
    """
    features = row['features']
    # A missing value shows up as None or as a float NaN; neither can be joined.
    if features is None or isinstance(features, float):
        features = []
    return f"{row['title']} {' '.join(features)}"
    
# Add the combined title + features text as a new column (function applied to each row).
df_items["preprocessed_data"] = df_items.apply(preprocess_data, axis="columns")

In [None]:
# Display the `images` entry stored under index label 0 to inspect its structure.
df_items.images[0]

In [None]:
# Add additional data to the collection
def extract_first_large_image(row):
    """Return the ``large`` entry of the item's first image, if there is one.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) whose ``images`` entry
            is a sequence of dicts.

    Returns:
        The value stored under ``'large'`` in the first image dict, or ``None``
        when that key is absent, the image list is empty, or ``images`` is
        missing (``None``/NaN).
    """
    try:
        first_image = row['images'][0]
    except (IndexError, TypeError):
        # IndexError: empty image list. TypeError: None/NaN is not subscriptable.
        return None
    return first_image.get('large', None)

# Store each item's first large image in its own column (function applied to each row).
df_items["first_large_image"] = df_items.apply(extract_first_large_image, axis="columns")

In [None]:
# Display the first two rows to check the columns added so far.
df_items.head(2)

In [None]:
# Draw a 50-row sample; the fixed random_state keeps the selection reproducible.
df_sample = df_items.sample(n=50, random_state=25)

### Upload to Qdrant

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PayloadSchemaType

# Client for the Qdrant instance at http://localhost:6333.
# NOTE(review): this variable shares its name with the `qdrant_client` package imported above.
qdrant_client = QdrantClient(url="http://localhost:6333")

In [None]:
# Create the collection only when it does not exist yet, so this cell can be
# re-run without failing. To start from scratch, uncomment the delete call first.
# qdrant_client.delete_collection(collection_name="Amazon-items-collection-02-hybrid")
if not qdrant_client.collection_exists(collection_name="Amazon-items-collection-02-hybrid"):
    qdrant_client.create_collection(
        collection_name="Amazon-items-collection-02-hybrid",
        # size must equal the length of the vectors upserted into this collection
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    )

In [None]:
# Add a payload index to the collection.
# The "text" payload field gets a full-text index so that text-match filters
# on it can be used in searches.

qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-02-hybrid",
    field_name="text",
    field_schema=PayloadSchemaType.TEXT,  # `field_type` is the deprecated alias of this argument
)

In [None]:
# Prepare the data to embed.

# Keep only the columns that are written to the database and export them as a
# list of dicts (one dict per item) rather than a list of lists.
data_to_embed = df_sample.loc[
    :,
    ["preprocessed_data", "first_large_image", "rating_number", "price", "average_rating"],
].to_dict("records")

data_to_embed

In [None]:
import openai
from qdrant_client.models import PointStruct

def get_embedding(text, model="text-embedding-3-small"):
    """Embed one piece of text via ``openai.embeddings.create``.

    Args:
        text: The text to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` of the first (and only) item in the response data.
    """
    # The text is sent as a one-element input list, so the result is item 0.
    result = openai.embeddings.create(model=model, input=[text])
    first_item = result.data[0]
    return first_item.embedding


# Create the PointStructs: one per record, using the record's position as id,
# the embedding of its text as vector, and the remaining fields as payload.
pointstructs = [
    PointStruct(
        id=position,
        vector=get_embedding(record["preprocessed_data"]),  # embed the text column
        payload={
            # Stored under "text", the field the payload index is built on, so
            # it can be matched by keyword in addition to the vector search.
            "text": record["preprocessed_data"],
            "first_large_image": record["first_large_image"],
            "rating_number": record["rating_number"],
            "price": record["price"],
            "average_rating": record["average_rating"],
        },
    )
    for position, record in enumerate(data_to_embed)
]
pointstructs

In [None]:
# Write all prepared points to the collection in a single upsert call.
qdrant_client.upsert(
    collection_name="Amazon-items-collection-02-hybrid",
    wait=True,
    points=pointstructs,
)

### Search

In [None]:
from qdrant_client.models import Prefetch, Filter, FieldCondition, MatchText, FusionQuery

def retrieve_data(query, k=5):
    """Run a hybrid (dense + keyword) search and return the fused top hits.

    Two candidate sets are prefetched: the nearest neighbours of the query
    embedding, and the points whose indexed ``text`` payload matches the query
    text. They are then merged with Reciprocal Rank Fusion (RRF).

    Args:
        query: Search text. It is embedded for the dense branch and used
            as-is for the text-match branch.
        k: Maximum number of fused results to return. Each branch contributes
            at most 20 candidates, so no more than 40 results are available.

    Returns:
        The response of ``qdrant_client.query_points``; the hits are in its
        ``points`` attribute.
    """
    query_embedding = get_embedding(query)

    return qdrant_client.query_points(
        collection_name="Amazon-items-collection-02-hybrid",
        prefetch=[
            # Dense branch: similarity search on the embedding.
            Prefetch(query=query_embedding, limit=20),
            # Keyword branch: text match on the indexed "text" field. It may
            # return fewer than 20 points (or none) when few items match.
            Prefetch(
                filter=Filter(must=[FieldCondition(key="text", match=MatchText(text=query))]),
                limit=20,
            ),
        ],
        # RRF merges the two ranked candidate lists into one ranking.
        query=FusionQuery(fusion='rrf'),
        # Honour the caller's k (this was previously hard-coded to 5).
        limit=k,
    )



In [None]:
retrieve_data("earphones").points

# Scores are the fused (RRF) scores; they cannot be compared with the scores of the individual prefetch queries.

### Structured outputs

In [None]:
# pydantic models for output model (json schema)
# instructor to wrap llms calls and ensure output structure
import instructor
from pydantic import BaseModel
from openai import OpenAI
# import os

In [None]:
# Pydantic models for output model (json schema)

# Output schema for the LLM call: a single free-text `answer` field.
# NOTE(review): documented with `#` comments on purpose. A class docstring may
# be picked up as the schema description sent to the model; confirm before adding one.
class RAGGenerationResponse(BaseModel):
    answer: str  # the answer text


In [None]:
# Run an LLM call through instructor.

# Wrap the OpenAI client with instructor so the output follows the response model.
client = instructor.from_openai(OpenAI())

prompt = """
You are a helpful assistant.
Return an answer to the question.
Question: What is your name?
"""

# The call returns two objects, unpacked here as the structured response and
# the raw completion.
response, raw_response = client.chat.completions.create_with_completion(
    messages=[{"role": "user", "content": prompt}],
    model="gpt-4.1",
    temperature=0.5,
    response_model=RAGGenerationResponse,
)


In [None]:
# Display the first value returned by create_with_completion (requested with response_model=RAGGenerationResponse).
response

In [None]:
# Display the second value returned by create_with_completion (the raw completion).
raw_response