### Run qdrant 

In [None]:
# Locall deploy qdrant db (wsl)

# 1. Pull the latest qdrant image
# docker pull qdrant/qdrant

# 2. Run the container, bind the ports and volume to the current directory
# docker run -p 6333:6333 -p 6334:6334 \
#     -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
#     qdrant/qdrant

# Access web UI at http://localhost:6333/dashboard

# Issue:
# After docker compose was created i couldn't retrieve from 00 collection, 
# needed to create new / or reopload all data to 00 to be able to retrieve from it
# Data uploadedd via notebook is not accessible from the container.
# To be accessible, we need to upload it via docker compose (see upsert.py) 
# each time we start the container.
# Solution:
# Run everything (docker compose, notebook, project store etc.) from within wsl.

In [1]:
# Import qdrant client
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

In [None]:
# Initialize qdrant client pointing to the localhost through rest api
qdrant_client = QdrantClient(url="http://localhost:6333")
# Dashboard UI: http://localhost:6333/dashboard

### Pre-process data

In [3]:
import pandas as pd

df_items = pd.read_json('../data/meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl', lines=True)
df_items.head(2).T

Unnamed: 0,0,1
main_category,Industrial & Scientific,All Electronics
title,"RAVODOI USB C Cable, [2Pack/3.3ft+6.6ft] USB T...","SNESH-2 Pack USB-C Female to USB Male Adapter,..."
average_rating,4.4,4.5
rating_number,119,352
features,[【Fast Charging Cord】These USB C cables provid...,[🔹(Light & compact) Easy to carry and light we...
description,[],[]
price,,4.99
images,[{'thumb': 'https://m.media-amazon.com/images/...,[{'thumb': 'https://m.media-amazon.com/images/...
videos,"[{'title': 'Type-C Charger Cable ', 'url': 'ht...","[{'title': 'USB Male & Female Adapter', 'url':..."
store,RAVODOI,SNESH


In [4]:
# We will use & embeddg into the vector db 3 columns, containing most info about the procuct: 
# title, decription and features
# We need to concatinate them into a single string, and then embedd it

def preprocess_data(row):
    # return f'{row["title"]} {''.join(row["description"])} {''.join(row["features"])}'
     return f'{row["title"]} {''.join(row["features"])}'

# New col
df_items['preprocessed_data'] = df_items.apply(preprocess_data, axis=1)
df_items.head(2).T
    

Unnamed: 0,0,1
main_category,Industrial & Scientific,All Electronics
title,"RAVODOI USB C Cable, [2Pack/3.3ft+6.6ft] USB T...","SNESH-2 Pack USB-C Female to USB Male Adapter,..."
average_rating,4.4,4.5
rating_number,119,352
features,[【Fast Charging Cord】These USB C cables provid...,[🔹(Light & compact) Easy to carry and light we...
description,[],[]
price,,4.99
images,[{'thumb': 'https://m.media-amazon.com/images/...,[{'thumb': 'https://m.media-amazon.com/images/...
videos,"[{'title': 'Type-C Charger Cable ', 'url': 'ht...","[{'title': 'USB Male & Female Adapter', 'url':..."
store,RAVODOI,SNESH


In [5]:
# Sub sample for dev purpose to make sure everythin works, no need for all data
df_sample = df_items.sample(50, random_state=42)
print(df_sample.shape)

(50, 17)


### Embedding

In [6]:
import openai

# Embedding function
def get_embedding(text, model="text-embedding-3-small"):
    response = openai.embeddings.create(
        input=[text],
        model=model,
    )
    return response.data[0].embedding

# get_embedding("What earbuds can you offer?")

In [7]:
# Embed the data by creating  PointStruct
data_to_embed = df_sample['preprocessed_data'].tolist()
pointstructs = []

# Create a list of PointStructs (~18 s for 50 items, of avg length of 500 tokens)
# i will be id (one row)
for i, data in enumerate(data_to_embed):
    embedding = get_embedding(data)
    pointstructs.append(
        PointStruct(
            id=i,
            vector=embedding,
            payload={"text": data}, # what to return in addition to the embedding vector
        )
    )


In [8]:
pointstructs

[PointStruct(id=0, vector=[0.011265029199421406, -0.014567585662007332, 0.008149511180818081, -0.010393966920673847, -0.04753970727324486, -0.0016192144248634577, -0.04057120904326439, 0.04553038626909256, -0.002382062142714858, -0.009838197380304337, -0.03046046942472458, 0.012665142305195332, -0.02849389798939228, 0.08755514770746231, 0.012462073005735874, 0.005124840419739485, -0.03627467900514603, 0.009693910367786884, -0.01853279024362564, 0.012964403256773949, 0.013423982076346874, 0.013701867312192917, 0.033474452793598175, 0.02922067418694496, 0.018286969512701035, 0.020061157643795013, -0.04057120904326439, -0.02627081796526909, -0.019826024770736694, 0.053353916853666306, -0.013969064690172672, -0.02014666050672531, -0.021600212901830673, -0.051387347280979156, -0.04287979006767273, -0.018244218081235886, -0.017399875447154045, 0.014588961377739906, -0.01949469931423664, -0.001468248083256185, 0.02684796415269375, 0.04176824912428856, 0.009202267974615097, 0.00043252529576420

In [9]:
# Print 
print(pointstructs[0].id)
print(pointstructs[0].vector)
print(pointstructs[0].payload)

0
[0.011265029199421406, -0.014567585662007332, 0.008149511180818081, -0.010393966920673847, -0.04753970727324486, -0.0016192144248634577, -0.04057120904326439, 0.04553038626909256, -0.002382062142714858, -0.009838197380304337, -0.03046046942472458, 0.012665142305195332, -0.02849389798939228, 0.08755514770746231, 0.012462073005735874, 0.005124840419739485, -0.03627467900514603, 0.009693910367786884, -0.01853279024362564, 0.012964403256773949, 0.013423982076346874, 0.013701867312192917, 0.033474452793598175, 0.02922067418694496, 0.018286969512701035, 0.020061157643795013, -0.04057120904326439, -0.02627081796526909, -0.019826024770736694, 0.053353916853666306, -0.013969064690172672, -0.02014666050672531, -0.021600212901830673, -0.051387347280979156, -0.04287979006767273, -0.018244218081235886, -0.017399875447154045, 0.014588961377739906, -0.01949469931423664, -0.001468248083256185, 0.02684796415269375, 0.04176824912428856, 0.009202267974615097, 0.00043252529576420784, -0.0070967548526823

In [10]:
len(pointstructs[0].vector)

1536

### Write to qdrant db

In [11]:
# Create an empty collection
# Size is the dimension of the vector
# It is exactly the same size as what OpenAI's smaller embeddings model returns
# Size and Distance cannot be changed after creation, as it decided the indexes
qdrant_client.create_collection(
    collection_name="Amazon-items-collection-00",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    )

True

In [12]:
# Upsert (write) the data into the collection
qdrant_client.upsert(
    collection_name="Amazon-items-collection-00",
    points=pointstructs,
    wait=True, # until done
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

### Retrieve data

In [None]:
# qdrant_client.collection_exists(collection_name="Amazon-items-collection-01")

In [13]:
# Make a query

def retrieve_data(query):
    query_embedding = get_embedding(query)
    results = qdrant_client.query_points(
        collection_name="Amazon-items-collection-00",
        query = query_embedding,
        limit=10
    )
    return results


In [14]:
# Check
retrieved_data = retrieve_data("what earbuds can I get?")


In [15]:
retrieved_data.points

[ScoredPoint(id=44, version=0, score=0.49486154, payload={'text': 'Wireless Earbuds, Bluetooth 5.3 Headphones with Microphone, 37H Playback LED Power Display, In-Ear Headphones Deep Bass, IPX7 Waterproof, Ultra-Light Earphones with Charging Case, Smart Touch, Sport S23-vine earbuds'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=20, version=0, score=0.4750314, payload={'text': 'Open Ear Headphones, Bluetooth 5.3 Earbuds with 60H Playtime IPX7 Waterproof Wireless Earbuds Immersive Premium Sound True Wireless Open Ear Earbuds with Earhooks for Running, Walking and Workouts 【Open-ear Design Headphones】Feature with a new generation of true open-ear wireless earbuds design, the headphones can rest gently and firmly fit your ears without entering your ear canal, which will reduce stress and hearing loss after extended wear. There is no pinching of the auricle, no blockage of the ear canal, and no pain or damage to hearing.【Powerful Stereo Sound】Equipped with 16.2 millimete

In [16]:
# Some info
print(qdrant_client.embedding_model_name)


BAAI/bge-small-en
