### Run qdrant 

In [None]:
# Deploy the Qdrant DB locally (WSL)

# 1. Pull the latest qdrant image
# docker pull qdrant/qdrant

# 2. Run the container, bind the ports and volume to the current directory
# docker run -p 6333:6333 -p 6334:6334 \
#     -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
#     qdrant/qdrant

# Access web UI at http://localhost:6333/dashboard

# Issue:
# After the docker compose setup was created, I could no longer retrieve from the 00 collection;
# I had to create a new collection or re-upload all data to 00 before retrieval worked again.
# Data uploaded via the notebook is not accessible from the container.
# To make it accessible, it has to be uploaded via docker compose (see upsert.py)
# each time the container is started.
# Solution:
# Run everything (docker compose, notebook, project store, etc.) from within WSL.

In [None]:
# Import qdrant client
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

In [None]:
# Create the Qdrant client, connecting to the instance on localhost over its REST API (port 6333)
qdrant_client = QdrantClient(url="http://localhost:6333")
# The dashboard UI is served at http://localhost:6333/dashboard

### Pre-process data

In [None]:
import pandas as pd

# Load the item metadata (JSON Lines: one record per line) and preview two rows, transposed
items_path = '../data/meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl'
df_items = pd.read_json(items_path, lines=True)
df_items.head(2).T

In [None]:
# The text embedded into the vector DB comes from the columns that carry most of the product info:
# title, description and features (description is currently left out of preprocess_data).
# They are concatenated into a single string, which is then embedded.

def preprocess_data(row):
    """Build the text that gets embedded for one product row.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with a "title" value and
            a "features" value holding an iterable of strings.

    Returns:
        The title, a single space, then all features concatenated with no
        separator between them. The "description" column is currently left
        out of the result.
    """
    features = row["features"]
    # A missing feature list arrives as None or a float NaN; treat it as
    # empty instead of letting str.join raise TypeError.
    if features is None or isinstance(features, float):
        features = ()
    # Join outside the f-string: reusing the f-string's own quote character
    # inside a replacement field is a SyntaxError before Python 3.12.
    joined_features = "".join(features)
    return f'{row["title"]} {joined_features}'

# Add the concatenated text as a new column, computed row by row, then preview two rows
preprocessed_text = df_items.apply(preprocess_data, axis=1)
df_items['preprocessed_data'] = preprocessed_text
df_items.head(2).T
    

In [None]:
# Work on a small, seeded subsample during development: the full dataset
# is not needed to check that everything works end to end
df_sample = df_items.sample(n=50, random_state=42)
print(df_sample.shape)

### Embedding

In [None]:
import openai

# Embedding function
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text`` from the OpenAI embeddings API.

    Args:
        text: The string to embed; it is sent as a one-element input list.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    batch = [text]
    result = openai.embeddings.create(input=batch, model=model)
    first_item = result.data[0]
    return first_item.embedding

# get_embedding("What earbuds can you offer?")

In [None]:
# Embed every preprocessed text and wrap each result in a PointStruct
# (original timing note: ~18 s for 50 items averaging about 500 tokens)
data_to_embed = df_sample['preprocessed_data'].tolist()

# The position in the list serves as the point id (one point per row);
# the payload is what gets returned alongside the embedding vector
pointstructs = [
    PointStruct(id=point_id, vector=get_embedding(text), payload={"text": text})
    for point_id, text in enumerate(data_to_embed)
]


In [None]:
# Show every prepared point.
# NOTE(review): each point carries its full embedding vector, so this output
# is very large; consider slicing, e.g. pointstructs[:2].
pointstructs

In [None]:
# Inspect the first point: its id, embedding vector and payload
first_point = pointstructs[0]
print(first_point.id)
print(first_point.vector)
print(first_point.payload)

In [None]:
# Length of the first embedding vector; compare with the `size` passed to
# VectorParams when the collection is created below (1536)
len(pointstructs[0].vector)

### Write to qdrant db

In [None]:
# Create the (empty) collection, but only when it does not exist yet, so this
# cell can be re-run without failing on an already-existing collection.
# `size` is the dimension of the stored vectors; 1536 is the size returned by
# OpenAI's smaller embeddings model.
# Size and distance cannot be changed after creation, because they determine
# how the index is built.
if not qdrant_client.collection_exists(collection_name="Amazon-items-collection-00"):
    qdrant_client.create_collection(
        collection_name="Amazon-items-collection-00",
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    )

In [None]:
# Upsert the prepared points into the collection;
# wait=True blocks until the write is done
qdrant_client.upsert(
    wait=True,
    points=pointstructs,
    collection_name="Amazon-items-collection-00",
)

### Retrieve data

In [None]:
# qdrant_client.collection_exists(collection_name="Amazon-items-collection-01")

In [None]:
# Make a query

def retrieve_data(query, limit=10, collection_name="Amazon-items-collection-00"):
    """Embed ``query`` and fetch the closest points from a Qdrant collection.

    Args:
        query: Free-text question to search for.
        limit: Maximum number of points to request (defaults to 10, the
            value that was previously hard-coded).
        collection_name: Collection to search (defaults to the collection
            that was previously hard-coded).

    Returns:
        Whatever ``qdrant_client.query_points`` returns for the embedded query.
    """
    query_embedding = get_embedding(query)
    return qdrant_client.query_points(
        collection_name=collection_name,
        query=query_embedding,
        limit=limit,
    )


In [None]:
# Smoke-test the retrieval with a sample question
retrieved_data = retrieve_data("what earbuds can I get?")


In [None]:
# Display the `points` attribute of the query result
retrieved_data.points

In [None]:
# Print the client's `embedding_model_name` attribute.
# NOTE(review): the embeddings in this notebook come from get_embedding
# (OpenAI), so this value presumably does not describe them; confirm.
print(qdrant_client.embedding_model_name)
