### Prepare DANDI Archive metadata for Qdrant vector embeddings

- Customize the Dandiset blacklist as needed
- Run the script to filter and process Dandiset metadata

In [10]:
# -----------------------------

# IDs of Dandisets that must be left out of the Qdrant collections.
# The trailing comments give the title each ID had in this cell's last
# recorded output (the "REMOVED -- <id>: <title>" lines).
dandiset_blacklist = [
    "000545",  # "Test set"
    "000470",  # "Test"
    "000411",  # "test"
    "000529",  # "Test 2"
    "000299",  # "Stephen Test Set"
    "000029",  # "Test dataset for development purposes"
    "000027",  # "Test dataset for testing dandi-cli."
    "000126",  # "NWB API Test Data"
    "000544",  # "Test Dataset"
    "000068",  # NOTE(review): matched nothing in the last recorded run (the "1 ... not removed" note) - confirm it is still needed
]

# -----------------------------

import sys
import os

# Make the directory above the notebook's working directory importable so
# that the `rest` package imports below resolve.
notebook_path = os.path.abspath(os.curdir)
sys.path.append(os.path.join(notebook_path, os.pardir))

from rest.clients.dandi import DandiClient
from rest.clients.openai import OpenaiClient
from rest.clients.embedding import EmbeddingClient
from rest.clients.qdrant import QdrantClient
from rest.constants import OPENAI_COLLECTION_NAME, EMB_COLLECTION_NAME, OPENAI_VECTOR_SIZE

import rest.constants

# Address of the Qdrant instance that the collections are written to.
QDRANT_HOST = "https://906c3b3f-d3ff-4497-905f-2d7089487cf9.us-east4-0.gcp.cloud.qdrant.io"

# One client per service; later cells use these notebook-level names.
dandi_client = DandiClient()
openai_client = OpenaiClient()
emb_client = EmbeddingClient()
qdrant_client = QdrantClient(host=QDRANT_HOST)

# Fetch the metadata of every Dandiset, then reduce it to the fields the
# embedding step uses (one dict per Dandiset).
all_metadata = dandi_client.get_all_dandisets_metadata()
all_metadata_formatted: list[dict] = dandi_client.collect_relevant_metadata(
    metadata_list=all_metadata,
)
print("START: Number of items:", len(all_metadata_formatted))

# Split the formatted metadata into kept and removed records according to
# `dandiset_blacklist`, and report exactly which blacklist entries matched
# nothing.
#
# Matching is a substring test of each blacklist entry against the part of
# "dandiset_id" after the last ":", so the same records are removed as before.
# NOTE(review): an exact comparison may be what is intended - confirm the
# format of "dandiset_id" before tightening this.
filtered_all_metadata_formatted = []
matched_blacklist_ids = set()
for dandiset in all_metadata_formatted:
    dandiset_id_tail = str(dandiset["dandiset_id"]).split(":")[-1]
    hits = [item for item in dandiset_blacklist if item in dandiset_id_tail]
    if hits:
        matched_blacklist_ids.update(hits)
        print(f"REMOVED -- {dandiset['dandiset_id']}: {dandiset['title']}")
    else:
        filtered_all_metadata_formatted.append(dandiset)

# Track the unmatched entries themselves instead of deriving a count from
# list lengths: the arithmetic miscounts when one entry removes several
# records or the blacklist holds duplicates, and it cannot name the entry.
unmatched_blacklist_ids = list(dict.fromkeys(
    item for item in dandiset_blacklist if item not in matched_blacklist_ids
))
num_not_removed = len(unmatched_blacklist_ids)
if num_not_removed:
    print(
        f"NOTE: {num_not_removed} blacklisted dandiset(s) not removed: "
        f"{', '.join(unmatched_blacklist_ids)}"
    )

print("END: Number of items:", len(filtered_all_metadata_formatted))

START: Number of items: 180
REMOVED -- 000027: Test dataset for testing dandi-cli.
REMOVED -- 000029: Test dataset for development purposes
REMOVED -- 000126: NWB API Test Data
REMOVED -- 000299: Stephen Test Set
REMOVED -- 000411: test
REMOVED -- 000470: Test
REMOVED -- 000529: Test 2
REMOVED -- 000544: Test Dataset
REMOVED -- 000545: Test set
NOTE: 1 blacklisted dandiset(s) not removed.
END: Number of items: 171


In [11]:
# Embed a throwaway string to find out how many dimensions the embedding
# model produces, then record that size on `rest.constants` so the Qdrant
# update cell below can read it from `rest.constants.EMB_VECTOR_SIZE`.
emb = emb_client.get_embedding_simple("test")
emb_length = len(emb)
rest.constants.EMB_VECTOR_SIZE = emb_length
print("Vector Size:", emb_length)

get_embedding_simple reached.
[0.01697487197816372, -0.013383402489125729, 0.016696885228157043, 0.029201770201325417, -0.024102969095110893, -0.004468169063329697, -0.014743867330253124, 0.02777334675192833, 0.013282950967550278, 0.0425858236849308, 0.028490005061030388, -0.012756512500345707, 0.0022535803727805614, -0.04957007244229317, -0.012542041949927807, -0.005449279211461544, -0.004064447712153196, -0.025642884895205498, 0.0038107975851744413, 0.005721841473132372, -0.002670586807653308, 0.020733201876282692, -0.04501507058739662, 0.015136895701289177, -0.031634487211704254, 0.01824611984193325, 0.0007100059883669019, -0.004022526554763317, 0.03952464461326599, 0.04959387332201004, -0.039904288947582245, -0.03303002938628197, 0.016823219135403633, -0.0633615031838417, -0.02057638391852379, -0.011417304165661335, 0.02897271327674389, -0.06497505307197571, -0.021642398089170456, -0.03868752345442772, 0.0009819763945415616, -0.0016137653728947043, 0.029350442811846733, -0.01194186

### Overwrite current Qdrant points

- Make sure to run the previous script to filter Dandiset metadata before running this script
- Set `model` to the embedding model whose Qdrant points should be generated/updated (`"ada002"` or `"emb"`), or leave it empty to update the Qdrant points for all models
- Run the script to retrieve embeddings, save them to `data/qdrant_points.json`, and update Qdrant collection embeddings

In [12]:
# -----------------------------

# Selects which Qdrant collection(s) the dispatch below refreshes:
#   "ada002" -> OPENAI_COLLECTION_NAME only, embeddings from openai_client
#   "emb"    -> EMB_COLLECTION_NAME only, embeddings from emb_client
#   ""       -> both collections, ada002 first and then emb
model = ""

# -----------------------------

def get_embeddings(client, metadata):
    """Request embeddings for `metadata` from `client`.

    Args:
        client: Any object exposing
            ``get_embeddings(metadata_list=..., save_to_file=...)``; this
            notebook passes ``openai_client`` or ``emb_client``.
        metadata: The metadata records to embed, forwarded unchanged as
            ``metadata_list``.

    Returns:
        Whatever ``client.get_embeddings`` returns.

    Note:
        ``save_to_file=True`` is always passed. Where the file is written is
        decided by the client (the notebook text mentions
        ``data/qdrant_points.json``).
    """
    embeddings = client.get_embeddings(metadata_list=metadata, save_to_file=True)
    return embeddings

# Regenerate embeddings for the selected model(s) and overwrite the matching
# Qdrant collection(s). `model` picks the branch:
#   "ada002" -> OpenAI embeddings into OPENAI_COLLECTION_NAME
#   "emb"    -> embedding-client embeddings into EMB_COLLECTION_NAME
#   ""       -> both of the above, in that order
if model == "ada002":
    emb = get_embeddings(openai_client, filtered_all_metadata_formatted)
    qdrant_client.update_collection(collection_name=OPENAI_COLLECTION_NAME, emb=emb, vector_size=OPENAI_VECTOR_SIZE)
    print("Number of emb items:", len(emb))
elif model == "emb":
    emb = get_embeddings(emb_client, filtered_all_metadata_formatted)
    # The vector size is read from rest.constants at call time because an
    # earlier cell assigns rest.constants.EMB_VECTOR_SIZE at runtime.
    qdrant_client.update_collection(collection_name=EMB_COLLECTION_NAME, emb=emb, vector_size=rest.constants.EMB_VECTOR_SIZE)
    print("Number of emb items:", len(emb))
elif not model:
    emb1 = get_embeddings(openai_client, filtered_all_metadata_formatted)
    qdrant_client.update_collection(collection_name=OPENAI_COLLECTION_NAME, emb=emb1, vector_size=OPENAI_VECTOR_SIZE)
    print("Number of ada002 emb items:", len(emb1))

    emb2 = get_embeddings(emb_client, filtered_all_metadata_formatted)
    qdrant_client.update_collection(collection_name=EMB_COLLECTION_NAME, emb=emb2, vector_size=rest.constants.EMB_VECTOR_SIZE)
    print("Number of model emb items:", len(emb2))
else:
    # Without this branch a mistyped model name matched nothing, so the cell
    # completed silently and no collection was updated.
    raise ValueError(
        f'Unknown model {model!r}: expected "ada002", "emb", or an empty string.'
    )

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..


All points added to collection dandi_collection_ada002
Number of ada002 emb items: 171
All points added to collection dandi_collection_emb
Number of model emb items: 171
