### Prepare DANDI Archive metadata for Qdrant vector embeddings

- Customize the Dandiset blacklist as needed
- Run the script to filter and process Dandiset metadata

In [1]:
# -----------------------------

# Dandiset IDs to exclude from the embedding corpus, kept in ascending order.
# Trailing comments give the title reported when the filter cell was last run.
dandiset_blacklist = [
    "000027",  # Test dataset for testing dandi-cli.
    "000029",  # Test dataset for development purposes
    "000068",  # no matching Dandiset was reported at last run -- TODO confirm still needed
    "000126",  # NWB API Test Data
    "000299",  # Stephen Test Set
    "000411",  # test
    "000470",  # Test
    "000529",  # Test 2
    "000544",  # Test Dataset
    "000545",  # Test set
]

# -----------------------------

import sys
import os

notebook_path = os.path.abspath('.')
sys.path.append(os.path.join(notebook_path, '..'))

from rest.clients.dandi import DandiClient
from rest.clients.openai import OpenaiClient
from rest.clients.llama2 import Llama2Client
from rest.clients.qdrant import QdrantClient
from rest.constants import OPENAI_COLLECTION_NAME, LLAMA2_COLLECTION_NAME

# Instantiate the project-local API clients imported from ../rest.
dandi_client = DandiClient()
openai_client = OpenaiClient()
# llama_client = Llama2Client()
# NOTE(review): the Qdrant cluster URL is hard-coded here; consider moving it to configuration.
qdrant_client = QdrantClient(host="https://906c3b3f-d3ff-4497-905f-2d7089487cf9.us-east4-0.gcp.cloud.qdrant.io")

# Fetch the metadata of every Dandiset and reduce it to the relevant fields.
all_metadata = dandi_client.get_all_dandisets_metadata()
all_metadata_formatted: list[dict] = dandi_client.collect_relevant_metadata(metadata_list=all_metadata)
print("START: Number of items:", len(all_metadata_formatted))

# Compare blacklist entries against the bare identifier by exact match.
# A substring test would let a short or partial blacklist entry silently
# remove unrelated Dandisets.
blacklisted_ids = {str(item) for item in dandiset_blacklist}
matched_ids = set()  # blacklisted IDs that were actually encountered

filtered_all_metadata_formatted = []
for dandiset in all_metadata_formatted:
    # Drop any "PREFIX:" in front of the ID and any "/version" suffix behind it.
    bare_id = str(dandiset["dandiset_id"]).split(":")[-1].split("/")[0].strip()
    if bare_id in blacklisted_ids:
        matched_ids.add(bare_id)
        print(f"REMOVED -- {dandiset['dandiset_id']}: {dandiset['title']}")
    else:
        filtered_all_metadata_formatted.append(dandiset)

# Report blacklist entries that matched nothing. Tracking the IDs themselves
# stays correct even if one entry matches several records, unlike deriving
# the count from list lengths.
not_removed_ids = sorted(blacklisted_ids - matched_ids)
if not_removed_ids:
    print(f"NOTE: {len(not_removed_ids)} blacklisted dandiset(s) not removed.")
    print("      Not found in metadata:", ", ".join(not_removed_ids))

print("END: Number of items:", len(filtered_all_metadata_formatted))

A newer version (0.58.0) of dandi/dandi-cli is available. You are using 0.56.2


START: Number of items: 178
REMOVED -- 000027: Test dataset for testing dandi-cli.
REMOVED -- 000029: Test dataset for development purposes
REMOVED -- 000126: NWB API Test Data
REMOVED -- 000299: Stephen Test Set
REMOVED -- 000411: test
REMOVED -- 000470: Test
REMOVED -- 000529: Test 2
REMOVED -- 000544: Test Dataset
REMOVED -- 000545: Test set
NOTE: 1 blacklisted dandiset(s) not removed.
END: Number of items: 169


### Overwrite current Qdrant points in `data/qdrant_points.json`

- Make sure to run the previous script to filter Dandiset metadata before running this script
- Choose the embedding model whose Qdrant points should be generated/updated (or leave the `model` field empty to update the points for all models)
- Run the script to retrieve embeddings and save them to `data/qdrant_points.json`.

In [2]:
# -----------------------------

# Embedding model whose Qdrant points should be (re)generated.
# Choose one of: "ada002" or "llama2", or leave empty to update all model qdrant points.
# NOTE: the "llama2" branch in the dispatch code below is currently commented out.
model = "ada002"

# -----------------------------

def get_embeddings(client, metadata):
    """Request embeddings for ``metadata`` from ``client``.

    Args:
        client: Object exposing ``get_embeddings(metadata_list=..., save_to_file=...)``.
        metadata: Metadata entries forwarded to the client as ``metadata_list``.

    Returns:
        Whatever ``client.get_embeddings`` returns, unchanged.
    """
    # save_to_file=True asks the client to persist the results as well
    # (presumably to data/qdrant_points.json, per the notebook text -- confirm in the client).
    request_kwargs = {"metadata_list": metadata, "save_to_file": True}
    embeddings = client.get_embeddings(**request_kwargs)
    return embeddings

# Dispatch on the configured `model`:
#   "ada002" -> compute OpenAI embeddings, then (re)create the collection and upload the points
#   empty    -> only recompute the embeddings (get_embeddings passes save_to_file=True)
#   other    -> rejected explicitly, so a typo or a disabled model is not a silent no-op
if model == "ada002":
    emb = get_embeddings(openai_client, filtered_all_metadata_formatted)
    qdrant_client.create_collection(collection_name=OPENAI_COLLECTION_NAME)
    qdrant_client.add_points_to_collection(collection_name=OPENAI_COLLECTION_NAME, embeddings_objects=emb)
    print("Number of emb items:", len(emb))
# elif model == "llama2":
#     emb = get_embeddings(llama_client, filtered_all_metadata_formatted)
#     qdrant_client.create_collection(collection_name=LLAMA2_COLLECTION_NAME)
#     qdrant_client.add_points_to_collection(collection_name=LLAMA2_COLLECTION_NAME, embeddings_objects=emb)
#     print("Number of emb items:", len(emb))
elif not model:
    # NOTE(review): this branch does not upload to Qdrant, unlike the "ada002" branch -- confirm intended.
    emb1 = get_embeddings(openai_client, filtered_all_metadata_formatted)
    # emb2 = get_embeddings(llama_client, filtered_all_metadata_formatted)
    print("Number of openai emb items:", len(emb1))
    # print("Number of llama2 emb items:", len(emb2))
else:
    raise ValueError(
        f"Unsupported model {model!r}: expected 'ada002' or an empty value "
        "(the 'llama2' branch is currently commented out)."
    )

All points added to collection dandi_collection_ada002
Number of emb items: 169
