### Prepare DANDI Archive metadata for Qdrant vector embeddings

- Customize the Dandiset blacklist (`dandiset_blacklist`) as needed
- Run the cell below to filter and process Dandiset metadata

In [1]:
# -----------------------------

# Dandiset IDs to drop before embeddings are generated.
# Each ID is compared (as a substring) against the dandiset identifier by the
# filtering loop later in this cell. The trailing titles are the ones printed
# by the recorded run of this cell and may drift as the archive changes.
dandiset_blacklist: list[str] = [
    "000545",  # Test set
    "000470",  # Test
    "000411",  # test
    "000529",  # Test 2
    "000299",  # Stephen Test Set
    "000029",  # Test dataset for development purposes
    "000027",  # Test dataset for testing dandi-cli.
    "000126",  # NWB API Test Data
    "000544",  # Test Dataset
    "000482",  # State-dependent processing in visual cortex
    "000068",  # NOTE(review): no match in the recorded run -- confirm it is still needed
]

# -----------------------------

import sys
import os

# Expose the parent of the current working directory to the import system;
# presumably that is where the project-local `rest` package lives.
notebook_path = os.path.abspath('.')
project_root = os.path.abspath(os.path.join(notebook_path, '..'))

# Guard the append so re-running this cell does not keep stacking the same
# directory onto sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

from rest.clients.dandi import DandiClient
from rest.clients.openai import OpenaiClient

# Clients shared by the rest of the notebook.
dandi_client = DandiClient()
openai_client = OpenaiClient()

# Pull every dandiset's metadata and reduce it to the fields used downstream.
all_metadata = dandi_client.get_all_dandisets_metadata()
all_metadata_formatted: list[dict] = dandi_client.collect_relevant_metadata(metadata_list=all_metadata)
print("START: Number of items:", len(all_metadata_formatted))

# Partition into kept / removed dandisets while remembering which blacklist
# entries actually matched, so the summary can name the ones that did not.
filtered_all_metadata_formatted = []
matched_blacklist_ids = set()
for dandiset in all_metadata_formatted:
    # Compare against the text after the last ":" so a prefixed identifier is
    # still matched on its final segment.
    id_suffix = str(dandiset["dandiset_id"]).split(":")[-1]
    hits = [item for item in dandiset_blacklist if item in id_suffix]
    if hits:
        matched_blacklist_ids.update(hits)
        print(f"REMOVED -- {dandiset['dandiset_id']}: {dandiset['title']}")
    else:
        filtered_all_metadata_formatted.append(dandiset)

# Derive the leftover entries from the blacklist itself rather than from a
# length difference, which miscounts when one entry matches several dandisets
# or several entries match the same dandiset.
unmatched_blacklist_ids = [item for item in dandiset_blacklist if item not in matched_blacklist_ids]
num_not_removed = len(unmatched_blacklist_ids)
if num_not_removed:
    print(
        f"NOTE: {num_not_removed} blacklisted dandiset(s) not removed: "
        f"{', '.join(unmatched_blacklist_ids)}"
    )

print("END: Number of items:", len(filtered_all_metadata_formatted))

A newer version (0.58.0) of dandi/dandi-cli is available. You are using 0.56.2


START: Number of items: 174
REMOVED -- 000029: Test dataset for development purposes
REMOVED -- 000027: Test dataset for testing dandi-cli.
REMOVED -- 000126: NWB API Test Data
REMOVED -- 000299: Stephen Test Set
REMOVED -- 000411: test
REMOVED -- 000470: Test
REMOVED -- 000482: State-dependent processing in visual cortex
REMOVED -- 000529: Test 2
REMOVED -- 000544: Test Dataset
REMOVED -- 000545: Test set
NOTE: 1 blacklisted dandiset(s) not removed.
END: Number of items: 164


### Overwrite current Qdrant points in `data/qdrant_points.json`

- Run the previous cell first: this cell reads `filtered_all_metadata_formatted`, which is defined there
- Run this cell to retrieve embeddings and save them to `data/qdrant_points.json`, replacing the existing file

In [2]:
# Request embeddings for the retained dandisets; with save_to_file=True the
# result is also written out, overwriting qdrant_points.json.
emb = openai_client.get_embeddings(metadata_list=filtered_all_metadata_formatted, save_to_file=True)
print("Number of emb items:", len(emb))

Number of emb items: 164
