### Prepare DANDI Archive metadata for Qdrant vector embeddings

- Customize the Dandiset blacklist as needed
- Run the script to filter and process Dandiset metadata

In [1]:
# -----------------------------

# Dandiset identifiers to exclude from the embedding input. The filtering loop
# later in this cell drops any dandiset whose id (the text after the last ":")
# contains one of these strings.
dandiset_blacklist = [
    "000545",
    "000470",
    "000411",
    "000529",
    "000299",
    "000029",
    "000027",
    "000126",
    "000544",
    "000068",
]

# -----------------------------

import sys
import os

# Make the parent of the notebook's working directory importable; the `rest`
# package imported below is presumably located there.
notebook_path = os.path.abspath('.')
# Normalize the parent path (no trailing "..") and add it only once, so that
# re-running this cell does not keep growing sys.path with duplicate entries.
_project_root = os.path.normpath(os.path.join(notebook_path, '..'))
if _project_root not in sys.path:
    sys.path.append(_project_root)

from rest.clients.dandi import DandiClient
from rest.clients.openai import OpenaiClient
from rest.clients.llama2 import Llama2Client

# Instantiate one client per service. openai_client and llama_client are not
# used in this cell; the embedding cell further down reads them.
# NOTE(review): the recorded output of this cell shows a LlamaCppEmbeddings
# ValidationError for a missing model file — presumably raised by
# Llama2Client(); confirm the model file exists before running.
dandi_client = DandiClient()
openai_client = OpenaiClient()
llama_client = Llama2Client()

# Retrieve the metadata via the DANDI client, then pass it through
# collect_relevant_metadata. The result is treated as a list of dicts; the
# filter below reads the "dandiset_id" and "title" keys of each entry.
all_metadata = dandi_client.get_all_dandisets_metadata()
all_metadata_formatted: list[dict] = dandi_client.collect_relevant_metadata(metadata_list=all_metadata)
# Report the size before filtering so it can be compared with the END count.
print("START: Number of items:", len(all_metadata_formatted))

# Drop every dandiset that matches a blacklist entry, remembering WHICH entries
# matched. Counting matched entries (instead of subtracting list lengths) keeps
# the summary correct when one entry matches several dandisets or when the
# blacklist contains duplicates; the old difference could even go negative and
# still print a (misleading) note.
filtered_all_metadata_formatted = []
matched_blacklist_ids = set()
for dandiset in all_metadata_formatted:
    # Only the text after the last ":" of the id is compared.
    identifier = str(dandiset["dandiset_id"]).split(":")[-1]
    # Substring match, as before, so any suffix after the identifier is tolerated.
    # NOTE(review): the exact id format is not visible here — confirm whether an
    # exact comparison on the identifier part would be safer.
    hits = [item for item in dandiset_blacklist if item in identifier]
    if hits:
        matched_blacklist_ids.update(hits)
        print(f"REMOVED -- {dandiset['dandiset_id']}: {dandiset['title']}")
    else:
        filtered_all_metadata_formatted.append(dandiset)

# Number of distinct blacklist entries that matched no dandiset at all.
num_not_removed = len(set(dandiset_blacklist) - matched_blacklist_ids)
if num_not_removed:
    print(f"NOTE: {num_not_removed} blacklisted dandiset(s) not removed.")

print("END: Number of items:", len(filtered_all_metadata_formatted))

A newer version (0.58.0) of dandi/dandi-cli is available. You are using 0.56.2


ValidationError: 1 validation error for LlamaCppEmbeddings
__root__
  Could not load Llama model from path: /Users/jai/dandi-search-response-ui/bin/model.bin. Received error Model path does not exist: /Users/jai/dandi-search-response-ui/bin/model.bin (type=value_error)

### Overwrite current Qdrant points in `data/qdrant_points.json`

- Make sure to run the previous script to filter Dandiset metadata before running this script
- Set `model` to a valid embedding model (`"ada002"` or `"llama2"`) to generate/update only that model's Qdrant points, or leave it empty to update the Qdrant points for all models
- Run the script to retrieve embeddings and save them to `data/qdrant_points.json`.

In [None]:
# -----------------------------

# Embedding model whose Qdrant points should be regenerated:
#   "ada002" -> embeddings are requested from openai_client
#   "llama2" -> embeddings are requested from llama_client
# Leave empty ("") to regenerate the points for both models.
model = ""

# -----------------------------

def get_embeddings(client, metadata):
    """Request embeddings for ``metadata`` from ``client``.

    Delegates to ``client.get_embeddings`` with ``save_to_file=True`` and hands
    back whatever that call returns.

    Args:
        client: Object exposing
            ``get_embeddings(metadata_list=..., save_to_file=...)``.
        metadata: Forwarded unchanged as the ``metadata_list`` argument.

    Returns:
        The value returned by ``client.get_embeddings``.
    """
    request = {"metadata_list": metadata, "save_to_file": True}
    return client.get_embeddings(**request)

# Generate embeddings (via get_embeddings, which passes save_to_file=True) for
# the selected model, or for both models when `model` is empty.
if model == "ada002":
    emb = get_embeddings(openai_client, filtered_all_metadata_formatted)
    print("Number of emb items:", len(emb))
elif model == "llama2":
    emb = get_embeddings(llama_client, filtered_all_metadata_formatted)
    print("Number of emb items:", len(emb))
elif not model:
    emb1 = get_embeddings(openai_client, filtered_all_metadata_formatted)
    emb2 = get_embeddings(llama_client, filtered_all_metadata_formatted)
    print("Number of openai emb items:", len(emb1))
    print("Number of llama2 emb items:", len(emb2))
else:
    # A misspelled model name used to fall through every branch, so the cell
    # finished silently without generating anything. Fail loudly instead.
    raise ValueError(
        f"Unknown model {model!r}: expected 'ada002', 'llama2', or an empty value."
    )

Number of emb items: 164
