## Deal with quantitative queries on Dandi Search

In [1]:
# Basic: Deal with finding dandisets with 2 or more species. Filter via Qdrnat api
from qdrant_client import QdrantClient as Qdrant
from qdrant_client.models import models
import os

# ---------------------

QUERY = "Are there any dandisets that contain no more than 2 species?"

#Ex: Which dandisets have more than X subjects/neurons from brain region Y?
#Ex: Show me dandisets with the most species.

# ---------------------


In [2]:
# Approach #1: LLM
from langchain.llms.openai import OpenAIChat
import json

template = """
OBJECTIVE: 
Extract a numerical value and its corresponding asset type from a user query while ensuring the asset type is one of the following: species, scientific_approach, variables_measured, or measurement_technique. 
Additionally, select a comparison operator (>=, >, <, <=, or ==) that best represents the number of assets to be retrieved.

DESIRED OUTPUT FORMAT (JSON):
- "number_of_assets": [int - numerical value],
- "asset_type": [string - type of asset],
- "comparison_op": [string - type of comparison operator]

USER QUERY:
{}
""".format(QUERY)

llm = OpenAIChat(model_name="gpt-3.5-turbo", max_tokens=200, temperature=0)
response = llm(template)
response_json = json.loads(response)
print(response_json)



{'number_of_assets': 2, 'asset_type': 'species', 'comparison_op': '<='}


In [3]:
import sys

# qdrant client
qdrant_client = Qdrant(
    location="https://906c3b3f-d3ff-4497-905f-2d7089487cf9.us-east4-0.gcp.cloud.qdrant.io", 
    port="6333",
    api_key=os.environ.get("QDRANT_API_KEY"),
)

# filter
num_asset: int = response_json.get("number_of_assets", None)
asset_type: str = response_json.get("asset_type", None)
comparison_op: str = response_json.get("comparison_op", None)

if not num_asset or not asset_type:
    print("Values not found.")
    sys.exit(0)
if not comparison_op:
    # default to exact value
    comparison_op = "=="

# determine asset type
if asset_type == "species":
    key = "number_of_species"
elif asset_type == "scientific_approach":
    key = "number_of_approaches"
elif asset_type == "variables_measured":
    key = "number_of_variables_measured"
elif asset_type == "measurement_techniques":
    key = "number_of_measurement_techniques"
else:
    key = None
if not key:
    print("Valid key not found")
    sys.exit(0)

# determine comparison opeartor
if comparison_op == "==":
    match = models.MatchValue(value=num_asset)
elif comparison_op == ">=":
    match = models.MatchAny(any=list(range(num_asset, 151)))
elif comparison_op == ">":
    match = models.MatchAny(any=list(range(num_asset + 1, 151)))
elif comparison_op == "<=":
    match = models.MatchAny(any=list(range(0, num_asset + 1)))
elif comparison_op == "<":
    match = models.MatchAny(any=list(range(0, num_asset)))
else:
    match = None

# qdrant filter
filter = models.Filter(must=[models.FieldCondition(key=key, match=match)])

# query similar results based on filter
docs = qdrant_client.scroll("dandi_collection_ada002", scroll_filter=filter, limit=100, with_vectors=False, with_payload=True)[0]
if not docs:
    print("No relevant dandisets found.")
else:
    for doc in docs:
        pl = doc.payload
        print(f"DANDI:{pl['dandiset_id']}/{pl['dandiset_version']}")
        for item in pl["species"]:
            print(f"- {item}")

Values not found.


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
