## Handling quantitative queries (single or relational) in DANDI Search

In [1]:
# Basic: Deal with finding dandisets with 2 or more species. Filter via the Qdrant API
from qdrant_client import QdrantClient as Qdrant
from qdrant_client.models import models
import os

# ---------------------
# Natural-language question that the rest of the notebook turns into a Qdrant filter.
QUERY = (
    "Are there dandisets that contain 2 or more species "
    "and exactly 1 measurement?"
)
# ---------------------


In [2]:
# Approach #1: LLM
from langchain.llms.openai import OpenAIChat
import json

# Asset types the LLM is allowed to return. The same strings are inserted into
# the prompt and later compared against the LLM output in get_filter_key().
SPECIES = "species"
SCIENTIFIC_APPROACHES = "scientific_approaches"
# Fixed spelling (was "varirables_measured"); the payload fields use "variables_measured".
VARIABLES_MEASURED = "variables_measured"
MEASUREMENT_TECHNIQUES = "measurement_techniques"

# Prompt that asks the model to turn QUERY into (count, asset type, operator) triplets.
# The reply is parsed with json.loads(), so the prompt requests strict JSON rather
# than a "python list", and an empty list when nothing can be extracted.
# Named placeholders keep each value tied to its position in the text.
template = """
OBJECTIVE:
Extract a numerical value and its corresponding asset type from a user query.
The asset type must be one of the following: {species}, {approaches}, {variables}, or {techniques}.
If multiple numerical value and asset type pairs are identified, generate a list of triplets, each containing the numerical value, asset type, and comparison operator.
The comparison operator (>=, >, <, <=, or ==) should best represent the number of assets to be retrieved.
If either the numerical value or asset type is missing, do not create a triplet for that instance.

DESIRED OUTPUT FORMAT (a valid JSON list of objects, one object per triplet, and nothing else; return an empty JSON list if there are no triplets):
- "number_of_assets": [int - numerical value],
- "asset_type": [string - type of asset],
- "comparison_op": [string - type of comparison operator]

USER QUERY:
{query}
""".format(
    species=SPECIES,
    approaches=SCIENTIFIC_APPROACHES,
    variables=VARIABLES_MEASURED,
    techniques=MEASUREMENT_TECHNIQUES,
    query=QUERY,
)

# temperature=0 keeps the extraction as repeatable as the model allows.
llm = OpenAIChat(model_name="gpt-3.5-turbo", max_tokens=200, temperature=0)
response = llm(template)

# The model returns free text, so parsing can fail (prose around the JSON,
# single-quoted Python literals, output truncated by max_tokens). Fall back to
# an empty list so the next cell reports "no extractions" instead of crashing here.
try:
    response_json = json.loads(response)
except json.JSONDecodeError:
    print("LLM response is not valid JSON:", response)
    response_json = []

# Downstream code iterates over a list of objects: wrap a lone object,
# and discard any other top-level JSON value (number, string, ...).
if isinstance(response_json, dict):
    response_json = [response_json]
elif not isinstance(response_json, list):
    print("LLM response is not a JSON list:", response_json)
    response_json = []

print(response_json)



[{'number_of_assets': 2, 'asset_type': 'species', 'comparison_op': '>='}, {'number_of_assets': 1, 'asset_type': 'measurement_techniques', 'comparison_op': '=='}]


In [6]:
import sys

# Nothing to filter on when the LLM produced no triplets: stop this cell early.
if not response_json:
    print("No quantitive extractions found.")
    sys.exit(0)

# qdrant client
# The API key is read from the environment so it never lands in the notebook.
# `port` is passed as an integer (it was the string "6333").
qdrant_client = Qdrant(
    location="https://906c3b3f-d3ff-4497-905f-2d7089487cf9.us-east4-0.gcp.cloud.qdrant.io",
    port=6333,
    api_key=os.environ.get("QDRANT_API_KEY"),
)

# determine asset type
def get_filter_key(asset_type: str):
    """Return the name of the count field to filter on for `asset_type`.

    Args:
        asset_type: One of the asset-type constants (SPECIES,
            SCIENTIFIC_APPROACHES, VARIABLES_MEASURED, MEASUREMENT_TECHNIQUES).

    Returns:
        The matching "number_of_..." field name, or None when `asset_type`
        equals none of the constants.
    """
    count_fields = (
        (SPECIES, "number_of_species"),
        (SCIENTIFIC_APPROACHES, "number_of_approaches"),
        (VARIABLES_MEASURED, "number_of_variables_measured"),
        (MEASUREMENT_TECHNIQUES, "number_of_measurement_techniques"),
    )
    # Checked in order with ==, so the first matching constant wins.
    for known_type, payload_key in count_fields:
        if asset_type == known_type:
            return payload_key
    return None

# determine comparison operator
def get_condition(key: str, comparison_op: str, num_asset: int):
    """Build the Qdrant field condition for one (key, operator, number) triplet.

    Args:
        key: Name of the field the condition applies to.
        comparison_op: One of "==", ">=", ">", "<=", "<".
        num_asset: Number to compare the field against.

    Returns:
        A models.FieldCondition using an exact match for "==" and a range
        bound for the inequality operators, or None for any other operator.
    """
    if comparison_op == "==":
        exact = models.MatchValue(value=num_asset)
        return models.FieldCondition(key=key, match=exact)

    # Each inequality operator maps to the Range keyword that expresses it.
    range_keywords = (
        (">=", "gte"),
        (">", "gt"),
        ("<=", "lte"),
        ("<", "lt"),
    )
    for operator, keyword in range_keywords:
        if comparison_op == operator:
            bounds = models.Range(**{keyword: float(num_asset)})
            return models.FieldCondition(key=key, range=bounds)

    return None

# get matches
# Turn every usable LLM triplet into a Qdrant field condition.
conditions = []
for i, extraction in enumerate(response_json):
    # The LLM output is untrusted: ignore list items that are not JSON objects.
    if not isinstance(extraction, dict):
        print(f"Triplet #{i} skipped (not a JSON object).")
        continue

    asset_type = extraction.get("asset_type")
    number_of_assets = extraction.get("number_of_assets")
    comparison_op = extraction.get("comparison_op")
    # The count is compared against None explicitly: 0 is a legitimate value
    # (e.g. "exactly 0 species") and must not be treated as missing.
    if not asset_type or number_of_assets is None or not comparison_op:
        print(f"Triplet #{i} skipped (due to None value).")
        continue

    # get_filter_key() returns None for an asset type it does not know;
    # building a condition with key=None would fail validation.
    key = get_filter_key(asset_type=asset_type)
    if key is None:
        print(f"Triplet #{i} skipped (unsupported asset type: {asset_type!r}).")
        continue

    # get_condition() returns None for an operator it does not know;
    # a None entry would make the filter invalid.
    condition = get_condition(key=key, comparison_op=comparison_op, num_asset=number_of_assets)
    if condition is None:
        print(f"Triplet #{i} skipped (unsupported comparison operator: {comparison_op!r}).")
        continue

    conditions.append(condition)

# qdrant filter: every extracted condition must hold.
# NOTE(review): `filter` shadows the builtin; the name is kept in case other
# cells reference it.
filter = models.Filter(must=conditions)

# Query only when at least one condition was built. An empty `must` list places
# no restriction on the scroll, which would list arbitrary dandisets as matches.
if conditions:
    # scroll() returns (points, next_page_offset); only the first page is used.
    docs = qdrant_client.scroll(
        "dandi_collection_ada002",
        scroll_filter=filter,
        limit=10,
        with_vectors=False,
        with_payload=True,
    )[0]
else:
    docs = []

print("QUERY:", QUERY)
print("-----")
if not conditions:
    print("No valid filter conditions could be built from the query.")
elif not docs:
    print("No relevant dandisets found.")
else:
    for doc in docs:
        pl = doc.payload
        print(f"DANDI:{pl['dandiset_id']}/{pl['dandiset_version']}")
        print(f"- Species ({pl['number_of_species']}):", ", ".join(pl["species"]))
        print(f"- Approaches ({pl['number_of_approaches']}):", ", ".join(pl["approaches"]))
        print(f"- Measurement Techniques ({pl['number_of_measurement_techniques']}):", ", ".join(pl["measurement_techniques"]))
        print(f"- Variables Measured ({pl['number_of_variables_measured']}):", ", ".join(pl["variables_measured"]))

ValidationError: 32 validation errors for FieldCondition
match -> value
  field required (type=value_error.missing)
match -> geo_bounding_box
  extra fields not permitted (type=value_error.extra)
match -> geo_polygon
  extra fields not permitted (type=value_error.extra)
match -> geo_radius
  extra fields not permitted (type=value_error.extra)
match -> key
  extra fields not permitted (type=value_error.extra)
match -> match
  extra fields not permitted (type=value_error.extra)
match -> range
  extra fields not permitted (type=value_error.extra)
match -> values_count
  extra fields not permitted (type=value_error.extra)
match -> text
  field required (type=value_error.missing)
match -> geo_bounding_box
  extra fields not permitted (type=value_error.extra)
match -> geo_polygon
  extra fields not permitted (type=value_error.extra)
match -> geo_radius
  extra fields not permitted (type=value_error.extra)
match -> key
  extra fields not permitted (type=value_error.extra)
match -> match
  extra fields not permitted (type=value_error.extra)
match -> range
  extra fields not permitted (type=value_error.extra)
match -> values_count
  extra fields not permitted (type=value_error.extra)
match -> any
  field required (type=value_error.missing)
match -> geo_bounding_box
  extra fields not permitted (type=value_error.extra)
match -> geo_polygon
  extra fields not permitted (type=value_error.extra)
match -> geo_radius
  extra fields not permitted (type=value_error.extra)
match -> key
  extra fields not permitted (type=value_error.extra)
match -> match
  extra fields not permitted (type=value_error.extra)
match -> range
  extra fields not permitted (type=value_error.extra)
match -> values_count
  extra fields not permitted (type=value_error.extra)
match -> except
  field required (type=value_error.missing)
match -> geo_bounding_box
  extra fields not permitted (type=value_error.extra)
match -> geo_polygon
  extra fields not permitted (type=value_error.extra)
match -> geo_radius
  extra fields not permitted (type=value_error.extra)
match -> key
  extra fields not permitted (type=value_error.extra)
match -> match
  extra fields not permitted (type=value_error.extra)
match -> range
  extra fields not permitted (type=value_error.extra)
match -> values_count
  extra fields not permitted (type=value_error.extra)