## Handle quantitative queries (one or several related conditions) on DANDI Search

#### Example

Input: `Are there dandisets that do not contain 2 species but do contain 3 or more measurements?`

Output:
```
DANDI:000251/draft
- Species (1): Mus musculus - House mouse
- Measurement Techniques (3): analytical technique, behavioral technique, spike sorting technique
DANDI:000491/0.230602.1307
- Species (1): Mus musculus - House mouse
- Measurement Techniques (3): two-photon microscopy technique, analytical technique, surgical technique
DANDI:000458/0.230317.0039
- Species (1): Mus musculus - House mouse
- Measurement Techniques (6): surgical technique, spike sorting technique, behavioral technique, analytical technique, signal filtering technique, multi electrode extracellular electrophysiology recording technique
```

In [4]:
from qdrant_client import QdrantClient as Qdrant
from qdrant_client.models import models
import os

# ---------------------

# Natural-language question to answer. Edit this string and re-run the cells
# below to try a different quantitative query.
QUERY = "Show me dandisets that have at least 100 files, less than than three measurements, and do not have any species."

# ---------------------


In [5]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import create_extraction_chain
from langchain.prompts import PromptTemplate

# Stop here when no query was provided: continuing would only send an empty
# prompt to the LLM and produce meaningless extractions.
if not QUERY:
    print("Run the above cell with a specific quantitative query.")
    raise SystemExit(1)

# Asset types a query may constrain; used as the `enum` for `asset_type`.
supported_quantity_values = [
    "species",
    "approaches",
    "variables",
    "measurements",
    "subjects",
    "bytes",
    "files",
    "cells",
    "samples",
]

# Comparison operators a query may use; used as the `enum` for `comparison_op`.
COMPARISON_OPS = [">=", ">", "<", "<=", "==", "!="]

# One property definition per member of the extracted
# (number_of_assets, asset_type, comparison_op) triplet.
_number_of_assets_property = {
    "type": "number",
    "description": "The number correlated with a specific asset type.",
}
_asset_type_property = {
    "type": "string",
    "description": "The type of asset that correlates with a numerical value.",
    "enum": supported_quantity_values,
}
_comparison_op_property = {
    "type": "string",
    "description": "A comparison operator that represents the exact value/range of an asset type.",
    "enum": COMPARISON_OPS,
}

# Extraction schema handed to the LLM chain; all three fields are required.
schema = {
    "description": "Extract one or more triplets containing number of assets, asset type, and corresponding comparison operator.",
    "properties": {
        "number_of_assets": _number_of_assets_property,
        "asset_type": _asset_type_property,
        "comparison_op": _comparison_op_property,
    },
    "required": ["number_of_assets", "asset_type", "comparison_op"],
}

# use gpt-4 for now
llm = ChatOpenAI(model="gpt-4", temperature=0, max_tokens=150)

# Extraction chain that asks the LLM to fill in `schema` from free text.
chain = create_extraction_chain(schema=schema, llm=llm)

# Run the chain on the query and materialise the result as a list.
# NOTE: if the chain hands back plain text instead of a sequence of triplets,
# list() turns it into single characters (i.e. a list of str).
extracted = chain.run(QUERY)
response = list(extracted)
print(response)

[{'number_of_assets': 100, 'asset_type': 'files', 'comparison_op': '>='}, {'number_of_assets': 3, 'asset_type': 'measurements', 'comparison_op': '<'}, {'number_of_assets': 0, 'asset_type': 'species', 'comparison_op': '=='}]


In [6]:
# Stop when nothing usable was extracted: either there are no triplets at all
# (indexing response[0] would raise IndexError), or the entries are strings
# rather than triplet dicts.
if not response or isinstance(response[0], str):
    print("No quantitative extractions found.")
    # exit() is the interactive-shell helper installed by the `site` module and
    # is not meant for use in programs; raising SystemExit halts execution
    # right here.
    raise SystemExit(1)

# qdrant client
# The API key is read from the QDRANT_API_KEY environment variable (None when
# the variable is unset). The port is passed as an integer, which is the type
# the client documents for this parameter.
qdrant_client = Qdrant(
    location="https://906c3b3f-d3ff-4497-905f-2d7089487cf9.us-east4-0.gcp.cloud.qdrant.io",
    port=6333,
    api_key=os.environ.get("QDRANT_API_KEY"),
)

# determine comparison operator
def get_condition(key: str, comparison_op: str, num_asset: int) -> "tuple[models.FieldCondition | None, bool]":
    """Build the Qdrant field condition for one (field, operator, number) triplet.

    Args:
        key: Payload field the condition applies to.
        comparison_op: One of ``==``, ``!=``, ``>=``, ``>``, ``<=``, ``<``.
        num_asset: Number the field is compared against.

    Returns:
        A ``(condition, is_must)`` pair. ``is_must`` is ``False`` only for
        ``!=``: the condition is then an equality match that the caller is
        expected to place in the filter's ``must_not`` clause. For an
        unrecognised operator the pair is ``(None, True)``; callers must check
        for ``None`` before adding the condition to a filter.
    """
    # Range keyword for each ordering operator.
    range_bounds = {">=": "gte", ">": "gt", "<=": "lte", "<": "lt"}

    if comparison_op in ("==", "!="):
        # The schema asks the LLM for a "number", so an integral value may
        # arrive as a float (e.g. 3.0); exact matching is done on the integer.
        if isinstance(num_asset, float) and num_asset.is_integer():
            num_asset = int(num_asset)
        exact_match = models.MatchValue(value=num_asset)
        condition = models.FieldCondition(key=key, match=exact_match)
        return condition, comparison_op == "=="

    bound = range_bounds.get(comparison_op)
    if bound is None:
        # Unknown operator: nothing to build.
        return None, True

    value_range = models.Range(**{bound: float(num_asset)})
    return models.FieldCondition(key=key, range=value_range), True

# get matches
# Turn every extracted triplet into a Qdrant condition. `asset_types` collects
# the payload field names (in query order, without repeats) that are reported
# for each result; only triplets that actually produced a condition are listed.
asset_types = []
must_conditions = []
must_not_conditions = []
for i, extraction in enumerate(response):
    asset_type = (extraction.get("asset_type", None) or "").strip()
    number_of_assets = extraction.get("number_of_assets", None)
    comparison_op = (extraction.get("comparison_op", None) or "").strip()
    # 0 is a valid number (e.g. "no species"), so only None counts as missing.
    if not asset_type or number_of_assets is None or not comparison_op:
        print(f"Triplet #{i} skipped (due to None value).")
        continue

    # Translate the short names used in the extraction schema into the
    # names used for the payload fields.
    if asset_type == "measurements":
        asset_type = "measurement_techniques"
    elif asset_type == "variables":
        asset_type = "variables_measured"

    key = f"number_of_{asset_type}"
    condition, is_must = get_condition(key=key, comparison_op=comparison_op, num_asset=number_of_assets)
    if condition is None:
        # get_condition() returns None for operators it does not recognise;
        # a None entry must not end up inside the filter.
        print(f"Triplet #{i} skipped (unsupported comparison operator {comparison_op!r}).")
        continue

    if asset_type not in asset_types:
        asset_types.append(asset_type)
    if is_must:
        must_conditions.append(condition)
    else:
        must_not_conditions.append(condition)

# qdrant filter
# Drop any None placeholders, and pass None instead of an empty list so that
# an unused clause is left out of the filter.
must_clause = [cond for cond in must_conditions if cond is not None]
must_not_clause = [cond for cond in must_not_conditions if cond is not None]
query_filter = models.Filter(
    must=must_clause or None,
    must_not=must_not_clause or None,
)

# fetch up to 10 points that satisfy the filter (element [0] of the scroll result)
docs = qdrant_client.scroll(
    "dandi_collection_ada002",
    scroll_filter=query_filter,
    limit=10,
    with_vectors=False,
    with_payload=True,
)[0]

# Asset types whose payload also carries the list of values, not just a count.
list_valued_assets = {"species", "approaches", "measurement_techniques", "variables_measured"}

print("QUERY:", QUERY)
print("-----")
if not docs:
    print("No relevant dandisets found.")
else:
    for doc in docs:
        pl = doc.payload
        print(f"DANDI:{pl['dandiset_id']}/{pl['dandiset_version']}")
        # dict.fromkeys() removes repeated asset types while keeping their order.
        for asset in dict.fromkeys(asset_types):
            if not asset:
                # Skip entries left behind by triplets without an asset type.
                continue
            # .get() because a point may lack the field (for instance, one
            # that only had to pass a must_not clause).
            count = pl.get(f"number_of_{asset}")
            asset_title = asset.replace("_", " ").title()
            if asset in list_valued_assets:
                print(f"- {asset_title} ({count}):", ", ".join(pl.get(asset) or []))
            else:
                print(f"- {asset_title}: ({count})")

Triplet #1 has invalid asset type selected (LLM error).
QUERY: Show me dandisets that have at least 100 files, less than than three measurements, and do not have any species.
-----
DANDI:000016/draft
- Files: (135)
- Measurement Techniques (1): behavioral technique
- Species (0): 
DANDI:000117/draft
- Files: (197)
- Measurement Techniques (2): current clamp technique, voltage clamp technique
- Species (0): 
