## Deal with quantitative queries on Dandi Search

In [1]:
# Basic: Deal with finding dandisets with 2 or more species. Filter via Qdrnat api
from qdrant_client import QdrantClient as Qdrant
from qdrant_client.models import models
import os

# ---------------------

QUERY = "Are there any dandisets that contain 2 or more species?"

#Ex: Which dandisets have more than X subjects/neurons from brain region Y?
#Ex: Show me dandisets with the most species.

# ---------------------


In [None]:
# Approach #1: Extraction chain
from langchain.chat_models import ChatOpenAI
from langchain.chains import create_extraction_chain
species_schema = {
    "properties": {
        "number_of_asset": {
            "type": "integer",
            "description": "Numerical value representing the number of a specific asset",
        },
        "asset_type": {
            "type": "string",
            "description": "One of: species, approach, variable measured, measurement technique",
        },
        "comparison_operator": {
            "type": "string",
            "description": "Comparison operator for the number of assets (e.g. 'greater than', 'less', 'more')"
        }
    },
    "required": ["number_of_asset", "asset_type"],
}
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
chain = create_extraction_chain(schema=species_schema, llm=llm)
extracted = chain.run(QUERY)
print(extracted)

In [4]:
# Approach #2: LLM

from langchain.prompts import PromptTemplate
from langchain.llms.openai import OpenAI
from langchain.chains import LLMChain

template = """
Your task is to extract a numerical value that represents a specific asset along with the asset type itself from a given user query.
An asset type can only be one of: species, approach, variables_measured, measurement_technique

JSON Output Format:
{
    'number_of_asset': [int - numerical value],
    'asset_type': [string - type of asset]
}

USER QUERY:
{query}
"""
prompt = PromptTemplate(input_variables=["query"], template=template)
llm = OpenAI(model="gpt-3.5-turbo", max_tokens=200, temperature=0)
chain = LLMChain(llm=llm, prompt=prompt)
response = chain.run(query)
print(response)

ValidationError: 1 validation error for PromptTemplate
__root__
  Invalid prompt schema; check for mismatched or missing input parameters. "\n    'number_of_asset'" (type=value_error)

In [30]:
# qdrant client
qdrant_client = Qdrant(
    location="https://906c3b3f-d3ff-4497-905f-2d7089487cf9.us-east4-0.gcp.cloud.qdrant.io", 
    port="6333",
    api_key=os.environ.get("QDRANT_API_KEY"),
)

# filter
num_asset: int = extracted[0].get("number_of_asset", None)
asset_type: str = extracted[0].get("asset_type", None)

if not num_asset or not asset_type:
    import sys
    print("Extraction chain empty.")
    sys.exit(0)

species_keywords = ["species", "specie", "animals", "animal"]
if any(kw == asset_type.strip() for kw in species_keywords):
    key = "number_of_species"

comparison_operator = extracted[0].get("comparison_operator", None)
gt_keywords = ["more", "greater than"]
lt_keywords = ["less", "less than"]
gte_keywords = ["at least", "or more"]
lte_keywords = []
if comparison_operator:
    if any(kw == comparison_operator.strip() for kw in gt_keywords):
        # more
        pass
    elif any(kw == comparison_operator.strip() for kw in lt_keywords):
        # less
        pass
else:
    match = models.MatchValue(value=num_asset)
    models.Match


filter = models.Filter(
    must=[
        models.FieldCondition(key=key, match=models.MatchValue(value=num_asset)),
    ]
)

# query similar results based on filter
docs = qdrant_client.scroll("dandi_collection_ada002", scroll_filter=filter, limit=5, with_vectors=False, with_payload=True)[0]
if not docs:
    print("No relevant dandisets found.")
else:
    for doc in docs:
        pl = doc.payload
        print(f"DANDI:{pl['dandiset_id']}/{pl['dandiset_version']}")
        for item in pl["species"]:
            print(f"- {item}")
        print("\n")

No relevant dandisets found.
