# 1. Handle single step queries e.g. 
- [x] "What are the targets of vorinostat?"
- [x] "Find drugs that are used for treating ulcerative colitis." etc.
# 2. Handle 2-step queries e.g. 
- [x] "Which diseases are associated with the genes targeted by fasudil?"
- [ ] "Show all the diseases that have at least 5 pathways associated with Alzheimer"
  - NOTE: I do not think GraphQL supports carrying this out in a single operation, because the Open Targets schema does not provide any resolver, function, or field that would let us link pathways back to targets, or count entities along the disease->gene->pathway association and use that aggregate as part of a filter. This could be carried out as a multi-query by the client, but that seems to be beyond the intended scope of this exercise. Implementing that with natural-language interpretation by GPT would likely also require us to ask GPT to help write code to interpret the results of a potentially unbounded number of requests that cannot be fulfilled by the current Open Targets schema in a single query, and to somehow ensure that executing that code in the current Python session works correctly. Implementing something like that would take longer than I think this task was intended to take, so I am going to leave this particular feature unimplemented and make sure the script has the capacity to report when the constructed queries cause an error (as this one does).

In [1]:
import os
import sys
import re
import json
import requests
import openai
from datetime import datetime
from utils import extract_values
from urlpath import URL

In [2]:
# Open Targets Platform GraphQL endpoint; generated queries are POSTed to it.
opentargets_base_url = "https://api.platform.opentargets.org/api/v4/graphql"
open_targets_base = URL(opentargets_base_url)
# The schema text is fetched from the "schema" sub-path of the endpoint.
open_targets_schema_url = open_targets_base / "schema"
schema_response = open_targets_schema_url.get()
# Fail fast on an HTTP error: otherwise the error body would be stored as the
# "schema" and later embedded verbatim in the prompt sent to the model.
schema_response.raise_for_status()
open_targets_schema = schema_response.content.decode()

In [3]:
# Read the OpenAI credential from the environment; the value is None when the
# OPENAI_API_KEY variable is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [4]:
# List the models reported by the OpenAI API and collect their ids.
models = openai.Model.list()
model_ids = [entry['id'] for entry in models['data']]
# Bare expression: in a notebook this is the cell's displayed value.
sorted(model_ids)

In [5]:
# Chat model passed to every openai.ChatCompletion.create call in this file.
# Alternative that was tried: 'gpt-3.5-turbo-16k'
gpt_model = "gpt-4"

In [32]:
# Load the text that is later sent to the model under the heading
# "Here are some example queries" (Open Targets GraphQL examples kept on disk).
with open("graphql_schema.txt") as template_file:
    prompt_templates = template_file.read()

In [37]:
def generate_query_for_opentargets_using_human_language(user_input):
    """Turn a natural-language question into an Open Targets GraphQL query string.

    Two chat-completion requests are made with the model named by ``gpt_model``:

    1. The Open Targets schema plus the question, asking which resources are
       needed to answer it (reply capped at 250 tokens).
    2. That reply, the example queries from ``prompt_templates`` and a set of
       instructions, asking for the GraphQL expression (capped at 500 tokens).

    Args:
        user_input: The question in plain language.

    Returns:
        A string made of the fixed query prefix followed by the content of the
        second reply. The same prefix ends the final prompt, so the model is
        being asked to continue the query from there.
    """
    query_prefix = "query generated_query {\n  search("

    # Sampling settings that are identical for both requests.
    shared_options = dict(
        model=gpt_model,
        temperature=0,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=["###"],
    )

    # Step 1: ask which schema resources the question needs.
    schema_prompt = (
        "The following is the schema provided by Open Targets for their graphQL API:\n"
        + open_targets_schema
        + "\n\n"
        + "\nGiven this schema, what resources would be required to retrieve the results of this query: "
        + user_input
    )
    resource_response = openai.ChatCompletion.create(
        messages=[{'role': 'user', 'content': schema_prompt}],
        max_tokens=250,
        **shared_options,
    )
    required_resources = resource_response['choices'][0]['message']['content']

    # Step 2: ask for the query itself, given the resources and the examples.
    instructions = (
        'Please do not include extra variables not in the examples. Do not tranform or translate any drug, disease or gene names into chemblId, EFO ID, or ENSEMBL ID. Follow the examples strictly. Do not include extra variables in the query. Please use the resources listed and the example queries to generate a graphQL expression that fetches all the required data to complete the query: "'
        + user_input
        + "; "
        + query_prefix
    )
    graphql_messages = [
        {'role': 'user', 'content': required_resources},
        {'role': 'user', 'content': 'Here are some example queries: \n' + prompt_templates},
        {'role': 'user', 'content': instructions},
    ]
    graphql_response = openai.ChatCompletion.create(
        messages=graphql_messages,
        max_tokens=500,
        **shared_options,
    )
    return query_prefix + graphql_response['choices'][0]['message']['content']
def store_custom_query(user_input, query_string):
    """Write a generated query to a timestamped text file in the working directory.

    The file is named ``query_<timestamp>.txt`` (local time, 12-hour clock with
    an AM/PM suffix). Its first line is a ``#`` comment recording the user's
    question and the rest is the query text.

    Args:
        user_input: The natural-language question the query was generated for.
        query_string: The GraphQL query text to store.

    Returns:
        None.
    """
    query_file = "query_" + datetime.now().strftime("%Y_%m_%d-%I_%M_%S_%p") + ".txt"
    # Explicit UTF-8: questions and model output may contain non-ASCII text,
    # which the platform default encoding cannot always represent.
    with open(query_file, "w", encoding="utf-8") as f:
        f.write(f"# User input: {user_input}\n")
        f.write(query_string)
        
def perform_opentargets_query(query):
    """POST a GraphQL query to the Open Targets endpoint and return the raw body.

    Covers the cases where the Open Targets API is unreachable or the query is
    rejected as invalid. Failures are reported on stderr instead of being
    raised.

    Args:
        query: GraphQL query text, sent as the ``query`` field of a JSON body.

    Returns:
        The decoded response body as a string, or ``None`` when the request
        could not be sent or the server answered with an error status.
    """
    # Sending is handled separately from the status check so that the
    # HTTP-error branch below can always rely on ``response`` being bound.
    try:
        response = open_targets_base.post(json={"query": query})
    except Exception as err:
        # ``Exception`` rather than a bare ``except`` so Ctrl-C / SystemExit
        # still propagate.
        print("exception occurred: %s -> %s" % (str(type(err)), str(err)), file=sys.stderr)
        return None

    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError as err:
        print(err, file=sys.stderr)
        # The body of an error response carries the GraphQL error messages.
        print(response.content.decode(), file=sys.stderr)
        return None

    return response.content.decode()

def test_input(user_input):
    """Generate a query for ``user_input``, save it to disk, and execute it.

    Args:
        user_input: The question in plain language.

    Returns:
        A ``(query, response)`` tuple: the generated GraphQL query string and
        whatever ``perform_opentargets_query`` returned for it.
    """
    generated_query = generate_query_for_opentargets_using_human_language(user_input)
    store_custom_query(user_input, generated_query)
    return generated_query, perform_opentargets_query(generated_query)

In [38]:
# Natural-language questions that are run end to end through test_input.
test_cases = [
    "What are the targets of vorinostat?",
    "Find drugs that are used for treating ulcerative colitis.",
    "Which diseases are associated with the genes targeted by fasudil?",
    "What pathways are associated with breast cancer?",
    "Find the top 2 diseases associated with BRCA1",
    "Show all the diseases that have at least 5 pathways associated with Alzheimers",
]

# Parallel lists: entry i holds the generated query and the value returned by
# the API call for test_cases[i].
test_results = []
test_query_results = []
for question in test_cases:
    generated_query, api_response = test_input(question)
    test_results.append(generated_query)
    test_query_results.append(api_response)
    print(generated_query)
    print(api_response)

query generated_query {
  search(queryString: "vorinostat", entityNames: "drug", page: {index: 0, size: 1}) {
    hits {
      id
      name
      entity
      object {
        ... on Drug {
          mechanismsOfAction {
            rows {
              targets {
                id
                approvedSymbol
              }
            }
          }
        }
      }
    }
  }
}
{"data":{"search":{"hits":[{"id":"CHEMBL98","name":"VORINOSTAT","entity":"drug","object":{"mechanismsOfAction":{"rows":[{"targets":[{"id":"ENSG00000171720","approvedSymbol":"HDAC3"}]},{"targets":[{"id":"ENSG00000094631","approvedSymbol":"HDAC6"}]},{"targets":[{"id":"ENSG00000116478","approvedSymbol":"HDAC1"}]},{"targets":[{"id":"ENSG00000196591","approvedSymbol":"HDAC2"}]}]}}}]}}}
query generated_query {
  search(queryString: "ulcerative colitis", entityNames: "disease", page: {index: 0, size: 1}) {
    hits {
      id
      name
      entity
      object {
        ... on Disease {
          knownDrugs {
 

400 Client Error: Bad Request for url: https://api.platform.opentargets.org/api/v4/graphql
{"data":null,"errors":[{"message":"Unknown argument 'page' on field 'pathways' of type 'Target'. (line 13, column 26):\n                pathways(page: {index: 0, size: 5}) {\n                         ^","locations":[{"line":13,"column":26}]},{"message":"Cannot query field 'rows' on type 'ReactomePathway'. (line 14, column 19):\n                  rows {\n                  ^","locations":[{"line":14,"column":19}]}]}


In [39]:
# Count the test cases for which perform_opentargets_query returned None.
failed_count = sum(1 for outcome in test_query_results if outcome is None)
print(failed_count, "error")

1 error
