In [85]:
import json
from tqdm import tqdm
from langchain_openai import OpenAI
import os

from langchain_neo4j import Neo4jGraph, Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_neo4j import GraphCypherQAChain
from langchain.prompts.prompt import PromptTemplate
from langchain.schema import StrOutputParser

## agentic set up
from langchain.tools import Tool
from langchain.agents import AgentExecutor, create_react_agent
from langchain import hub

# TrialQuery Demo

We will show an implementation demo of the application

In [1]:
from IPython.display import Image, display
display(Image(filename='gpt_sora_ct_assistant.jpeg', width=300, height=200))

<IPython.core.display.Image object>

## Environment set up

In [55]:
os.environ['LANGCHAIN_API_KEY'] = langsmith_api_key
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ['LANGCHAIN_PROJECT'] = project_name
os.environ['OPENAI_API_KEY'] = openai_api_key

In [89]:
### make sure the LLM works
llm = OpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"))

response = llm.invoke("What is a Large Language Model?")

print(response)



A Large Language Model is a type of artificial intelligence (AI) system designed to understand and generate human language. It uses a deep learning algorithm to process vast amounts of text data and learn the patterns and rules of language. These models are typically trained on huge datasets, such as books, articles, and websites, and can generate coherent and human-like text based on a given prompt or topic. Examples of large language models include OpenAI's GPT-3 and Google's BERT. These models have shown great potential in natural language processing tasks such as text completion, translation, and question-answering.


In [136]:
openai_emb = OpenAIEmbeddings(
    openai_api_key=os.getenv("OPENAI_API_KEY"))

## Construct neo4j graph from json file

-- this script takes the raw ct data and builds the knowledge graph

In [2]:
### load the json file

with open('../data/diabetes-type2-top100.json','r') as f:
    data = json.load(f)
print(data[0])

{'protocolSection': {'identificationModule': {'nctId': 'NCT00976261', 'orgStudyIdInfo': {'id': '112534'}, 'organization': {'fullName': 'GlaxoSmithKline', 'class': 'INDUSTRY'}, 'briefTitle': 'A 2-Part Trial in Subjects With Type 2 Diabetes and in Healthy Subjects to Evaluate GSK1614235, a New Glucose Lowering Drug to Treat Type 2 Diabetes', 'officialTitle': 'A 2-Part Trial: a Randomized 6-day Repeat-dose, Parallel-group Study in Subjects With T2DM to Assess the Safety and Tolerability of GSK1614235 Compared to Placebo and Sitagliptin; and a Randomized Single-dose, Food Effect Study in Healthy Volunteers to Assess Safety and Tolerability of GSK1614235', 'acronym': 'SGA112534'}, 'statusModule': {'statusVerifiedDate': '2017-06', 'overallStatus': 'COMPLETED', 'expandedAccessInfo': {'hasExpandedAccess': False}, 'startDateStruct': {'date': '2009-10-17', 'type': 'ACTUAL'}, 'primaryCompletionDateStruct': {'date': '2010-09-05', 'type': 'ACTUAL'}, 'completionDateStruct': {'date': '2010-09-05', 't

In [27]:
### conditions
data[10]['protocolSection']['conditionsModule']['conditions']

['Diabetes Mellitus, Type 2', 'Cancer']

In [22]:
### detailed trial description
print(data[0]['protocolSection']['descriptionModule']['briefSummary'])
print("--")
print(data[0]['protocolSection']['descriptionModule']['detailedDescription'])


The purpose of Part A of this study is to test whether repeated doses of the study drug (GSK1614235) are safe and well tolerated (i.e. do not produce unacceptable side effects) and whether we can obtain some preliminary information as to whether it works in lowering blood glucose levels. We will do this by comparing the effect of the study drug with placebo (no drug present) and against a drug (sitagliptin) known to control blood glucose in the treatment of diabetes.

The purpose of Part B of this study is to determine the how the timing of dosing, relative to meals, affects the response to study drug.
--
Part A is a double-blind, randomized, parallel group, dose-ranging study, in subjects with type 2 diabetes mellitus to assess safety, tolerability, and to estimate the pharmacodynamic effects when subjects receive repeated doses of placebo, sitagliptin (Januvia), or GSK1614235. The target is to explore the best doses of GSK1614235 that provide glucose control with minimal adverse effe

In [3]:
### locations of the trial
data[0]['protocolSection']['contactsLocationsModule']['locations']

[{'facility': 'GSK Investigational Site',
  'city': 'Miami',
  'state': 'Florida',
  'zip': '33169',
  'country': 'United States',
  'geoPoint': {'lat': 25.77427, 'lon': -80.19366}},
 {'facility': 'GSK Investigational Site',
  'city': 'Baton Rouge',
  'state': 'Louisiana',
  'zip': '70808',
  'country': 'United States',
  'geoPoint': {'lat': 30.45075, 'lon': -91.15455}},
 {'facility': 'GSK Investigational Site',
  'city': 'Cincinnati',
  'state': 'Ohio',
  'zip': '45212',
  'country': 'United States',
  'geoPoint': {'lat': 39.12713, 'lon': -84.51435}},
 {'facility': 'GSK Investigational Site',
  'city': 'Gières',
  'zip': '38610',
  'country': 'France',
  'geoPoint': {'lat': 45.18273, 'lon': 5.79179}},
 {'facility': 'GSK Investigational Site',
  'city': 'Rueil-Malmaison',
  'zip': '92502',
  'country': 'France',
  'geoPoint': {'lat': 48.8765, 'lon': 2.18967}},
 {'facility': 'GSK Investigational Site',
  'city': 'Neuss',
  'state': 'Nordrhein-Westfalen',
  'zip': '41460',
  'country': '

In [28]:
#### input: data['protocolSection']
#### output: formatted result to easily create the graphs

def deep_get(d, sub_dicts,default = None):
    for key in sub_dicts:
        if isinstance(d, dict):
            d = d.get(key, default)
        else:
            return default
    return d
    
def format_ct_data(ct):
    formatted_ct = {
        "id": ct["identificationModule"]["nctId"],
        "title": ct["identificationModule"]["briefTitle"],
        "summary": deep_get(ct, ['descriptionModule','briefSummary']),
        "status": ct["statusModule"]["overallStatus"],
        "startDate": deep_get(ct, ["statusModule", "startDateStruct", "date"]),
        "completionDate": deep_get(ct, ["statusModule", "completionDateStruct", "date"]), 
        "acronym": deep_get(ct, ["identificationModule", "acronym"]),
        "sponsor_name": deep_get(ct, ["identificationModule", "organization", 'fullName']),
        "sponsor_class":deep_get(ct, ["identificationModule", "organization", 'class']),
        "locations": [
            {"city": loc_details['city'], "country": loc_details['country']}
                for loc_details in deep_get(ct, ["contactsLocationsModule","locations"], [])
        ]
    }
    
    return formatted_ct
    
"""
        "publications": [
            {
                "pmid": ref["pmid"],
                "authors":ref["citation"].split('.')[0],
                "title": ref["citation"].split(".")[1],
                "journal": ref["citation"].split(".")[2].strip(),
                "date": ref["citation"].split(".")[3].strip().split(";")[0]
            }
            for ref in ct["referencesModule"]["references"]
        ]
        """
fmt_cts_list = [format_ct_data(i['protocolSection']) for i in data]

#format_ct_data(data[0]['protocolSection'])
fmt_cts_list[0]

{'id': 'NCT00976261',
 'title': 'A 2-Part Trial in Subjects With Type 2 Diabetes and in Healthy Subjects to Evaluate GSK1614235, a New Glucose Lowering Drug to Treat Type 2 Diabetes',
 'summary': 'The purpose of Part A of this study is to test whether repeated doses of the study drug (GSK1614235) are safe and well tolerated (i.e. do not produce unacceptable side effects) and whether we can obtain some preliminary information as to whether it works in lowering blood glucose levels. We will do this by comparing the effect of the study drug with placebo (no drug present) and against a drug (sitagliptin) known to control blood glucose in the treatment of diabetes.\n\nThe purpose of Part B of this study is to determine the how the timing of dosing, relative to meals, affects the response to study drug.',
 'status': 'COMPLETED',
 'startDate': '2009-10-17',
 'completionDate': '2010-09-05',
 'acronym': 'SGA112534',
 'sponsor_name': 'GlaxoSmithKline',
 'sponsor_class': 'INDUSTRY',
 'locations':

In [5]:
### in the future include the results section in the LLM as well
data[12]['resultsSection']

{'participantFlowModule': {'preAssignmentDetails': 'Total number of patients: 12 in a crossover intervention. Interventions: 2. Washout period before the cross-over intervention. Protocol violations were detected at the end of the study in patients of both groups',
  'groups': [{'id': 'FG000',
    'title': 'Insulin Degludec - Insulin Glargine',
    'description': 'Insulin Degludec: 10 IU subcutaneous (SC) every 24 hours for 6 days Washout period for 14 days Insulin Glargine 10 IU SC every 24 hours for 6 days'},
   {'id': 'FG001',
    'title': 'Insulin Glargine - Insulin Degludec',
    'description': 'Insulin Glargine: 10 IU subcutaneous (SC) every 24 hours for 6 days Washout period for 14 days Insulin Degludec 10 IU SC every 24 hours for 6 days'}],
  'periods': [{'title': 'Initial Intervention',
    'milestones': [{'type': 'STARTED',
      'achievements': [{'groupId': 'FG000', 'numSubjects': '6'},
       {'groupId': 'FG001', 'numSubjects': '6'}]},
     {'type': 'COMPLETED',
      'achi

# Connect to a local database

In [167]:
### if connecting to a local service
URI = "bolt://0.0.0.0:7687"
AUTH = ("neo4j", "password")

graph = Neo4jGraph(
    url=URI,
    username=AUTH[0],
    password=AUTH[1]
)

In [254]:
URI = "neo4j+s://156bc6bc.databases.neo4j.io"
AUTH = ("neo4j", "1kpni9FOT0LtarHj5LemDrQVPra6hQBiDEkn7A7F8xQ")

graph = Neo4jGraph(
    url=URI,
    username=AUTH[0],
    password=AUTH[1]
)

## add new data

In [166]:
#####
### delete data
####
#graph.query("""
#    MATCH (n)
#    DETACH DELETE n
#""")

[]

In [273]:
graph.query("""
MATCH (s:Sponsor {name: "GlaxoSmithKline"})-[:SPONSORS]->(ct:ClinicalTrial)-[:IN_CITY]->(c:City)
WHERE ct.title CONTAINS "Diabetes"
RETURN c.name
""")

[{'c.name': 'Miami'},
 {'c.name': 'Baton Rouge'},
 {'c.name': 'Cincinnati'},
 {'c.name': 'Gières'},
 {'c.name': 'Rueil-Malmaison'},
 {'c.name': 'Neuss'},
 {'c.name': 'Berlin'},
 {'c.name': 'Hamburg'},
 {'c.name': 'Cambridge'},
 {'c.name': 'Miami'},
 {'c.name': 'Guadalajara'},
 {'c.name': 'Springfield'},
 {'c.name': 'Sherbrooke'},
 {'c.name': 'Chicago'},
 {'c.name': 'Portland'},
 {'c.name': 'Beaver'},
 {'c.name': 'Philadelphia'},
 {'c.name': 'Columbia'},
 {'c.name': 'Kingsport'},
 {'c.name': 'Dallas'},
 {'c.name': 'Midland'},
 {'c.name': 'Cuernavaca'},
 {'c.name': 'Monterrey'},
 {'c.name': 'Edmonton'},
 {'c.name': 'Rochester'},
 {'c.name': 'Salt Lake City'},
 {'c.name': 'Tucson'},
 {'c.name': 'Fresno'},
 {'c.name': 'Atlanta'},
 {'c.name': 'Billings'},
 {'c.name': 'Clinton'},
 {'c.name': 'Spokane'},
 {'c.name': 'Keswick'},
 {'c.name': 'Box Hill'},
 {'c.name': 'Toronto'},
 {'c.name': 'Pointe-Claire'},
 {'c.name': 'Burke'},
 {'c.name': 'Manassas'},
 {'c.name': 'Wheat Ridge'},
 {'c.name': '

In [256]:
def insert_data(graph, row):
    graph.query("""
    CREATE (t:ClinicalTrial {id:$id})
    SET t.title = $title
    SET t.summary = $summary
    SET t.status = $status
    SET t.start_date = $startDate
    MERGE (sp:Sponsor {name:$sponsor_name, class:$sponsor_class})    
    WITH t, sp
    MERGE (t) <-[:SPONSORS]- (sp) 

    WITH $locations AS locations, t
    UNWIND locations AS l
    
    WITH t, l 
    WHERE l.city IS NOT NULL
    MERGE (city:City {name: l.city})
    MERGE (t)-[:IN_CITY]->(city)
    
    WITH t, l, city
    WHERE l.country is NOT NULL
    MERGE (country:Country {name: l.country})
    MERGE (t)-[:IN_COUNTRY]->(country)
    
    WITH l, city, country
    WHERE l.city is NOT NULL AND l.country IS NOT NULL
    MERGE (city)-[:PART_OF]->(country)
    
    """, row)

In [257]:
for r in tqdm(fmt_cts_list):
    insert_data(graph, r)

100%|██████████| 100/100 [00:21<00:00,  4.67it/s]


In [258]:
records= graph.query("MATCH (n:ClinicalTrial) RETURN n LIMIT 1")
print(records)

[{'n': {'summary': 'The purpose of Part A of this study is to test whether repeated doses of the study drug (GSK1614235) are safe and well tolerated (i.e. do not produce unacceptable side effects) and whether we can obtain some preliminary information as to whether it works in lowering blood glucose levels. We will do this by comparing the effect of the study drug with placebo (no drug present) and against a drug (sitagliptin) known to control blood glucose in the treatment of diabetes.\n\nThe purpose of Part B of this study is to determine the how the timing of dosing, relative to meals, affects the response to study drug.', 'id': 'NCT00976261', 'title': 'A 2-Part Trial in Subjects With Type 2 Diabetes and in Healthy Subjects to Evaluate GSK1614235, a New Glucose Lowering Drug to Treat Type 2 Diabetes'}}]


In [259]:
records[0]['n'].keys()

dict_keys(['summary', 'id', 'title'])

## Add embeddings for vector search

In [261]:
#####
# create embeddings for all movie nodes
####

records = graph.query('MATCH (c:ClinicalTrial) RETURN c.id AS id, c.title AS title, c.summary AS summary')
print("N trials:", len(records))
print(records[0])

def import_batch(graph, nodes_with_embeddings, batch_n):
    # Add embeddings to Movie nodes
    graph.query('''
        UNWIND $trials as trial
        MATCH (c:ClinicalTrial {id: trial.id})
        CALL db.create.setNodeVectorProperty(c, 'embedding', trial.embedding)
    ''', params = {"trials":nodes_with_embeddings})
    print(f'Processed batch {batch_n}.')
    
batch_size = 50
batch_n = 1
trials_with_embeddings = []
    
for record in tqdm(records):
    nct_id = record.get('id')
    title = record.get('title')
    summary = record.get('summary')
    
    # Create embedding for title and plot
    if title is not None and summary is not None:
        trials_with_embeddings.append({'id':nct_id,
                                      'embedding':openai_embeddings.embed_query(f'''Title: {title}\nSummary: {summary}''')})
        # Import when a batch of movies has embeddings ready; flush buffer
        if len(trials_with_embeddings) == batch_size:
            import_batch(graph, trials_with_embeddings, batch_n)
            trials_with_embeddings = []
            batch_n += 1

# Flush last batch
import_batch(graph, trials_with_embeddings, batch_n)

  0%|          | 0/100 [00:00<?, ?it/s]

N trials: 100
{'id': 'NCT00976261', 'title': 'A 2-Part Trial in Subjects With Type 2 Diabetes and in Healthy Subjects to Evaluate GSK1614235, a New Glucose Lowering Drug to Treat Type 2 Diabetes', 'summary': 'The purpose of Part A of this study is to test whether repeated doses of the study drug (GSK1614235) are safe and well tolerated (i.e. do not produce unacceptable side effects) and whether we can obtain some preliminary information as to whether it works in lowering blood glucose levels. We will do this by comparing the effect of the study drug with placebo (no drug present) and against a drug (sitagliptin) known to control blood glucose in the treatment of diabetes.\n\nThe purpose of Part B of this study is to determine the how the timing of dosing, relative to meals, affects the response to study drug.'}


 50%|█████     | 50/100 [00:27<00:43,  1.16it/s]

Processed batch 1.


100%|██████████| 100/100 [00:45<00:00,  2.19it/s]

Processed batch 2.





Processed batch 3.


In [262]:
graph.query("""MATCH (c:ClinicalTrial)
RETURN c LIMIT 1""")

[{'c': {'summary': 'The purpose of Part A of this study is to test whether repeated doses of the study drug (GSK1614235) are safe and well tolerated (i.e. do not produce unacceptable side effects) and whether we can obtain some preliminary information as to whether it works in lowering blood glucose levels. We will do this by comparing the effect of the study drug with placebo (no drug present) and against a drug (sitagliptin) known to control blood glucose in the treatment of diabetes.\n\nThe purpose of Part B of this study is to determine the how the timing of dosing, relative to meals, affects the response to study drug.',
   'id': 'NCT00976261',
   'embedding': [-0.003899355186149478,
    -0.010431353934109211,
    0.02976606972515583,
    -0.012005648575723171,
    0.0056092506274580956,
    0.021775534376502037,
    -0.0028443133924156427,
    -0.0003346615703776479,
    -0.035851575434207916,
    -0.022661900147795677,
    0.015332832932472229,
    0.00832127034664154,
    -0.01

In [263]:
####
### create vector index -- convert movie nodes to vector indexes
####
graph.query("""
    CREATE VECTOR INDEX ctDescription IF NOT EXISTS
    FOR (c:ClinicalTrial)
    ON c.embedding
    OPTIONS {indexConfig: {
     `vector.dimensions`: 1536,
     `vector.similarity_function`: 'cosine'
    }}
""")

[]

In [264]:
graph.query('SHOW INDEXES  YIELD id, name, type, state, populationPercent WHERE type = "VECTOR"')

[{'id': 2,
  'name': 'ctDescription',
  'type': 'VECTOR',
  'state': 'ONLINE',
  'populationPercent': 100.0}]

In [210]:
### drop 
#graph.query("DROP INDEX ctDescription IF EXISTS")

[]

In [265]:
### find similar trials....

graph.query("""
MATCH (c:ClinicalTrial {id:"NCT00976261"})

CALL db.index.vector.queryNodes('ctDescription', 2, c.embedding)
YIELD node, score

RETURN node.id AS nct_id, node.title AS title, score
""")

[{'nct_id': 'NCT00976261',
  'title': 'A 2-Part Trial in Subjects With Type 2 Diabetes and in Healthy Subjects to Evaluate GSK1614235, a New Glucose Lowering Drug to Treat Type 2 Diabetes',
  'score': 0.997406005859375},
 {'nct_id': 'NCT01435616',
  'title': 'A Study in Patients With Type 2 Diabetes Mellitus',
  'score': 0.9452056884765625}]

In [266]:
ct_trial_vector = Neo4jVector.from_existing_index(
    openai_emb,
    graph=graph,
    index_name="ctDescription",
#    embedding_node_property="embedding",
    retrieval_query = """
    RETURN node.title AS text, score,
    {
        id: node.id,
        source: 'https://clinicaltrials.gov/study/'+node.id,
        summary: node.summary
    } AS metadata
    """
)

In [267]:
result = ct_trial_vector.similarity_search("Trials that evaluate Glucose lowering drug for Type 2 Diabetes", k = 1)
result[0].metadata

{'summary': 'The purpose of Part A of this study is to test whether repeated doses of the study drug (GSK1614235) are safe and well tolerated (i.e. do not produce unacceptable side effects) and whether we can obtain some preliminary information as to whether it works in lowering blood glucose levels. We will do this by comparing the effect of the study drug with placebo (no drug present) and against a drug (sitagliptin) known to control blood glucose in the treatment of diabetes.\n\nThe purpose of Part B of this study is to determine the how the timing of dosing, relative to meals, affects the response to study drug.',
 'id': 'NCT00976261',
 'source': 'https://clinicaltrials.gov/study/NCT00976261'}

In [268]:
for doc in result:
    print('NCT ID:',doc.metadata['id'],'\nLink:',doc.metadata['source'], '\nTitle:', doc.page_content,'\nSummary:',doc.metadata['summary'])
    print("--")


NCT ID: NCT00976261 
Link: https://clinicaltrials.gov/study/NCT00976261 
Title: A 2-Part Trial in Subjects With Type 2 Diabetes and in Healthy Subjects to Evaluate GSK1614235, a New Glucose Lowering Drug to Treat Type 2 Diabetes 
Summary: The purpose of Part A of this study is to test whether repeated doses of the study drug (GSK1614235) are safe and well tolerated (i.e. do not produce unacceptable side effects) and whether we can obtain some preliminary information as to whether it works in lowering blood glucose levels. We will do this by comparing the effect of the study drug with placebo (no drug present) and against a drug (sitagliptin) known to control blood glucose in the treatment of diabetes.

The purpose of Part B of this study is to determine the how the timing of dosing, relative to meals, affects the response to study drug.
--


## Semantic search set up

We can create a retrieval wrapper to conduct the search based on the input query.

In [232]:
#### retrieval_query allows us to input whatever additional info we want about the nodes
### e.g we can pass the nodes into our retrieval query to pull connected data of those similar nodes. 
## We also have to return the text, score, and metadata variables.
### refer to this blog post for more info: https://neo4j.com/blog/developer/rag-graph-retrieval-query-langchain/
ct_trial_vector = Neo4jVector.from_existing_index(
    openai_emb,
    graph=graph,
    index_name="ctDescription",
    retrieval_query = """
    RETURN node.title AS text, score,
    {
        id: node.id,
        source: 'https://clinicaltrials.gov/study/'+node.id,
        summary: node.summary
    } AS metadata
    """
)

ct_trial_retriever = ct_trial_vector.as_retriever()

# Create the prompt
instructions = (
    "Use the given context to answer the question."
    "If you don't know the answer, say you don't know."
    "Context: {context}"
)

prompt = ChatPromptTemplate.from_messages(
    [("system", instructions),
     ("human", "{input}")
     ]
)

# Create the chain
question_answer_chain = create_stuff_documents_chain(llm, prompt)
ct_retriever = create_retrieval_chain(
    ct_trial_retriever,
    question_answer_chain
)

### we will use the retriever to get context around relevant documents (i.e. vector search RAG)
response = ct_retriever.invoke(
    {"input": "A trial that evaluates Glucose lowering drug for Type 2 Diabetes"}
)

print(response)

{'input': 'A trial that evaluates Glucose lowering drug for Type 2 Diabetes', 'context': [Document(metadata={'id': 'NCT00976261', 'summary': 'The purpose of Part A of this study is to test whether repeated doses of the study drug (GSK1614235) are safe and well tolerated (i.e. do not produce unacceptable side effects) and whether we can obtain some preliminary information as to whether it works in lowering blood glucose levels. We will do this by comparing the effect of the study drug with placebo (no drug present) and against a drug (sitagliptin) known to control blood glucose in the treatment of diabetes.\n\nThe purpose of Part B of this study is to determine the how the timing of dosing, relative to meals, affects the response to study drug.', 'source': 'https://clinicaltrials.gov/study/NCT00976261'}, page_content='A 2-Part Trial in Subjects With Type 2 Diabetes and in Healthy Subjects to Evaluate GSK1614235, a New Glucose Lowering Drug to Treat Type 2 Diabetes'), Document(metadata={

## Graph based search set up

We will use a language model to generate Cypher queries to query a Neo4j graph database

In [125]:
CYPHER_GENERATION_TEMPLATE = """
You are an expert Neo4j Developer translating user questions into Cypher to answer questions about movies and provide recommendations.
Convert the user's question based on the schema.

Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.

Do not return node embeddings.

Example Cypher Statements:

1. To find number of clinical trials in the US:
```
MATCH (ct:ClinicalTrial)-[:IN_COUNTRY]->(c:Country {{name: "United States"}})
RETURN COUNT(ct)
```

Schema: {schema}
Question: {question}

Cypher Query:
"""

cypher_generation_prompt = PromptTemplate(
    template=CYPHER_GENERATION_TEMPLATE,
    input_variables=["schema", "question"],
)

cypher_chain = GraphCypherQAChain.from_llm(
    llm,
    graph=graph,
    cypher_prompt=cypher_generation_prompt,
    verbose=True,
    allow_dangerous_requests=True ### be careful here
)

In [127]:
result = cypher_chain.invoke({"query": "What is the title of a Type 2 Diabetes trial in the United States?"})
print(result)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m
MATCH (ct:ClinicalTrial)-[:IN_COUNTRY]->(c:Country {name: "United States"})
WHERE ct.title CONTAINS "Type 2 Diabetes"
RETURN ct.title
[0m
Full Context:
[32;1m[1;3m[{'ct.title': 'A 2-Part Trial in Subjects With Type 2 Diabetes and in Healthy Subjects to Evaluate GSK1614235, a New Glucose Lowering Drug to Treat Type 2 Diabetes'}, {'ct.title': 'Evaluation of the Impact of Intensive Short-Term Drug Therapy in Patients With Type 2 Diabetes Mellitus'}, {'ct.title': 'Changing Physical Activity Behavior in Individuals With Type 2 Diabetes Using Counceling and Information From Continuous Glucose Monitoring'}, {'ct.title': 'Evaluate HM-002-1005 in Subjects With Type 2 Diabetes Mellitus'}, {'ct.title': 'Study of Rivoglitazone in Type 2 Diabetes Mellitus'}, {'ct.title': 'Evaluation of Intra-Abdominal Fat Extraction Using HydraSolve T2D™ in Obese Subjects With Type 2 Diabetes'}, {'ct.title': 'Efficacy of Mifepri

In [131]:
### note: the model hallucinates here ; keep running until you get the output listed below (trust me, you will)
### one way to reduce hallucination is by providing examples...
result = cypher_chain.invoke({"query": "How many Type 2 Diabetes trials are there in the United States?"})
result



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m
MATCH (ct:ClinicalTrial)-[:IN_COUNTRY]->(c:Country {name: "United States"})
WHERE ct.title CONTAINS "Type 2 Diabetes"
RETURN COUNT(ct)[0m
Full Context:
[32;1m[1;3m[{'COUNT(ct)': 16}][0m

[1m> Finished chain.[0m


{'query': 'How many Type 2 Diabetes trials are there in the United States?',
 'result': ' The United States has a total of 16 Type 2 Diabetes trials.'}

In [132]:
### lets verify this is working
graph.query("""
MATCH (ct:ClinicalTrial)-[:IN_COUNTRY]->(c:Country {name: "United States"})
WHERE ct.title CONTAINS "Type 2 Diabetes"
RETURN COUNT(ct)
""")

[{'COUNT(ct)': 16}]

## Wrap it into an agentic framework

We have two set ups here : 

(a) Vector based search -- we search for similar clinical trials based on the query and return the output

(b) Graph based search -- we conduct a graph based search

These are essentially two **Tools** that help us answer the prompt. 

> We can design an *agentic* framework where the LLM chooses the desired tools and generates the response. 

In [249]:
#### prompt wrappers

####
##### 1. Create a ct chat chain
####
chat_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a clinical trials expert providing information about clinical trials."),
        ("human", "{input}"),
    ]
)
ct_chat = chat_prompt | llm | StrOutputParser()

####
### 2. semantic search wrapper
####

ct_trial_vector = Neo4jVector.from_existing_index(
    openai_emb,
    graph=graph,
    index_name="ctDescription",
    retrieval_query = """
    RETURN node.title AS text, score,
    {
        id: node.id,
        source: 'https://clinicaltrials.gov/study/'+node.id,
        summary: node.summary
    } AS metadata
    """
)

ct_trial_retriever = ct_trial_vector.as_retriever()

# Create the prompt
instructions = (
    "Use the given context to answer the question."
    "If you don't know the answer, say you don't know."
    "Context: {context}"
)

prompt = ChatPromptTemplate.from_messages(
    [("system", instructions),
     ("human", "{input}")
     ]
)

# Create the chain
question_answer_chain = create_stuff_documents_chain(llm, prompt)
ct_retriever = create_retrieval_chain(
    ct_trial_retriever,
    question_answer_chain
)

# Create a function to call the chain
def get_ct_description(input):
    return ct_retriever.invoke({"input":input})

#### 
### 3. graph cypher search
####

CYPHER_GENERATION_TEMPLATE = """
You are an expert Neo4j Developer translating user questions into Cypher to answer questions about movies and provide recommendations.
Convert the user's question based on the schema.

Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.

Example Cypher Statements:

1. To find number of clinical trials in the US:
```
MATCH (ct:ClinicalTrial)-[:IN_COUNTRY]->(c:Country {{name: "United States"}})
RETURN COUNT(ct)
```

2. Get diabetes clinical trials:
```
MATCH (ct:ClinicalTrial)
WHERE ct.title CONTAINS 'diabetes'
RETURN ct.id, ct.title, ct.summary
```


Schema: {schema}
Question: {question}

Cypher Query:
"""

cypher_generation_prompt = PromptTemplate(
    template=CYPHER_GENERATION_TEMPLATE,
    input_variables=["schema", "question"],
)

cypher_chain = GraphCypherQAChain.from_llm(
    llm,
    graph=graph,
    cypher_prompt=cypher_generation_prompt,
    verbose=True,
    allow_dangerous_requests=True ### be careful here
)


In [250]:
####
### tool wrappers
####

# Create a set of tools
tools = [
    Tool.from_function(
        name="General Chat",
        description="For general clinical trials chat not covered by other tools",
        func=ct_chat.invoke,
    ),
    Tool.from_function(
        name = "Clinical trial Search",
        description = "For when you need to search clinical trials based on desciption of the trial",
        func=get_ct_description,
    ),
    Tool.from_function(
        name = 'Clinical trials knowledge information',
        description = 'Answer clinical trials related questions that require a knowledge graph to answer using Cypher',
        func = cypher_chain
    )
]


In [251]:
prompt = hub.pull("hwchase17/react")
agent = create_react_agent(llm, tools, prompt)
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    verbose=True
)

In [252]:
agent_executor.invoke(
    {
        "input": "What's the summary of trial NCT04027023?",
    }
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I should use the Clinical trial Search tool to find the summary of this trial.
Action: Clinical trial Search
Action Input: NCT04027023[0m[33;1m[1;3m{'input': 'NCT04027023', 'context': [Document(metadata={'id': 'NCT04027023', 'summary': 'This phase IV study is a prospective open-label multi-center study to investigate the effect of a temporary individualized poly-pharmaceutical De-escalation treatment with the target to regenerate ß-cell function over 12 weeks on the disease stage and glycemic control in patients with type 2 diabetes. This is an uncontrolled pilot study to collect data for later confirmatory trials.', 'source': 'https://clinicaltrials.gov/study/NCT04027023'}, page_content='Evaluation of the Impact of Intensive Short-Term Drug Therapy in Patients With Type 2 Diabetes Mellitus'), Document(metadata={'id': 'NCT04686201', 'summary': "this study is to measure the effectiveness of a diabetes mobile application fo

{'input': "What's the summary of trial NCT04027023?",
 'output': 'This phase IV study is a prospective open-label multi-center study to investigate the effect of a temporary individualized poly-pharmaceutical De-escalation treatment with the target to regenerate ß-cell function over 12 weeks on the disease stage and glycemic control in patients with type 2 diabetes.'}