## Path Setup
Add the parent directory to the Python path so that the notebook can find the modules

In [1]:
import sys
import os

cwd = os.getcwd() # Current working directory
dirname = os.path.dirname(cwd) # Parent directory
print(cwd)
print(dirname)
sys.path.append(dirname)# Add the parent directory to the Python path
print(sys.path)

/Users/idekeradmin/Dropbox/GitHub/agent_evaluation/notebooks
/Users/idekeradmin/Dropbox/GitHub/agent_evaluation
['/Users/idekeradmin/Dropbox/GitHub/agent_evaluation/notebooks', '/opt/anaconda3/lib/python311.zip', '/opt/anaconda3/lib/python3.11', '/opt/anaconda3/lib/python3.11/lib-dynload', '', '/Users/idekeradmin/.local/lib/python3.11/site-packages', '/opt/anaconda3/lib/python3.11/site-packages', '/opt/anaconda3/lib/python3.11/site-packages/aeosa', '/Users/idekeradmin/Dropbox/GitHub/agent_evaluation']


## Get a Hierarchy


In [2]:
from agent_evaluation.hierarchy import Hierarchy
import json
import ndex2 
from ndex2.cx2 import RawCX2NetworkFactory

# Create NDEx2 python client
client = ndex2.client.Ndex2()

# Create CX2Network factory
factory = RawCX2NetworkFactory()

# Download BioGRID: Protein-Protein Interactions (SARS-CoV) from NDEx
# https://www.ndexbio.org/viewer/networks/669f30a3-cee6-11ea-aaef-0ac135e8bacf
# client_resp = client.get_network_as_cx2_stream('669f30a3-cee6-11ea-aaef-0ac135e8bacf')

# Dengue string interactome network c223d6db-b0e2-11ee-8a13-005056ae23aa
client_resp = client.get_network_as_cx2_stream('c223d6db-b0e2-11ee-8a13-005056ae23aa')

# Convert downloaded interactome network to CX2Network object
interactome = factory.get_cx2network(json.loads(client_resp.content))

# Dengue hierarchy
# https://www.ndexbio.org/viewer/networks/59bbb9f1-e029-11ee-9621-005056ae23aa
client_resp = client.get_network_as_cx2_stream('59bbb9f1-e029-11ee-9621-005056ae23aa')

# Convert downloaded interactome network to CX2Network object
hierarchy = factory.get_cx2network(json.loads(client_resp.content))

# Display information about the hierarchy network and output 1st 100 characters of CX2
print('Name: ' + hierarchy.get_name())
print('Number of nodes: ' + str(len(hierarchy.get_nodes())))
print('Number of nodes: ' + str(len(hierarchy.get_edges())))

# Display information about the interactome network 
print('Name: ' + interactome.get_name())
print('Number of nodes: ' + str(len(interactome.get_nodes())))
print('Number of nodes: ' + str(len(interactome.get_edges())))


Name: Dengue model - hidef string 12.0 0.7 (GPT-4 annotated) - L2R
Number of nodes: 203
Number of nodes: 249
Name: dengue string 12.0 0.7
Number of nodes: 1375
Number of nodes: 2792


## Get Datasets

In [3]:
dengue_hierarchy = Hierarchy(hierarchy, interactome)
print(dengue_hierarchy.get_experiment_description())
datasets = dengue_hierarchy.get_datasets(member_attributes=["name", "GeneSymbol", "log2FC_48hpi", "pvalue_48hpi", "log2FC_72hpi", "pvalue_72hpi"],
                                         filter={"max_size": 4})[0:2]
for dataset in datasets:
    print(dataset.data)

None
[{'name': 'DEFB1', 'GeneSymbol': 'DEFB1', 'log2FC_48hpi': 6.97063877863496}, {'name': 'DEFA5', 'GeneSymbol': 'DEFA5'}, {'name': 'BRDT', 'GeneSymbol': 'BRDT'}, {'name': 'DEFB105A', 'GeneSymbol': 'DEFB105'}]
[{'name': 'PEMT', 'log2FC_48hpi': -2.09090804358616}, {'name': 'CHKA', 'GeneSymbol': 'CHKA'}, {'name': 'PTDSS2', 'GeneSymbol': 'PTDSS2'}, {'name': 'PLA1A', 'log2FC_48hpi': 4.07675395617054}]


## Analyst Agents

In [4]:
from agent_evaluation.analyst import Analyst
from agent_evaluation.llm import OpenAI_LLM

gpt35_turbo_1106 = OpenAI_LLM("gpt-3.5-turbo-1106")


analyst_1_context = """
You are a helpful analyst of genomic, proteomic, and other biological data. 
"""

analyst_1_prompt_template = """ 
The attached proteomics "dataset" includes interacting proteins and the measurements of their differential abundance as a ratio between treated and non-treated samples, where the treatment is the infection of human cells with Dengue virus. 
Not all proteins in the dataset have differential abundance measurements.

The dataset has 2 columns with the following headers: name, DV3_24h-Mock_24h. 
The first column contains the protein names and the last columns contains the abundance data. 

Your task is to leverage this dataset to analyze a subset of interacting proteins that are defined as “proteins of interest".

First, determine what proteins of interest show a differential abundance recorded in the dataset. 
Then, based on this information and on the known functions of all other proteins of interest, 
I want you to generate a novel hypothesis describing the mechanisms that may contribute to the disease state 
and could potentially be targeted by drug therapies. 

When presenting your results, please adhere to the following guidelines:

- Avoid including any code.
- Do not describe the analytical steps you took.
- Do not merely list the proteins of interest, regardless whether they show a differential abundance recorded in the dataset or not.
- Build your hypotheses taking into consideration the interplay among all proteins of interest, not only those that show a differential abundance in the dataset.

- Your output should consist solely of the identified proteins of interest with changed abundance levels, the hypotheses you propose, and the reasons supporting these hypotheses.

Here is the set of proteins of interest: 
{data}
"""

analyst_1 = Analyst(gpt35_turbo_1106, analyst_1_context, analyst_1_prompt_template, "Jane", "The first analyst")

analyst_2_context = """
You are a helpful analyst of genomic, proteomic, and other biological data. 
"""

analyst_2_prompt_template = """
The differential abundance of the following proteins were measured in a dengue infection experiment. 
Propose a novel hypothesis for the mechanism of action of these proteins in the context of dengue infection, 
given the known functions of these proteins and the observed changes in abundance.
{data}
"""

analyst_2 = Analyst(gpt35_turbo_1106, analyst_2_context, analyst_2_prompt_template, "John", "The second analyst")

Model: gpt-3.5-turbo-1106, Temperature: 0, Max Tokens: 2048, Seed: 42


## The TestPlan

In [5]:
from agent_evaluation.test import TestPlan
test_plan = TestPlan(analysts=[analyst_1, analyst_2], datasets=datasets)


## Run the Test

In [6]:
from agent_evaluation.test import Test

test = Test(test_plan)
test.run()

Generating hypothesis by Jane on [{'name': 'DEFB1', 'GeneSymbol': 'DEFB1', 'log2FC_48hpi': 6.97063877863496}, {'name': 'DEFA5', 'GeneSymbol': 'DEFA5'}, {'name': 'BRDT', 'GeneSymbol': 'BRDT'}, {'name': 'DEFB105A', 'GeneSymbol': 'DEFB105'}]
Generating hypothesis by John on [{'name': 'DEFB1', 'GeneSymbol': 'DEFB1', 'log2FC_48hpi': 6.97063877863496}, {'name': 'DEFA5', 'GeneSymbol': 'DEFA5'}, {'name': 'BRDT', 'GeneSymbol': 'BRDT'}, {'name': 'DEFB105A', 'GeneSymbol': 'DEFB105'}]
Generating hypothesis by Jane on [{'name': 'PEMT', 'log2FC_48hpi': -2.09090804358616}, {'name': 'CHKA', 'GeneSymbol': 'CHKA'}, {'name': 'PTDSS2', 'GeneSymbol': 'PTDSS2'}, {'name': 'PLA1A', 'log2FC_48hpi': 4.07675395617054}]
Generating hypothesis by John on [{'name': 'PEMT', 'log2FC_48hpi': -2.09090804358616}, {'name': 'CHKA', 'GeneSymbol': 'CHKA'}, {'name': 'PTDSS2', 'GeneSymbol': 'PTDSS2'}, {'name': 'PLA1A', 'log2FC_48hpi': 4.07675395617054}]


In [7]:
for hypothesis in test.hypotheses:
    print(hypothesis.analyst.name)
    print(hypothesis.description)
    print("---")

Jane
('The proteins of interest with differential abundance recorded in the dataset are DEFB1 and DEFA5. \n\nBased on the known functions of all proteins of interest and their potential interplay, a novel hypothesis can be proposed. \n\nHypothesis:\nThe differential abundance of DEFB1 and DEFA5, along with the presence of BRDT and DEFB105A, suggests a potential mechanism contributing to the disease state of Dengue virus infection. DEFB1 and DEFA5 are known to be involved in the immune response and have antimicrobial properties. Their increased abundance may indicate an attempt by the host cells to combat the viral infection. BRDT, a testis-specific protein, has been implicated in chromatin remodeling and transcriptional regulation. Its presence in the context of viral infection may suggest a role in modulating the host cell response to the virus. DEFB105A, a member of the defensin family, may also contribute to the host defense mechanism.\n\nGiven these observations, it is hypothesized