# Notebook for creating agents programmatically

### Set up parent directory

In [1]:
import sys
import os

# Make the parent of the current working directory importable so that the
# project packages used below (`models`, `services`, `app`) can be imported.
# NOTE: this depends on the kernel's working directory; the recorded run was
# started from the `notebooks/` folder of the repository.
cwd = os.getcwd()
dirname = os.path.dirname(cwd)
print(cwd)
print(dirname)

# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if dirname not in sys.path:
    sys.path.append(dirname)

print(sys.path)

/Users/samuelpeccoud/Documents/Development/Cytoscape_internship/agent_evaluation/notebooks
/Users/samuelpeccoud/Documents/Development/Cytoscape_internship/agent_evaluation
['/usr/local/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python311.zip', '/usr/local/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11', '/usr/local/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/lib-dynload', '', '/Users/samuelpeccoud/Library/Python/3.11/lib/python/site-packages', '/usr/local/lib/python3.11/site-packages', '/usr/local/opt/python-tk@3.11/libexec', '/Users/samuelpeccoud/Documents/Development/Cytoscape_internship/agent_evaluation']


### Load database 

- Make sure a `~/ae_config/config.ini` file exists with all the configuration settings, and that the database file `~/ae_database/ae_database.db` exists.

In [2]:
from models.analysis_plan import AnalysisPlan
from services.analysisrunner import AnalysisRunner
from app.sqlite_database import SqliteDatabase
from app.config import load_database_config
%reload_ext autoreload
%autoreload 2

# Read the connection settings. The loader is called without arguments here;
# pass `path='~/ae_config/test_config.ini'` to target the test configuration
# instead. It returns four values (db type, URI, user, password); only the
# URI is needed to open the SQLite database.
_db_type, database_uri, _user, _password = load_database_config()
db = SqliteDatabase(database_uri)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Look up objects of type "analyst" in the database and print whatever
# `find` returns.
# NOTE(review): presumably this is every stored analyst record — confirm
# against SqliteDatabase.find.
analysts = db.find("analyst")
print(analysts)

### Load Data 

- Make sure the dataset has been uploaded to the database before running this cell.

In [3]:
from io import StringIO

import pandas as pd
from models.dataset import Dataset

# Identifier of the dengue dataset record to load from the database.
dengue_dataset_id = 'dataset_41d106e0-7268-4ac4-b933-b124e469df46'

# Fetch the dataset object; its `data` attribute is parsed as CSV text below.
dengue_dataset = Dataset.load(db, dengue_dataset_id)

# Wrap the CSV text in an in-memory buffer so pandas can parse it directly.
df = pd.read_csv(StringIO(dengue_dataset.data))

# Preview the first rows.
df.head()

Unnamed: 0,A - HGNC,B - Protein abundance 24 hours after infection vs mock control,C - Protein abundance 48 hours after infection vs mock control,"D - siRNA Screen Average Zscore (the higher the score, the stronger the negative effect on viral replication)",E - mRNA expression 24 hours after infection (log2FC_24hpi),F - mRNA expression 48 hours after infection (log2FC_48hpi)
0,ZWILCH,,,1.406935,,
1,ZSCAN9,,,1.680725,,
2,ZSCAN16,,,1.882438,,
3,ZSCAN10,,,1.498169,,
4,ZNRF3,,,1.363028,,


### Create a new LLM 

In [4]:
from models.llm import LLM
from models.analyst import Analyst

# Settings for the new LLM record. Edit type, model_name, max_tokens, seed
# or temperature here to register a different model.
llm_settings = {
    "type": "Groq",
    "model_name": "llama-3.1-70b-versatile",
    "max_tokens": 2048,
    "seed": 42,
    "temperature": 0.5,
    "name": 'testLLM_llama3.1',
    "description": 'test run',
}

# Create the LLM entry. Treat the returned object as read-only: the database
# should be the source of truth.
new_llm_llama = LLM.create(db, **llm_settings)

print(new_llm_llama.object_id)
print(vars(new_llm_llama))

llm_94c7a932-4ef3-44b8-933b-fb1275357413
{'db': <app.sqlite_database.SqliteDatabase object at 0x155530275710>, 'type': 'Groq', 'model_name': 'llama-3.1-70b-versatile', 'max_tokens': 2048, 'seed': 42, 'temperature': 0.5, 'object_id': 'llm_94c7a932-4ef3-44b8-933b-fb1275357413', 'created': {'type': 'Groq', 'model_name': 'llama-3.1-70b-versatile', 'max_tokens': 2048, 'seed': 42, 'temperature': 0.5, 'name': 'testLLM_llama3.1', 'description': 'test run', 'created': '07.30.2024 11:54:00'}}


### Create prompt template

In [5]:
# Context string passed to the analyst together with the prompt template
# (both are handed to Analyst.create further down).
agent_context = 'You are an insightful assistant to a molecular biologist'

# Prompt template with three named placeholders: {experiment_description},
# {data} and {biological_context}.
# NOTE(review): the placeholders are presumably filled in by the analysis
# code outside this notebook — confirm where the substitution happens.
prompt_template = '''
{experiment_description}

Here is the dataset: {data}

Your task is to use the provided dataset to generate a hypothesis describing mechanisms that may contribute to the disease state and could potentially be targeted by drug therapies. Each hypothesis should meet the following criteria:

1. Incorporate Data Columns: use the specific columns A, B, C, D, E and F from the provided dataset.

2. Verify Data Points: ensure that each data point (protein abundance, mRNA expression, siRNA Z-score) used in your hypothesis is directly extracted from and cross-checked with the provided dataset.

3. Include Molecular Mechanisms: each hypothesis should include one or more molecular mechanisms involving genes/proteins present in the dataset.

4. Plausibility and Novelty: hypothesis must be plausible, grounded in known molecular functions and interactions, and propose mechanisms either not known or not known to be relevant to the experimental context.

5. Actionability: hypothesis should be actionable and can be validated with relatively low-cost experimental techniques.

6. Context focused: if a biological context is provided (such as a specific biological process or a list of genes/proteins of interest), use this information to focus your hypothesis.

The biological context is: {biological_context}

When presenting your results, adhere to the following guidelines:

Avoid including any code.
Do not describe the analytical steps you took.
Your output should consist solely of the hypothesis you propose. Each hypothesis should:
- Mention the relevant genes with accurate measurement values as shown in the dataset provided.
- Provide details about other prior knowledge that supports the hypothesis.
- Follow the 6 criteria described above.

To help you in your task, here is an example of what one of your hypotheses might look like:

The differential abundance of SP110 (0.842989962), PARP9 (0.971403167), and SAMD9L (1.172951102) in human cells infected with Dengue virus suggests a coordinated response mechanism that may contribute to the disease state. SP110 is a nuclear body protein involved in innate immunity and has been implicated in viral defense mechanisms. Its increased abundance suggests an enhanced cellular attempt to counteract Dengue virus infection through the modulation of immune responses. PARP9 and its interaction partner, DTX3L (also upregulated both at 24 and 48 hours and closely associated with PARP9 in immune responses), are involved in the ADP-ribosylation process, a post-translational modification known to play a role in the regulation of viral infections and inflammation. The increased abundance of PARP9 could indicate an upregulation of ADP-ribosylation processes aimed at restricting viral replication or modulating inflammatory responses. SAMD9L, another protein showing increased abundance, is known for its antiviral activity against various viruses, although its specific role in Dengue virus infection is not well characterized.
Given the roles of these proteins in immune responses and antiviral activities, we propose a novel molecular mechanism where SP110 enhances the cellular immune defense against Dengue virus, possibly through the upregulation of genes involved in innate immunity. Concurrently, PARP9, potentially in synergy with DTX3L, may be upregulated to modify key viral or host proteins through ADP-ribosylation, aiming to inhibit viral replication or assembly. SAMD9L's increased abundance suggests its potential role in directly targeting Dengue virus components or pathways critical for its life cycle, further contributing to the cellular antiviral state.
Considering that none of the genes discussed above is known to interact with Dengue viral proteins and that the dataset doesn't include any siRNA screening information, this hypothesis could be validated through relatively low-cost experimental techniques such as siRNA knockdown of SP110, PARP9, and SAMD9L in Dengue virus-infected cell cultures, followed by qPCR and Western blot analyses to assess the effect on viral replication, fitness and gene expression. Additionally, co-immunoprecipitation experiments could elucidate the interaction between PARP9 and DTX3L in the context of Dengue virus infection, providing insights into the functional consequences of their increased abundance and possibly identify unknown interactions with viral proteins. This approach not only offers a plausible explanation for the observed changes in protein abundance but also identifies potential targets for antiviral drug development.

Now generate your hypothesis.
'''

In [None]:
<task>
Based on the experiment description, analyze the dataset and develop a mechanistic, causal hypothesis for the processes that led to the observed data. The proteins/genes in the dataset are hypothesized to be known to interact; use your knowledge of these proteins and their interactions to develop chains of events that connect experimental perturbations with molecular and phenotypic observations. The hypothesis should make specific predictions that could be experimentally validated. 
</task>

<biological_context>
The hypothesis should support the higher-level goal of developing drug therapies for the disease.
</biological_context>

<detailed_instructions>
1. Review the data to ensure that you understand the meaning of each observation. In your hypothesis, be sure that you correctly use the data and do not hallucinate any observations.

2. Review your knowledge of the functions of these proteins and the known interactions between them.

3. Based on the data and your knowledge, construct the hypothesis that you think best meets these criteria:
- Plausible
- Non-trivial
- Supports the higher-level goal of drug development
- Novel
- Actionable: is cost-effective in both time and money

4. Your lab has limited resources in reagents, equipment, and your time. Remember, your time is precious. You must use it well if you are to get your doctorate. If you do not think that there is any hypothesis that is worth following up with a validation experiment, say so.

5. Output your hypothesis as follows:

## Knowledge Graph:
A concise knowledge graph, given as a list of the causal relationships between proteins, complexes, events, disease states, etc. "Therefore:" indicates hypothesized relationships. For example:

((A) binds (B)) inactivates (B)
(B) performs (phosphorylation C)
(phosphorylation C) increases (active C)
Therefore: (A) decreases (active C)


## Hypothesis:
short descriptive paragraph. 

</detailed_instructions>

<experiment_description>
{experiment_description}
</experiment_description>

<dataset>
{data}
</dataset>

### Create an analyst

In [6]:
from models.analyst import Analyst

# The analyst refers to its LLM by the id of the record created above.
llm_id = new_llm_llama.object_id

# Create the analyst: LLM id, agent context and prompt template are passed
# positionally, followed by the display name and description.
the_analyst = Analyst.create(
    db,
    llm_id,
    agent_context,
    prompt_template,
    name='Analyst lab rat',
    description="test run with llama-3.1-70b-versatile, run programaically",
)

print(the_analyst.object_id)
print(vars(the_analyst))

analyst_8de5dc43-a7e3-4c26-9f79-295620d59379
{'db': <app.sqlite_database.SqliteDatabase object at 0x155530275710>, 'llm_id': 'llm_94c7a932-4ef3-44b8-933b-fb1275357413', 'context': 'You are an insightful assistant to a molecular biologist', 'prompt_template': "\n{experiment_description}\n\nHere is the dataset: {data}\n\nYour task is to use the provided dataset to generate a hypothesis describing mechanisms that may contribute to the disease state and could potentially be targeted by drug therapies. Each hypothesis should meet the following criteria:\n\n1. Incorporate Data Columns: use the specific columns A, B, C, D, E and F from the provided dataset.\n\n2. Verify Data Points: ensure that each data point (protein abundance, mRNA expression, siRNA Z-score) used in your hypothesis is directly extracted from and cross-checked with the provided dataset.\n\n3. Include Molecular Mechanisms: each hypothesis should include one or more molecular mechanisms involving genes/proteins present in the

### Create a new analysis plan

In [33]:
# Create an analysis plan that asks the analyst for 3 hypotheses on the
# dengue dataset.
# NOTE(review): in the recorded output of this cell the `description` text
# shows up under `biological_context` while `description` is None — check
# how AnalysisPlan.create maps its arguments.
analysis_name = 'dengue full clean cols_A-F - CH test trial'
analyst_ids = [the_analyst.object_id]
analysis_plan = AnalysisPlan.create(
    db,
    analysis_name,
    analyst_ids,
    dengue_dataset_id,
    n_hypotheses_per_analyst=3,
    description='test analysis print prompt',
)
print(analysis_plan.object_id)
print(vars(analysis_plan))

analysis_plan_7ce12430-7de9-4616-acef-ff0d7e8f9a65
{'db': <app.sqlite_database.SqliteDatabase object at 0x1554fe176050>, 'name': 'dengue full clean cols_A-F - CH test trial', 'analyst_ids': ['analyst_8de5dc43-a7e3-4c26-9f79-295620d59379'], 'dataset_id': 'dataset_41d106e0-7268-4ac4-b933-b124e469df46', 'n_hypotheses_per_analyst': 3, 'biological_context': 'test analysis print prompt', 'description': None, 'object_id': 'analysis_plan_7ce12430-7de9-4616-acef-ff0d7e8f9a65', 'created': {'name': 'dengue full clean cols_A-F - CH test trial', 'analyst_ids': ['analyst_8de5dc43-a7e3-4c26-9f79-295620d59379'], 'dataset_id': 'dataset_41d106e0-7268-4ac4-b933-b124e469df46', 'n_hypotheses_per_analyst': 3, 'description': 'test analysis print prompt', 'created': '07.30.2024 12:41:32'}}


### Create new analysis run 

In [34]:
# Biological context supplied to this particular run of the plan.
run_biological_context = "Chromatin remodelling and transcriptional regulation (or Transcriptional regulation and chromatin remodelling)"

# Derive a run from the plan, then show its id and attributes.
analysis_run = analysis_plan.generate_analysis_run(biological_context=run_biological_context)
print(analysis_run.object_id)
print(vars(analysis_run))

analysis_run_3b72c88c-8908-4499-afb8-36f63371d10d
{'db': <app.sqlite_database.SqliteDatabase object at 0x1554fe176050>, 'analysis_plan_id': 'analysis_plan_7ce12430-7de9-4616-acef-ff0d7e8f9a65', 'analyst_ids': ['analyst_8de5dc43-a7e3-4c26-9f79-295620d59379'], 'dataset_id': 'dataset_41d106e0-7268-4ac4-b933-b124e469df46', 'n_hypotheses_per_analyst': 3, 'hypothesis_ids': [], 'biological_context': 'Chromatin remodelling and transcriptional regulation (or Transcriptional regulation and chromatin remodelling)', 'description': None, 'run_log': '', 'attempts': {'analyst_8de5dc43-a7e3-4c26-9f79-295620d59379': []}, 'status': 'pending', 'object_id': 'analysis_run_3b72c88c-8908-4499-afb8-36f63371d10d', 'created': {'analysis_plan_id': 'analysis_plan_7ce12430-7de9-4616-acef-ff0d7e8f9a65', 'analyst_ids': ['analyst_8de5dc43-a7e3-4c26-9f79-295620d59379'], 'dataset_id': 'dataset_41d106e0-7268-4ac4-b933-b124e469df46', 'n_hypotheses_per_analyst': 3, 'hypothesis_ids': [], 'biological_context': 'Chromatin re

In [35]:
# Execute the analysis run created above and print the runner's result.
run_id = analysis_run.object_id
runner = AnalysisRunner(db, run_id)
result = runner.run()
print(result)

3 new hypothesis generated by analyst analyst_8de5dc43-a7e3-4c26-9f79-295620d59379.
No more hypotheses needed.



### Print the hypotheses

In [36]:
# Re-read the run from the database and take the hypothesis ids from the
# first element of what `load` returns.
run_record = db.load(analysis_run.object_id)[0]
hypotheses = run_record['hypothesis_ids']
hypotheses

['hypothesis_e20b5338-4fd6-4cd2-bcf8-816526f33ef3',
 'hypothesis_71e56eff-0728-4b7b-a253-9d2ab8f798ad',
 'hypothesis_7f5e34c5-9554-4c93-8da2-4db6f6870538']

In [37]:
from models.hypothesis import Hypothesis
%reload_ext autoreload
%autoreload 2

# For every hypothesis id, load the hypothesis and print its object id,
# its text and its `full_prompt`, one per line and in that order.
for hypothesis_id in hypotheses:
    hypothesis = Hypothesis.load(db, hypothesis_id)
    for attribute in ("object_id", "hypothesis_text", "full_prompt"):
        print(getattr(hypothesis, attribute))

hypothesis_e20b5338-4fd6-4cd2-bcf8-816526f33ef3
Hypothesis 1:

The upregulation of genes involved in chromatin remodeling and transcriptional regulation, such as KDM5B (1.828207283), KDM4A (1.204569874), and SETD7 (1.095402015), in response to Dengue virus infection suggests a coordinated epigenetic response mechanism. KDM5B, a histone demethylase, and KDM4A, a histone demethylase and transcriptional regulator, may be upregulated to modify chromatin structure and facilitate the transcription of antiviral genes. SETD7, a histone methyltransferase, may also be involved in this process, potentially targeting specific histone marks to regulate gene expression. This epigenetic response may be crucial for the cellular defense against Dengue virus, as it could influence the expression of genes involved in innate immunity and antiviral activities.

Prior knowledge supports the role of histone modifications in regulating gene expression and immune responses. For example, histone H3K4me3, a mark