# Sparks
A. Ghafarollahi, M.J. Buehler*

Massachusetts Institute of Technology

*mbuehler@MIT.EDU

# API Keys (OpenAI and Chroma)

In [1]:
import os
import json

# Load the API credentials from a local JSON file so they never have to be
# hard-coded in the notebook. The file must contain the keys
# "your_openai_api_key" and "your_chroma_key"; a missing key raises KeyError.
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's locale default.
with open('config.json', encoding='utf-8') as f:
    config = json.load(f)

# Export both credentials as environment variables for the current process.
your_openai_api_key = config["your_openai_api_key"]
os.environ['OPENAI_API_KEY'] = your_openai_api_key

chroma_key = config["your_chroma_key"]
os.environ['CHROMA_KEY'] = chroma_key

# Define Query, Tools, Constraints

In [2]:
import json
import sys
from pathlib import Path
import shutil

####################################################################################
# Project directory where all files will be stored. A later cell resolves it
# to an absolute path (exported as PROJECT_DIR), creates it if needed, and
# copies the Sparks helper files into it.
base_dir = 'protein_principle' 
####################################################################################
# User query: the research prompt handed to the idea-generation, testing,
# refinement and documentation steps in the final cell.
query = 'Discover a novel principle in short protein peptides.'
####################################################################################
# Build all tools
from Sparks.Sparks_utils import Toolset, Tool, make_parameters

# Registry that collects every tool description; each tool is registered
# right after it is built, in the order tool_1 .. tool_6.
toolset = Toolset()

# --- Tool 1: maximum unfolding force from a sequence ------------------------
tool_1 = Tool(
    name="calculate_force_from_seq",
    description=(
        "This function takes a protein sequence in one-letter amino acid code, "
        "performs the required analysis or simulation, and returns the "
        "non-dimensional protein force (defined as the maximum force along "
        "the protein unfolding force-extension curve) as a float. "
        "The function might lead to None results. "
        "So, the output should always be checked."
    ),
    parameters=make_parameters(
        sequence=(
            "string",
            "Protein sequence in standard one-letter amino acid codes "
            "(e.g., 'MKTFFV...').",
        ),
    ),
)
toolset.add(tool_1)

# --- Tool 2: unfolding energy from a sequence -------------------------------
tool_2 = Tool(
    name="calculate_energy_from_seq",
    description=(
        "This function takes a protein sequence in one-letter amino acid code, "
        "performs the required analysis or simulation, and returns the "
        "non-dimensional protein energy (defined as the area under "
        "the protein unfolding force-extension curve) as a float. "
        "The function might lead to None results. "
        "So, the output should always be checked."
    ),
    parameters=make_parameters(
        sequence=(
            "string",
            "Protein sequence in standard one-letter amino acid codes "
            "(e.g., 'MKTFFV...').",
        ),
    ),
)
toolset.add(tool_2)

# --- Tool 3: random sequence of a given length ------------------------------
tool_3 = Tool(
    name="design_protein_from_length",
    description=(
        "This function creates random protein sequences of a given length. "
        "It returns the amino acid sequence (string format) of the generated protein."
    ),
    parameters=make_parameters(
        length=("integer", "Desired number of amino acids in the protein sequence"),
    ),
)
toolset.add(tool_3)

# --- Tool 4: random sequence biased toward a CATH class ---------------------
tool_4 = Tool(
    name="design_protein_from_CATH",
    description=(
        "This function creates random alpha-helix, beta-sheet, or mixed "
        "protein sequences of a given length. "
        "It returns the amino acid sequence (string format) of the generated protein. "
        "The function can be used to create proteins biased toward "
        "alpha-helix (CATH=1), beta-sheet (CATH=2), or mixed structures (CATH=3). "
        "However, the output may not fully reflect the desired structural class, "
        "especially for short sequences. "
        "To ensure the proteins follow the desired structure, "
        "the secondary structure of folded sequences should be analyzed."
    ),
    parameters=make_parameters(
        length=("integer", "Desired number of amino acids in the protein sequence"),
        cath=(
            "string",
            'Structure bias: "1" for alpha-helix, "2" for beta-sheet, "3" for mixed',
        ),
    ),
)
toolset.add(tool_4)

# --- Tool 5: secondary-structure content of a PDB file ----------------------
tool_5 = Tool(
    name="analyze_protein_structure",
    description=(
        "This function returns the secondary structure percentage content "
        "of a protein from a PDB file. "
        "The output is a dictionary with 8-class DSSP secondary structure "
        "elements as keys: "
        '{"H", "B", "E", "G", "I", "T", "S", "P", "-"}. '
        "Each key maps to the corresponding percentage value (float)."
    ),
    parameters=make_parameters(
        protein_struct=("string", "Protein structure file in PDB format"),
    ),
)
toolset.add(tool_5)

# --- Tool 6: fold a sequence into a PDB structure ---------------------------
tool_6 = Tool(
    name="fold_protein",
    description=(
        "This function takes a protein amino acid sequence, folds it and "
        "creates the protein PDB file. "
        "Returns the PDB file name (string format) of the resulting 3D folded structure. "
        "The sequence should be in FASTA format and contain only valid "
        "amino acid characters (no special symbols like '-')."
    ),
    parameters=make_parameters(
        sequence=(
            "string",
            "Protein sequence in FASTA format (only amino acid characters, no special symbols).",
        ),
        name=("string", "Name to be used for saving the folded protein structure."),
    ),
)
toolset.add(tool_6)

# Serialized form of the whole toolset, passed to the pipeline steps below.
tools = toolset.as_json()
####################################################################################
# Computational constraints
# The constraint text (leading newline, then one numbered rule per line) is
# JSON-encoded in a single step, so `constraints` ends up as a quoted JSON
# string literal with escaped newlines.
constraints = json.dumps(
    "\n"
    "1- During the research, assume protein lengths to be equal or between 30 and 80.\n"
    "2- Consider protein sequences in intervals of 10.\n"
    "3- For computational efficiency, restrict the total number of samples per protein length to 10.\n",
    indent=4,
)
####################################################################################
# Total number of follow-up rounds; passed to refine_approach() in the final
# cell.
total_followup_rounds = 3
####################################################################################

In [None]:
# Publish the absolute project directory through the environment.
# NOTE(review): presumably read by the Sparks modules imported below — confirm.
os.environ["PROJECT_DIR"] = str(Path(base_dir).resolve())

# Create the project directory. Only "already exists" is an expected,
# harmless outcome; any other failure (permissions, missing parent, ...) is
# allowed to propagate instead of being misreported as "Directory exists.".
try:
    os.mkdir(base_dir)
except FileExistsError:
    print('Directory exists.')

# Best-effort copy of the helper modules into the project directory. A
# missing or unreadable helper does not abort the setup, but the failure is
# reported rather than silently discarded.
for _helper_name in ('functions.py', 'MD_protein.py', 'utils_ForceGPT.py'):
    try:
        shutil.copy(f'./Sparks/{_helper_name}', base_dir)
    except OSError as exc:
        print(f'Warning: could not copy ./Sparks/{_helper_name} to {base_dir}: {exc}')

# Copy the code templates; dirs_exist_ok lets the cell be re-run on an
# existing project directory.
shutil.copytree('./Sparks/0_codes', Path(base_dir) / '0_codes', dirs_exist_ok=True)

# Make the Sparks package directory importable (added only once, so re-running
# the cell does not grow sys.path) and pull the pipeline functions into the
# notebook namespace.
if './Sparks' not in sys.path:
    sys.path.append('./Sparks')
from Sparks_functions import *

In [None]:
# End-to-end driver: runs the pipeline steps in a fixed order using the query,
# tool descriptions and constraints defined in the cells above. All functions
# come from the star-import of Sparks_functions.
# NOTE(review): the steps appear to exchange intermediate results outside of
# their arguments (several take no arguments at all), presumably via files in
# the project directory — confirm before reordering or re-running single steps.

# Idea generation
generate_idea(query, tools, constraints)

# Idea testing
test_idea(query, tools, constraints)
initial_implementation()

# Refinement
# The return value is kept because the plotting and methods steps need it.
last_exp_round = refine_approach(query, constraints, total_followup_rounds)

# Create plots
create_reason_plots(query, last_exp_round)

# Documentation
# One call per section, followed by create_pdf().
create_introduction(query)
create_methods(query, tools, constraints, last_exp_round)
create_results()
create_conclusion(query)
create_outlook(query, tools)
create_pdf()