In [15]:
%cd ..

/Users/admin/repos/sbol_llm


In [21]:
from openai import OpenAI
OPEN_AI_API_KEY = ""
client = OpenAI(api_key=OPEN_AI_API_KEY)

# Upload knowledge base to OpenAI

In [22]:
assistant_knowledge_base_files = {'docs/fair_publication.pdf': '', 
                                  'docs/miriam_publication.pdf': ''}

for file_path in assistant_knowledge_base_files.keys():
    with open(file_path, 'rb') as file:
        response = client.files.create(file=file, purpose='assistants')
        assistant_knowledge_base_files[file_path] = response.id
assistant_knowledge_base_files

# Define System Prompt

In [None]:
SYSTEM_PROMPT = f"""
    You are a genetic circuit description generator. You will be given the components of a genetic circuit, with annotations for the parts which may include, the specific gene, the part type according to standard ontology, links for annotations, short descriptions of the parts. Your job is to describe the circuit in plain English. 
    
The description you provide must meet the following criteria:
    
 1. The description must be in plain English.
 2. The description must meet FAIR data principles.
 3. The description must provide the minimum information requested in the annotation of biochemical models (MIRIAM).
4. The descriptions must be terse, ensuring that every work is informative.
    
Details for FAIR data principles and MIRIAM compliance can be obtained from the provided files. Here is a summary:

MIRIAM guidelines: The Minimum Information Requested in the Annotation of Models (MIRIAM) defines standards for ensuring the accuracy, usability, and reusability of biochemical models. The key requirements are:

Reference description linkage: Each part, component and the interactions between parts and components must be explicitly associated with a unique reference description that explains the biological context, structure, and expected outcomes.
Simulation reproducibility: The description must include all relevant quantitative attributes (e.g., initial conditions, parameters, and kinetic expressions) required to simulate the circuit's behavior.
Annotation: The model must include clear annotations linking its components to established bioinformatics resources (e.g., Gene Ontology, KEGG, UniProt).

These guidelines ensure that descriptions are accurate, well-documented, and allow for reproduce-ability.

FAIR data principles: The FAIR data principles define standards for ensuring that data are Findable, Accessible, Interoperable, and Reusable. The key requirements are:

Findable: Data and metadata must have a globally unique and persistent identifier (e.g., DOI, accession number).
Metadata must clearly and thoroughly describe the data and include indexed terms to facilitate searchability.

Accessible: Data and metadata must be retrievable by a standard protocol (e.g., HTTP, FTP) that is open, free, and universally implementable.
Access to metadata should persist even if the data are no longer available.

Interoperable: Data and metadata must use a formal, accessible, and widely applicable language for knowledge representation (e.g., RDF, XML).
They should include references to other relevant data and use controlled vocabularies, ontologies, or thesauri to enable integration with other datasets.

Reusable: Data and metadata should include detailed provenance to trace their origins and usage.
They must comply with domain-relevant community standards and have clear licensing for reuse.



  Example inputs:

  INPUT EXAMPLE #1

    User description: "This part contains a reporter gene BBa_J04450, combined with OriTRP4. Used to test plasmid mobility...."
    
    - PDF of biochemical model, parameters, characteristics etc (optional)
    - Circuit/sequence

    Parts:
        BBa_J04450
            SO:0000804 - A region that is engineered.
            https://synbiohub.org/public/igem/BBa_J04450/1
            Gene: Highly engineered mutant of red fluorescent protein from Discosoma striata (coral)

        OriTRP4
            SO_0000724 - A region of a DNA molecule where transfer is initiated during the process of conjugation or mobilization.
            https://synbiohub.org/public/igem/BBa_K1439000/1


OUTPUT  EXAMPLE #1:

This circuit contains the reporter gene, RFP (https://synbiohub.org/public/igem/BBa_J04450/1) combined with the origin of transfer for the RP4-plasmid nic region, OriTRP4 (https://synbiohub.org/public/igem/BBa_K1439000/1). 
OriTRP4 was derived from the RP4 plasmid (https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=2503).


INPUT EXAMPLE #2:    

User Description: RFP and RBS behind Lamda Prm Promoter

Parts:
    BBa_S03873
        http://purl.obolibrary.org/obo/SO:0000804
        Engineered Region
        https://synbiohub.org/public/igem/BBa_S03873/1

        BBa_I12006
            Name: Prm+
            Promoter
            https://www.ebi.ac.uk/ols4/ontologies/so/classes?obo_id=SO%3A0000167
            https://synbiohub.org/public/igem/BBa_I12006/1

        BBa_B0034
            https://www.ebi.ac.uk/ols4/ontologies/so/classes?obo_id=SO%3A0000139
            Ribosomal Binding Site
            https://synbiohub.org/public/igem/BBa_B0034/1
        
        BBa_E1010
            Name: mRFP1
            CDS
            https://www.ebi.ac.uk/ols4/ontologies/so/classes?obo_id=SO%3A0000316
            Highly engineered mutant of red fluorescent protein from Discosoma striata (coral)
            https://synbiohub.org/public/igem/BBa_E1010/1

OUTPUT:

This circuit contains a Red fluorescent Protein (https://synbiohub.org/public/igem/BBa_E1010/1) and Ribosome Binding Site (https://synbiohub.org/public/igem/BBa_B0034/1) behind the modified Lambda Prm Promoter (https://synbiohub.org/public/igem/BBa_I12006/1).
This modified Lambda Promoter is induced by the lamda cl repressor (https://parts.igem.org/Part:BBa_C0051) instead of repressed by it.
When induced, the RFP is expressed. This allows the inducible Lamda Prm promoter to control the expression of an RFP.

Respond only with the description and no preamble or explanation.
"""

# Create agent

In [None]:

my_assistant = client.beta.assistants.create(
    instructions=SYSTEM_PROMPT,
    name="Circuit Describer",
    tools=[{"type": "code_interpreter"}],
    model="gpt-4o",
)
print(my_assistant)

# Example function use (search NCBI for gene information)

In [39]:
import requests
from bs4 import BeautifulSoup
import json
import time
from requests import RequestException

def search_ncbi_gene(gene_name):
    """
    Search NCBI Gene database for a given gene name.
    """
    base_url = "https://www.ncbi.nlm.nih.gov"
    search_url = f"{base_url}/gene/?term={gene_name}"
    response = requests.get(search_url)

    if response.status_code == 200:
        return parse_ncbi_gene_html(response.text)
    else:
        return {"error": f"Failed to access NCBI Gene. Status code: {response.status_code}"}


def parse_ncbi_gene_html(html_content):
    """
    Parse the NCBI Gene HTML content to extract gene-related information.
    """

    soup = BeautifulSoup(html_content, "html.parser")
    results = []

    table = soup.find("table", class_="jig-ncbigrid gene-tabular-rprt")
    if not table:
        return {"error": "Results table not found in the HTML."}

    rows = table.find_all("tr", class_="rprt")
    for row in rows:
        gene_data = {}
        
        name_cell = row.find("td", class_="gene-name-id")
        if name_cell:
            name_link = name_cell.find("a", href=True)
            if name_link:
                gene_data["name"] = name_link.get_text(strip=True)
                gene_data["link"] = f"https://www.ncbi.nlm.nih.gov{name_link['href']}"
            gene_id = name_cell.find("span", class_="gene-id")
            if gene_id:
                gene_data["id"] = gene_id.get_text(strip=True).replace("ID: ", "")
        
        # Extract the description
        description_cell = row.find_all("td")[1] if len(row.find_all("td")) > 1 else None
        if description_cell:
            gene_data["description"] = description_cell.get_text(strip=True)
        
        # Extract the location
        location_cell = row.find_all("td")[2] if len(row.find_all("td")) > 2 else None
        if location_cell:
            gene_data["location"] = location_cell.get_text(strip=True)
        
        # Extract aliases
        aliases_cell = row.find_all("td")[3] if len(row.find_all("td")) > 3 else None
        if aliases_cell:
            gene_data["aliases"] = aliases_cell.get_text(strip=True)
        
        if gene_data:
            results.append(gene_data)

    if results:
        return results
    else:
        return {"error": "No gene data found in the HTML."}

# Example Usage
gene_name = "lacI"
gene_results = search_ncbi_gene(gene_name)

if "error" in gene_results:
    print(gene_results["error"])
else:
    for gene in gene_results:
        print(f"Gene Name: {gene['name']}")
        print(f"Gene ID: {gene.get('id', 'N/A')}")
        print(f"Description: {gene.get('description', 'No description available')}")
        print(f"Location: {gene.get('location', 'No location available')}")
        print(f"Aliases: {gene.get('aliases', 'No aliases available')}")
        print(f"Link: {gene['link']}\n")

Gene Name: lacI
Gene ID: 945007
Description: DNA-binding transcriptional repressorLacI[Escherichia coli str. K-12 substr. MG1655]
Location: NC_000913.3 (366428..367510, complement)
Aliases: b0345, ECK0342
Link: https://www.ncbi.nlm.nih.gov/gene/945007

Gene Name: lacI
Gene ID: 75170321
Description: DNA-binding transcriptional repressorLacI[Shigella dysenteriae]
Location: 
Aliases: HUZ68_RS05005, HUZ68_04945
Link: https://www.ncbi.nlm.nih.gov/gene/75170321

Gene Name: lacI
Gene ID: 75202508
Description: DNA-binding transcriptional repressorLacI[Shigella boydii]
Location: 
Aliases: CEH00_RS02965, CEH00_02975
Link: https://www.ncbi.nlm.nih.gov/gene/75202508

Gene Name: lacI
Gene ID: 69758136
Description: DNA-binding transcriptional repressorLacI[Klebsiella quasivariicola]
Location: 
Aliases: LGM30_RS27025, LGM30_27075
Link: https://www.ncbi.nlm.nih.gov/gene/69758136

Gene Name: lacI
Gene ID: 60785781
Description: DNA-binding transcriptional repressorLacI[Aeromonas allosaccharophila]
Locat

In [None]:
# Configure the assistant to use the search_ncbi_gene function
tools = [
        {
            "type": "function",
            "function": {
                "name": "search_ncbi_gene",
                "description": "Search for gene information and links",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "query": {
                            "type": "string",
                            "description": "gene name or symbol",
                        },
                    },
                    "required": ["query"],
                },
            },
        }
    ]

my_assistant = client.beta.assistants.update(
    assistant_id=my_assistant.id,
    tools=tools,
)