In [49]:
from io import StringIO
import json
import pandas as pd
from pyexeggutor import read_pickle
from dotenv import dotenv_values
from nichespace.llm import LLMAnnotator

In [60]:
# Read the key/value settings stored in the dotenv-style ".openai" file.
config = dotenv_values("/home/ec2-user/SageMaker/.openai")

# Build the annotator client: every entry of `config` is forwarded as a keyword
# argument, together with a description label for this client.
llm = LLMAnnotator(
    **config,
    description="EmbeddingAnnotator Contextualizer",
)

# Bare last expression so the notebook renders the client's repr.
llm

LLMAnnotator(Description:EmbeddingAnnotator Contextualizer)
    * number of queries: 0

In [56]:
# KEGG ortholog metadata table (tab-separated), indexed by its first column.
df_meta_keggortholog = pd.read_csv(
    "/home/ec2-user/SageMaker/s3/newatlantis-raw-veba-db-prod/VEBA/VDB_v8.1/Annotate/KOfam/kegg-ortholog_metadata.tsv",
    sep="\t",
    index_col=0,
)

# Lookup from the table's index labels to the text in its "definition" column.
keggortholog_to_name = df_meta_keggortholog["definition"]

# Annotator object restored from disk. The path is a constant string, so it is
# written as a plain literal (the previous f-string prefix had no placeholders).
# NOTE(review): read_pickle presumably unpickles this file; unpickling can run
# arbitrary code, so only load pickles that come from a trusted source.
annotator = read_pickle("../data/training/completeness_gte90.contamination_lt5/NAL-GDB_MNS_v2.SLC-MFC.EmbeddingAnnotator.pkl")


In [61]:
# Registry of prompt templates, keyed by a descriptive template name.
prompts = {}

# Template for annotating one embedding dimension from KEGG ortholog weights.
# It contains exactly one "%s" placeholder, filled via the `%` operator with the
# string form of the feature -> weight dictionary. Because `%` formatting is
# used, any literal percent sign added to this text must be written as "%%".
# The JSON skeleton shown to the model keeps a comma between every entry so the
# example itself matches the "no additional parsing" requirement.
prompts["annotate_embeddings[kegg_orthogs=True,genome_taxonomy=False]"] = """
You are an AI assistant tasked with analyzing a dictionary of KEGG orthologs (features) and their corresponding weights. 
These weights indicate the predictive capacity of each ortholog for a specific dimension in a diffusion map embedding. 
Your goal is to assign high-level metabolic or ecological annotations to this embedding dimension, providing insight into
the metabolic niche of organisms with high magnitude along this dimension.

Here is the dictionary of features (KEGG orthologs) and their weights:
<feature_weight_dict>
%s
</feature_weight_dict>

To complete this task, follow these steps:
1. Examine the features and weights in the dictionary. Pay attention to the magnitude and sign of the weights. 
Higher magnitude will mean these proteins will be the most influential in predicting the diffusion coordinate
while the sign indicates a separation along some data-driven axis likely representing complex metabolic patterns.
2. Use your knowledge of KEGG pathways and biological processes to understand the function of these orthologs 
and their roles in metabolic pathways.
3. Contextualize your research by searching for common themes or related processes among the weighted features.
4. Based on your analysis, formulate a high-level metabolic or ecological annotation for this embedding dimension. 
Consider what type of organisms or metabolic processes might be associated with high magnitude along this dimension.
Provide your answer in json format that can be read into a Python dictionary without additional parsing:
{
"summary":[Here, provide a concise analysis of your findings and how the features relate to each other metabolically and ecologically.],
"metabolic_context":[Provide a concise, high-level annotation of the metabolic context for the embedding dimension based on your analysis.],
"ecological_context":[Provide a concise, high-level annotation of the ecological context for the embedding dimension based on your analysis.],
"justification":[Explain your reasoning for the annotation, citing specific features and their weights from the dictionary. Discuss any potential limitations or uncertainties in your interpretation.]
}
Remember to think critically about the biological significance of these features and their weights. Consider how they might work together in metabolic pathways or ecological processes. 
If you're unsure about certain aspects, acknowledge this uncertainty in your justification.
Your goal is to provide a well-reasoned, biologically meaningful annotation that could help researchers understand the metabolic or ecological significance of this embedding dimension.
"""

In [57]:
# Pull the weight vector stored under "n35" (presumably one embedding dimension
# — confirm against the annotator) and order it from largest to smallest value.
weights_n35 = annotator.feature_weights_["n35"]
info = weights_n35.sort_values(ascending=False)

# Relabel each entry as an (ortholog id, definition) pair so the identifier and
# its looked-up definition appear side by side.
info.index = info.index.map(
    lambda ortholog_id: (ortholog_id, keggortholog_to_name[ortholog_id])
)
info

K27188  [DsrC]-trisulfide reductase subunit K [EC:1.8.5.10]                                                                                    0.208530
K01499  methenyltetrahydromethanopterin cyclohydrolase [EC:3.5.4.27]                                                                           0.124581
K01690  phosphogluconate dehydratase [EC:4.2.1.12]                                                                                             0.068363
K08093  3-hexulose-6-phosphate synthase [EC:4.1.2.43]                                                                                          0.067118
K02502  ATP phosphoribosyltransferase regulatory subunit                                                                                       0.044294
K17227  sulfur-oxidizing protein SoxZ                                                                                                          0.041635
K20881  GMP/IMP 5'-nucleotidase [EC:3.1.3.-]                                            

In [62]:
# Fill the template's single "%s" slot with the string form of the
# {(ortholog id, definition): weight} dictionary, then send it to the client.
template = prompts["annotate_embeddings[kegg_orthogs=True,genome_taxonomy=False]"]
feature_weights_text = str(info.to_dict())
prompt = template % (feature_weights_text,)
response = llm.query(prompt)

In [65]:
# Parse the response text as JSON and display the resulting Python object.
# json.loads reads the string directly, so no StringIO wrapper is needed.
json.loads(response)

{'summary': 'The weighted features point to a metabolic dimension that integrates elements of sulfur compound transformations with central carbon and nitrogen metabolism. High-weight proteins such as [DsrC]-trisulfide reductase (K27188) and the sulfur-oxidizing SoxZ (K17227) indicate a significant role for sulfur redox processes, while enzymes like methenyltetrahydromethanopterin cyclohydrolase (K01499) and those involved in formaldehyde fixation (e.g., 3-hexulose-6-phosphate synthase, K08093) hint at specialized carbon assimilation pathways operative in anaerobic or microaerophilic conditions.',
 'metabolic_context': 'Dimension associated with sulfur cycling coupled to alternative carbon assimilation pathways.',
 'ecological_context': 'Likely reflects microbes adapted to redox‐gradient environments such as anoxic or microoxic sediments, hydrothermal systems, or niches where sulfur compounds serve as key energy sources.',
 'justification': 'The highest weighted feature, [DsrC]-trisulfi

In [None]:
# NOTE(review): this un-run cell (In [None]) contains only commented-out code. It
# appears to be an alternative draft of the annotation prompt that uses a `{}`
# placeholder and <analysis>/<annotation>/<justification> output tags — confirm
# whether it is still needed before relying on it.
# prompts = dict()
# prompts["annotate_embeddings[kegg_orthogs=True,genome_taxonomy=False]"] = """
# You are an AI assistant tasked with analyzing a dictionary of KEGG orthologs (features) and their corresponding weights. 
# These weights indicate the predictive capacity of each ortholog for a specific dimension in a diffusion map embedding. 
# Your goal is to assign high-level metabolic or ecological annotations to this embedding dimension, providing insight into
# the metabolic niche of organisms with high magnitude along this dimension.

# Here is the dictionary of features (KEGG orthologs) and their weights:
# <feature_weight_dict>
# {}
# </feature_weight_dict>

# To complete this task, follow these steps:
# 1. Examine the features and weights in the dictionary. Pay attention to the magnitude and sign of the weights.
# 2. Identify the top 5-10 features with the highest absolute weight values. These are likely the most influential in predicting the diffusion coordinate.
# 3. Research these top KEGG orthologs. Use your knowledge of KEGG pathways and biological processes to understand the function of these orthologs and their roles in metabolic pathways.
# 4. Synthesize the information from your research. Look for common themes or related processes among the top features.
# 5. Based on your analysis, formulate a high-level metabolic or ecological annotation for this embedding dimension. Consider what type of organisms or metabolic processes might be associated with high magnitude along this dimension.
# Provide your answer in the following format:
# <analysis>
# [Here, provide a detailed analysis of your findings, including the top features you identified, their functions, and how they relate to each other.]
# </analysis>
# <annotation>
# [Provide a concise, high-level annotation for the embedding dimension based on your analysis.]
# </annotation>
# <justification>
# [Explain your reasoning for the annotation, citing specific features and their weights from the dictionary. Discuss any potential limitations or uncertainties in your interpretation.]
# </justification>
# Remember to think critically about the biological significance of these features and their weights. Consider how they might work together in metabolic pathways or ecological processes. If you're unsure about certain aspects, acknowledge this uncertainty in your justification.
# Your goal is to provide a well-reasoned, biologically meaningful annotation that could help researchers understand the metabolic or ecological significance of this embedding dimension.
# """