In [30]:
import json
import hashlib
from collections import OrderedDict
from dotenv import dotenv_values
from pyexeggutor import (
    format_header,
    open_file_writer,
)


In [38]:
class LLMAnnotator(object):
    def __init__(
        self,
        api_key:str,
        organization_key:str,
        project_key:str,
        description:str=None,
    ):
        from openai import OpenAI
        self.description = description
        self.client = OpenAI(
            api_key=api_key,
            organization=organization_key,
            project=project_key,
        )
        self.history = OrderedDict()
        self.lookup = OrderedDict()

    @staticmethod
    def md5hash(string: str) -> str:
        """Compute a reproducible MD5 hash of a string."""
        return hashlib.md5(string.encode("utf-8")).hexdigest()

    def query(self, prompt:str, model="o3-mini", store=True):
        
        """
        Submits a prompt to the OpenAI API using the provided client and returns the response content as a string.

        Parameters:
            client: OpenAI client instance
                The initialized OpenAI client object.
            model: str
                The model to use for the completion (e.g., "o3-mini").
            prompt: str
                The user prompt to send to the OpenAI API.
            store: bool
                Whether to store the completion request (default is False).

        Returns:
            str: The content of the response from the OpenAI API.
        """
        id_hash = self.md5hash(prompt)
        if id_hash in self.history:
            return self.history[id_hash]
        else:
            # Submit the prompt to OpenAI
            completion = self.client.chat.completions.create(
                model=model,
                store=store,
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )
            # Return the response content
            response = completion.choices[0].message
            content = response.content
            self.lookup[id_hash] = prompt
            self.history[id_hash] = content
            return content
        
    def to_json(self,filepath:str, sort_keys=True, indent=4):
        output = OrderedDict()
        for id_hash, prompt in self.lookup.items():
            content = self.history[id_hash]
            output[id_hash] = {"prompt":prompt, "content":content}
        with open_file_writer(filepath) as f:
            json.dump(output, f, sort_keys=sort_keys, indent=indent)
            
    # =======
    # Built-in
    # =======
    def __repr__(self):
        pad = 4
        header = format_header(f"{self.__class__.__name__}(Description:{self.description})", line_character="=")

        n = len(header.split("\n")[0])
        fields = [
            header,
            pad*" " + f"* number of queries: {len(self.history)}",
        ]

        return "\n".join(fields)
        
config = dotenv_values("/home/ec2-user/SageMaker/.openai")
llm = LLMAnnotator(
    api_key=config["api_key"],
    organization_key=config["organization_key"],
    project_key=config["project_key"],
)

proteins = [
 ('K02588', 'nitrogenase iron protein NifH'),
 ('K02586', 'nitrogenase molybdenum-iron protein alpha chain [EC:1.18.6.1]'),
 ('K02591', 'nitrogenase molybdenum-iron protein beta chain [EC:1.18.6.1]'),
 ('K00531', 'nitrogenase delta subunit [EC:1.18.6.1]'),
 ('K22896', 'vanadium-dependent nitrogenase alpha chain [EC:1.18.6.2]'),
 ('K22897', 'vanadium-dependent nitrogenase beta chain [EC:1.18.6.2]'),
 ('K22898', 'vanadium nitrogenase delta subunit [EC:1.18.6.2]'),
 ('K22899', 'vanadium nitrogenase iron protein')]
organisms = ["d__Bacteria; p__Cyanobacteriota; c__Cyanophyceae; o__Nostocales; f__Nostocaceae; g__Nostoc"]

llm.query(prompt=f"Can you descibe the metabolic and environmental context of a group of organisms" \
                 f"enriched in [{organisms}] genomes and [{proteins}] proteins?" \
                 f"Please make the response concise and to 100 words.",
          model="o3-mini",
)
# Nostoc, a filamentous cyanobacterium from the Nostocaceae family, thrives in diverse aquatic and terrestrial ecosystems, 
# often where nitrogen is limited. Its enrichment in nitrogenase proteins—including both molybdenum-iron and vanadium-dependent 
# enzymes—indicates a robust capacity for atmospheric nitrogen fixation. This metabolic trait is crucial in converting inert N₂ 
# into biologically available forms, supporting both self-growth and symbiotic relationships with plants and other organisms. 
# By performing nitrogen fixation under microaerobic or anoxic conditions within its usually oxygen-rich habitat, Nostoc plays a 
# vital role in ecosystem nutrient cycling and soil fertility, contributing significantly to primary productivity in nutrient-poor 
# environments.
llm.to_json("llm_annotations.json")


In [37]:
llm.lookup

OrderedDict([('80c36efbc158147ed1c2c6a92492afa2',
              "Can you descibe the metabolic and environmental context of a group of organismsenriched in [['d__Bacteria; p__Cyanobacteriota; c__Cyanophyceae; o__Nostocales; f__Nostocaceae; g__Nostoc']] genomes and [[('K02588', 'nitrogenase iron protein NifH'), ('K02586', 'nitrogenase molybdenum-iron protein alpha chain [EC:1.18.6.1]'), ('K02591', 'nitrogenase molybdenum-iron protein beta chain [EC:1.18.6.1]'), ('K00531', 'nitrogenase delta subunit [EC:1.18.6.1]'), ('K22896', 'vanadium-dependent nitrogenase alpha chain [EC:1.18.6.2]'), ('K22897', 'vanadium-dependent nitrogenase beta chain [EC:1.18.6.2]'), ('K22898', 'vanadium nitrogenase delta subunit [EC:1.18.6.2]'), ('K22899', 'vanadium nitrogenase iron protein')]] proteins?Please make the response concise and to 100 words.")])

In [39]:
llm.history

OrderedDict([('80c36efbc158147ed1c2c6a92492afa2',
              'Nostoc, a genus within the Nostocaceae, thrives in nutrient-limited freshwater, terrestrial, or symbiotic environments. These Cyanobacteria perform oxygenic photosynthesis and are notable for nitrogen fixation. Their enriched nitrogenase genes include both molybdenum- and vanadium-dependent enzymes, equipping them to convert inert atmospheric N₂ into bioavailable ammonia, especially under low-nitrogen conditions. This metabolic versatility supports growth in diverse ecosystems, contributing to nitrogen cycling and soil fertility. Their ability to balance photosynthesis and nitrogen fixation—often via specialized heterocyst cells—reflects adaptation to fluctuating environmental oxygen and nutrient levels.')])

In [28]:
llm.to_json("test.json")

In [33]:
llm

LLMAnnotator(Description:None)
    * number of queries: 1

In [None]:
# # OpenAI
# # !pip install dotenv
# # from dotenv import load_dotenv
# from openai import OpenAI

# openai_api_key = open("/home/ec2-user/SageMaker/.openai").read().strip().split("=", maxsplit=1)[-1]

# # client = OpenAI(
# #   api_key=openai_api_key,
# # )
# client = OpenAI(
#     api_key=openai_api_key,
#     organization='org-Ez60DuNFqA22CtJjM2PpXQxa',
#     project='proj_qCFyUKcDinIaUNIwuHRtNxpL',
# )

# # completion = client.chat.completions.create(
# #   model="gpt-4o-mini",
# #   store=True,
# #   messages=[
# #     {"role": "user", "content": "write a haiku about ai"}
# #   ]
# # )
# # print(completion.choices[0].message)

# def submit_openai_prompt(client, prompt, model="gpt-4o-mini", store=False):
#     """
#     Submits a prompt to the OpenAI API using the provided client and returns the response content as a string.
    
#     Parameters:
#         client: OpenAI client instance
#             The initialized OpenAI client object.
#         model: str
#             The model to use for the completion (e.g., "gpt-4o-mini").
#         prompt: str
#             The user prompt to send to the OpenAI API.
#         store: bool
#             Whether to store the completion request (default is False).
    
#     Returns:
#         str: The content of the response from the OpenAI API.
#     """
#     try:
#         # Submit the prompt to OpenAI
#         completion = client.chat.completions.create(
#             model=model,
#             store=store,
#             messages=[
#                 {"role": "user", "content": prompt}
#             ]
#         )
#         # Return the response content
#         return completion.choices[0].message
#     except Exception as e:
#         # Handle exceptions and provide meaningful error messages
#         print(f"An error occurred: {e}")
#         return None
