# 基于LLM bagging + 可解释
> zhenkun.shi@tib.cas.cn   
> 2025-01-14 

## 1. 导入必要的包

In [1]:
import sys,os
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../../')
from config import conf as cfg
import pandas as pd
import json
import plotly.graph_objects as go
from tools import btools
from tqdm import tqdm
import rxnrecer as production

from modules.llm import Chat as llmchat
import tools.bioFunctionLib as bfl
from IPython.display import HTML
from pandarallel import pandarallel # 导入pandaralle

pandarallel.initialize(progress_bar=False)
%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 192 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2. 加载数据

In [2]:
ds_test = pd.read_feather(cfg.FILE_DS_TEST)
ds_test.head(3)

Unnamed: 0,uniprot_id,seq,reaction_id,ec_number,functionCounts,ec_specific_level,isenzyme,label
0,A9JLI2,MLGLQIFTLLSIPTLLYTYEIEPLERTSTPPEKEFGYWCTYANHCR...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,A9JLI3,MRFFSYLGLLLAGLTSLQGFSTDNLLEEELRYWCQYVKNCRFCWTC...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,A9JLI5,MLVIFLGILGLLANQVLGLPTQAEGHLRSTDNPPQEELGYWCTYME...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## 3. RXNRECer EXP

In [35]:
expdata = ds_test.head(100)
res_ecrecer = production.step_by_step_prediction(input_data=expdata[['uniprot_id', 'seq']], Ensemble=True, batch_size=100)

Step 1: Preparing input data
Step 2: Loading predictive model
Use device: cuda
Step 3: Running prediction on 100 proteins


Predicting reactions: 100%|██████████| 50/50 [00:03<00:00, 13.79it/s]


Transferred model to GPU


100%|██████████| 100/100 [00:03<00:00, 27.43it/s]
Processing Sequences: 100%|██████████| 100/100 [00:01<00:00, 68.00it/s]
Processing Sequences: 100%|██████████| 100/100 [00:01<00:00, 67.94it/s]
100%|██████████| 1/1 [02:00<00:00, 120.85s/it]


In [36]:
res_ecrecer

Unnamed: 0,input_id,RXNRECer,RXNRECer_with_prob
0,A9JLI2,-,{'-': 0.999997}
1,A9JLI3,-,{'-': 0.999998}
2,A9JLI5,-,{'-': 1.0}
3,A9JLI7,-,{'-': 0.999997}
4,B5KVH4,-,{'-': 0.999998}
...,...,...,...
96,Q572D3,-,{'-': 0.999996}
97,D0NPN8,-,{'-': 0.999996}
98,E2DWQ7,-,{'-': 0.999996}
99,P0DOO9,-,{'-': 1.0}


In [39]:
res_ecrecer = res_ecrecer.rename(columns={'input_id':'uniprot_id'})
res_ecrecer = res_ecrecer.merge(expdata[['uniprot_id', 'seq', 'reaction_id']], on='uniprot_id', how='left').rename(columns={'reaction_id':'rxn_groundtruth'})
res_ecrecer

Unnamed: 0,uniprot_id,RXNRECer,RXNRECer_with_prob,seq,rxn_groundtruth
0,A9JLI2,-,{'-': 0.999997},MLGLQIFTLLSIPTLLYTYEIEPLERTSTPPEKEFGYWCTYANHCR...,-
1,A9JLI3,-,{'-': 0.999998},MRFFSYLGLLLAGLTSLQGFSTDNLLEEELRYWCQYVKNCRFCWTC...,-
2,A9JLI5,-,{'-': 1.0},MLVIFLGILGLLANQVLGLPTQAEGHLRSTDNPPQEELGYWCTYME...,-
3,A9JLI7,-,{'-': 0.999997},MLVIILGVIGLLANQVLGLPTQAGGHLRSTDNPPQEELGYWCTYME...,-
4,B5KVH4,-,{'-': 0.999998},MAKPILLSIYLCLIIVALFNGCLAQSGGRQQHKFGQCQLNRLDALE...,-
...,...,...,...,...,...
96,Q572D3,-,{'-': 0.999996},MRLAIMLSATAVAINFATSSAIDQTKVLVYGTPAHYIHDSAGRRLL...,-
97,D0NPN8,-,{'-': 0.999996},MRLAIMLSATAVAINFATSSAIDQTKVLVYGTPAHYIHDSAGRRLL...,-
98,E2DWQ7,-,{'-': 0.999996},MRLAIMLSATAVAINFATCSAIDQTKVLVYGTPAHYIHDSAGRRLL...,-
99,P0DOO9,-,{'-': 1.0},MGNKNIKPSKENRLSILYKDRMDSFKRGSWATSSFREKSHATIQRF...,-


## 4. Load RXN

In [111]:
rxn = pd.read_feather(cfg.FILE_RHEA_REACTION)
rxn.head(3)

Unnamed: 0,reaction_id,equation,chebi_id,ec_number,equation_chebi,equation_smiles
0,RHEA:22636,dCTP + H2O = dCMP + diphosphate + H(+),CHEBI:61481;CHEBI:15377;CHEBI:57566;CHEBI:3301...,EC:3.6.1.9;EC:3.6.1.12;EC:3.6.1.65,CHEBI:61481 + CHEBI:15377 = CHEBI:57566 + CHEB...,Nc1ccn([C@H]2C[C@H](O)[C@@H](COP([O-])(=O)OP([...
1,RHEA:22640,NADP(+) + sphinganine = 3-oxosphinganine + H(+...,CHEBI:58349;CHEBI:57817;CHEBI:58299;CHEBI:1537...,EC:1.1.1.102,CHEBI:58349 + CHEBI:57817 = CHEBI:58299 + CHEB...,NC(=O)c1ccc[n+](c1)[C@@H]1O[C@H](COP([O-])(=O)...
2,RHEA:22644,O2 + protopine + reduced [NADPH--hemoprotein r...,CHEBI:15379;CHEBI:16415;CHEBI:57618;CHEBI:1710...,EC:1.14.14.98,CHEBI:15379 + CHEBI:16415 + CHEBI:57618 = CHEB...,O=O.CN1CCc2cc3OCOc3cc2C(=O)Cc2ccc3OCOc3c2C1.Cc...


In [170]:
res_ecrecer.iloc[11:25]

Unnamed: 0,uniprot_id,RXNRECer,RXNRECer_with_prob,seq,rxn_groundtruth
11,P0DTH6,-,{'-': 0.999998},MKKIFLTMTPLPYMNHLPENHYFLSLTVVISIIHLFTTCVPQHNHS...,-
12,Q7XQ85,RHEA:21744;-,"{'RHEA:21744': 0.997593, '-': 0.985379}",MAYQGIDLLSTKAAGDDHGENSSYFDGWKAYDTNPFDLRHNRGGVI...,RHEA:21744
13,A0A0P0WIY3,RHEA:21744;-,"{'RHEA:21744': 0.990111, '-': 0.7777}",MVGRMLSSPEPTLSTMAMSAAHGEDSPYFAGWRAYDEDPYDPITNP...,RHEA:21744
14,Q5W6F9,RHEA:21744;-,"{'RHEA:21744': 0.989836, '-': 0.7777}",MGVKLLADGCAGASSSPALSRVATSAAHGEGSPYFAGWKAYDEDPY...,RHEA:21744
15,A0A0P0UZP7,RHEA:21744;-,"{'RHEA:21744': 0.979187, '-': 0.7777}",MIMSGFHIGIYTSICLYIPLPHLEPWIISSHTPKNLNLLDCLLYCS...,RHEA:21744
16,Q9SNN8,RHEA:22104;-;RHEA:21744,"{'RHEA:22104': 0.980896, '-': 0.980291, 'RHEA:...",MRRSGNGGAAKKKKKRSASAASERRPRADGGMRIVVPLQGVVQGRG...,RHEA:21744
17,L0TBY6,-;RHEA:20049,"{'-': 0.99999, 'RHEA:20049': 0.989305}",MTHREELLPPMKWDAWGDPAAAKPLSDGVRSLLKQVVGLADSEQPE...,-
18,M5AWY0,RHEA:34415;-;RHEA:56596,"{'RHEA:34415': 0.999452, '-': 0.7777, 'RHEA:56...",MQAGFFHTPYNLPTRTARQMFDWSLKLAQVCDEAGFADFMIGEHST...,RHEA:34415
19,Q6I621,-,{'-': 0.999998},MWKGFLSKLPRKTSASGRGADLDSGQCSNGAGNGNPIQRTSSCGSI...,-
20,Q88FI7,RHEA:18049;RHEA:23352;RHEA:10212;RHEA:12268,"{'RHEA:18049': 0.99144, 'RHEA:23352': 0.988573...",MNQESISQSIAIVHPITLSHGRNAEVWDTDGKRYIDFVGGIGVLNL...,RHEA:12601;RHEA:10212


In [193]:
import json

def get_rxn_detail(rxn_id, rxn_bank):
    if rxn_id == '-':
        return {
            'reaction id': '-',
            'reaction equation': '-'
        }
    
    rxn_record = rxn_bank[rxn_bank.reaction_id == rxn_id]
    
    if rxn_record.empty:
        return {}  # Avoid errors if the record is missing

    rxn_record = rxn_record.iloc[0]  # Take the first match

    return {
        'reaction id': rxn_record.reaction_id,
        'reaction equation': rxn_record.equation,
        'reaction equation in ChEBI format': rxn_record.equation_chebi,
        'reaction equation in SMILES format': rxn_record.equation_smiles,
        'reaction associated Enzyme Commission Number': rxn_record.ec_number
    }

def make_make_query(uniprot_id, seq, rxn_id, rxn_bank, prompt):
    if not rxn_id or cfg.SPLITER not in rxn_id:
        return {}

    rxns = rxn_id.split(cfg.SPLITER)



    query_info = {
        'protein information': {
            'uniprot id': uniprot_id,
            'protein amino acid sequence': seq
        },
        'reaction information': {
            f'candidate reaction {i+1}': get_rxn_detail(rxn, rxn_bank)
            for i, rxn in enumerate(rxns)
        }
    }

    llm_query = prompt  + json.dumps(query_info, indent=4, ensure_ascii=False)

    return llm_query


In [243]:
prompt0 = """You are a biochemical expert. Your task is to analyze a protein and determine which reaction(s) it is most likely to catalyze based on the provided data.

### Guidelines:
1. **Input Data:**
- **Protein Information:** Includes the Uniprot ID and the protein's amino acid sequence.
- **Reaction Information:** A list of candidate reactions. Each reaction is provided in:
    - Standard reaction equation
    - Reaction equation in ChEBI format
    - Reaction equation in SMILES format

2. **Sequence Analysis & Functional Inference:**
- **Basic Properties:** Consider the protein's length, molecular weight, and amino acid composition to see if they match known enzyme classes.
- **Domain & Active Site Identification:** Look for conserved domains and key catalytic residues (or cofactor-binding motifs) that are critical for catalysis.
- **Structural Considerations:** Evaluate whether the predicted secondary/tertiary structure supports the formation of an active site suitable for the reaction’s substrate.
- **Database & Homology Evidence:** Use similarities to annotated proteins and known enzyme families to infer likely catalytic activity.

3. **Handling Non-Catalytic Cases:**
- A candidate reaction denoted by a single dash ("-") indicates that the protein is predicted not to catalyze that particular reaction.
- **Important:** If all candidate reactions are marked with a dash, or if your analysis suggests that none of the reactions with valid IDs are likely catalyzed by the protein, then include the dash candidate reaction in the output to indicate that the protein is not catalytically active for any provided reaction.

4. **Multiple Reactions:**
- Proteins may have multiple active sites. If applicable, select more than one reaction.

5. **Ranking:**
- Rank the selected reactions by likelihood (1 being the most likely).

6. **Justification:**
- Provide a brief explanation for each selection, citing relevant sequence features (e.g., conserved domains, active site residues, structural motifs) and reaction details.

### Expected Output (in JSON format):
```json
[
    {
        "reaction_id": "xxx",
        "selected": "yes" or "no",
        "rank": <integer>,         // 1 for the most likely, 2 for the next, etc.
        "confidence": <number>,      // Value between 0 and 1
        "reason": "Explanation based on sequence and reaction data."
    },
    ...
]
"""

prompt1 = """
You are a biochemical expert. Given a protein sequence and a set of potential catalyzed reactions, please analyze and determine the most likely catalyzed reactions. 
    The information is provided in JSON format.
    Important note: If a reaction is represented by a single dash ("-"), it means the protein cannot catalyze a reaction. 
    Please consider this in your analysis.
    
Guidelines:
    1. **Handling Non-Catalytic Cases:**
    - A candidate reaction denoted by a single dash ("-") indicates that the protein is predicted not to catalyze that particular reaction.
    - **Important:** If all candidate reactions are marked with a dash, or if your analysis suggests that none of the reactions with valid IDs are likely catalyzed by the protein, then include the dash candidate reaction in the output to indicate that the protein is not catalytically active for any provided reaction.

    2. **Multiple Reactions:**
    - Proteins may have multiple active sites. If applicable, select more than one reaction.

    3. **Ranking:**
    - Rank the selected reactions by likelihood (1 being the most likely).

    4. **Justification:**
    - Provide a brief explanation for each selection, citing relevant sequence features (e.g., conserved domains, active site residues, structural motifs) and reaction details.

    
Expected Output (in JSON format):
    ```json
    [
        {
            "reaction_id": "xxx",
            "selected": "yes" or "no",
            "rank": <integer>,         // 1 for the most likely, 2 for the next, etc.
            "confidence": <number>,      // Value between 0 and 1
            "reason": "Explanation based on sequence and reaction data."
        },
        ...
    ]
    
"""

prompt2 = """
You are a biochemical expert. Your task is to analyze a protein and determine which reaction(s) it is most likely to catalyze based on the provided data in JSON format.

Important Notes:
1. If a reaction is represented by a single dash ("-"), it means the protein is predicted not to catalyze that reaction. However, if all candidate reactions are marked with a dash, or if your analysis suggests that none of the reactions with valid IDs are likely catalyzed by the protein, then include the dash candidate reaction in your output to indicate that the protein is not catalytically active for any provided reaction.
2. Proteins may have multiple active sites. If applicable, select more than one reaction.
3. For each reaction, provide a brief explanation citing relevant sequence features (e.g., conserved domains, active site residues, structural motifs) and reaction details.

Output your final answer in JSON format using the following template:
{
    "reaction_id": "xxx",
    "selected": "yes" or "no",
    "rank": <integer>,         // 1 for the most likely, 2 for the next, etc.
    "confidence": <number>,      // A value between 0 and 1
    "reason": "Explanation based on sequence and reaction data."
}

Input:
"""


prompt3 = """
You are a biochemical expert. Your task is to analyze a protein and determine which reaction(s) it is most likely to catalyze based on the provided data in JSON format.

Guidline:
1. **Structural Chemistry Analysis**
   - Parse SMILES to identify reaction centers (e.g., bond cleavage sites)
   $$ \text{Example: } \text{C-S bond in } \mathrm{CHEBI:59789} \rightarrow \mathrm{CH_3S^- + adenosyl} $$
   - Map ChEBI IDs to functional groups (e.g., thioether in CHEBI:59789)

2. **Sequence-Structure Mapping**
   - Detect catalytic triads using PROSITE patterns:
     $$ \text{Cys-X-X-Cys} \Rightarrow \text{Sulfur metabolism} $$
     $$ \text{Ser-Lys-Asp} \Rightarrow \text{Hydrolase activity} $$
   - Verify cofactor binding with PROSITE PDOC00005

3. **Dynamic Compatibility Check**
   - Calculate active site volume using:
     $$ V_{\text{site}} = \frac{4}{3}\pi r_{\text{substrate}}^3 \pm 15\% $$
   - Validate stereochemical constraints from SMILES chirality tags (@, @@)

4. **Conflict Resolution Protocol**
   - Prioritize reaction chemistry over EC numbers when conflicts exist
   - Use UniProtKB/Swiss-Prot annotations (e.g., FT ACT_SITE) as tiebreaker

**Important Notes:**
- If a reaction is represented by a single dash ("-"), it means the protein is predicted not to catalyze that reaction. However, if all candidate reactions are marked with a dash, or if your analysis suggests that none of the reactions with valid IDs are likely catalyzed by the protein, then include the dash candidate reaction in your output to indicate that the protein is not catalytically active for any provided reaction.
- Proteins may have multiple active sites. If applicable, select more than one reaction.
- For each reaction, provide a brief explanation citing relevant sequence features (e.g., conserved domains, active site residues, structural motifs) and reaction details.

Output your final answer in JSON format using the following template:
{
    "reaction_id": "xxx",
    "selected": "yes" or "no",
    "rank": <integer>,         // 1 for the most likely, 2 for the next, etc.
    "confidence": <number>,      // A value between 0 and 1
    "reason": "Explanation based on sequence and reaction data."
}

Input:

"""


prompt3 = """
You are a biochemical expert. Your task is to analyze a protein and determine which reaction(s) it is most likely to catalyze based on the provided data in JSON format.

Guidline:
1. **Structural Chemistry Analysis**
   - Parse SMILES to identify reaction centers (e.g., bond cleavage sites)
   $$ \text{Example: } \text{C-S bond in } \mathrm{CHEBI:59789} \rightarrow \mathrm{CH_3S^- + adenosyl} $$
   - Map ChEBI IDs to functional groups (e.g., thioether in CHEBI:59789)

2. **Sequence-Structure Mapping**
   - Detect catalytic triads using PROSITE patterns:
     $$ \text{Cys-X-X-Cys} \Rightarrow \text{Sulfur metabolism} $$
     $$ \text{Ser-Lys-Asp} \Rightarrow \text{Hydrolase activity} $$
   - Verify cofactor binding with PROSITE PDOC00005

3. **Dynamic Compatibility Check**
   - Calculate active site volume using:
     $$ V_{\text{site}} = \frac{4}{3}\pi r_{\text{substrate}}^3 \pm 15\% $$
   - Validate stereochemical constraints from SMILES chirality tags (@, @@)

4. **Conflict Resolution Protocol**
   - Prioritize reaction chemistry over EC numbers when conflicts exist
   - Use UniProtKB/Swiss-Prot annotations (e.g., FT ACT_SITE) as tiebreaker

**Important Notes:**
- If a reaction is represented by a single dash ("-"), it means the protein is predicted not to catalyze that reaction. However, if all candidate reactions are marked with a dash, or if your analysis suggests that none of the reactions with valid IDs are likely catalyzed by the protein, then include the dash candidate reaction in your output to indicate that the protein is not catalytically active for any provided reaction.
- Proteins may have multiple active sites. If applicable, select more than one reaction.
- For each reaction, provide a brief explanation citing relevant sequence features (e.g., conserved domains, active site residues, structural motifs) and reaction details.

Output your final answer in JSON format using the following template:
{
    "reaction_id": "xxx",
    "selected": "yes" or "no",
    "rank": <integer>,         // 1 for the most likely, 2 for the next, etc.
    "confidence": <number>,      // A value between 0 and 1
    "reason": "Explanation based on sequence and reaction data."
}

Input:

"""

prompt4 = """
You are a biochemical expert. Your task is to analyze a protein and determine which reaction(s) it is most likely to catalyze based on the provided data in JSON format.

Guidline:
1. **Omni-enzyme Structural Profiling**
   - Perform topological analysis of SMILES using:
     $$ \text{Graph neural networks for reaction center detection} $$
     $$ \text{Electrostatic potential mapping (EPM) for charge complementarity} $$
   - Cross-reference ChEBI IDs with MetaCyc reaction pathways:
     $$ \forall R \in \text{Reactions}, \exists \text{Substrate-Enzyme PCMatrix} \geq 0.7 $$

2. **Quantum Biochemical Evaluation**
   - Calculate catalytic proficiency using:
     $$ k_{\text{cat}}/K_m = \frac{k_B T}{h} e^{-\Delta G^\ddagger/RT} $$
   - Simulate transition states with:
     $$ \text{QM/MM modeling of active site frontier orbitals} $$
     $$ \text{Reaction coordinate analysis (IRC)} $$

3. **Evolutionary Dynamics Integration**
   - Calculate phylogenetic catalytic propensity:
     $$ P_{\text{cat}} = \frac{\sum \text{Consurf scores}}{\text{Active site residues}} \times \text{EC_specific_phyletic_pattern} $$
   - Detect convergent evolution signatures through:
     $$ \text{Parallel substitution analysis in catalytic residues} $$
     $$ \text{Structural phylogeny using DALI Z-scores} $$

4. **Multi-scale Mechanistic Validation**
   - Verify allosteric regulation potential:
     $$ \text{Ensemble docking across conformational states} $$
     $$ \text{Principal component analysis of MD trajectories} $$
   - Cross-validate with omics data:
     $$ \text{Co-expression with metabolic pathway genes (r ≥ 0.85)} $$
     $$ \text{Proteomic colocalization evidence} $$

**Important Notes:**
- If a reaction is represented by a single dash ("-"), it means the protein is predicted not to catalyze that reaction. However, if all candidate reactions are marked with a dash, or if your analysis suggests that none of the reactions with valid IDs are likely catalyzed by the protein, then include the dash candidate reaction in your output to indicate that the protein is not catalytically active for any provided reaction.
- Proteins may have multiple active sites. If applicable, select more than one reaction.
- For each reaction, provide a brief explanation citing relevant sequence features (e.g., conserved domains, active site residues, structural motifs) and reaction details.

Output your final answer in JSON format using the following template:
{
    "reaction_id": "xxx",
    "selected": "yes" or "no",
    "rank": <integer>,         // 1 for the most likely, 2 for the next, etc.
    "confidence": <number>,    // A value between 0 and 1
    "reason": "Explanation based on sequence and reaction data."
}

Input:
"""


prompt5 = """
You are a biochemical expert. Your task is to analyze a protein and determine which reaction(s) it is most likely to catalyze based on the provided data in JSON format.

Guidline:
1. **Literature Evidence Triangulation** ★最高优先级
   a. UniProt/Sequence-based search:
     $$ \text{IF } \exists \text{ PMIDs } (P|E) \rightarrow \text{ReactID} \text{ THEN confidence} += 0.4 $$
   b. Reaction-centric search:
     $$ \text{Match(Equation|ChEBI|SMILES) → PMIDs → Enzyme } \cap \text{Input} $$
   c. EC number validation:
     $$ \text{Validate EC}_{obs} = \frac{\text{PMID}_{\text{EC-match}}}{\text{PMID}_{\text{EC-total}}} \geq 0.6 $$

2. **Structural Catalytic Competence**
   - SMILES拓扑指纹匹配:
     $$ \text{Tanimoto} \geq 0.85 \text{ with Catalophore model} $$
   - 电荷互补性验证:
     $$ \Delta E_{\text{elec}} = \sum \frac{q_i q_j}{4\pi\epsilon r_{ij}} \leq -15 \text{ kJ/mol} $$

3. **Quantum Transition State Analysis**
   - 催化效率计算:
     $$ \ln(k_{\text{cat}}/K_m) \propto -\Delta G^\ddagger_{\text{calc}} + \text{文献修正项} $$
   - 轨道相互作用验证:
     $$ \text{HOMO-LUMO gap匹配度} \geq 82\% $$

4. **Evolutionary & Omics Validation**
   - 系统发育可信度:
     $$ P_{\text{cat}} = \text{Consurf} \times \text{EC}_{phylo} \geq 7.5 $$
   - 组学相关性:
     $$ r_{\text{coex}} \geq 0.9 \text{ AND } \text{LocalScore} \geq 0.8 $$

**Important Notes:**
- If a reaction is represented by a single dash ("-"), it means the protein is predicted not to catalyze that reaction. However, if all candidate reactions are marked with a dash, or if your analysis suggests that none of the reactions with valid IDs are likely catalyzed by the protein, then include the dash candidate reaction in your output to indicate that the protein is not catalytically active for any provided reaction.
- Proteins may have multiple active sites. If applicable, select more than one reaction.
- For each reaction, provide a brief explanation citing relevant sequence features (e.g., conserved domains, active site residues, structural motifs) and reaction details.

Output your final answer in JSON format using the following template:
{
    "reaction_id": "xxx",
    "selected": "yes" or "no",
    "rank": <integer>,         // 1 for the most likely, 2 for the next, etc.
    "confidence": <number>,    // A value between 0 and 1
    "reason": "Explanation based on sequence and reaction data."
}

Input:

"""

In [238]:
res_ecrecer[res_ecrecer.RXNRECer!=res_ecrecer.rxn_groundtruth]

Unnamed: 0,uniprot_id,RXNRECer,RXNRECer_with_prob,seq,rxn_groundtruth
5,C0HLM8,-;RHEA:15801,"{'-': 0.999989, 'RHEA:15801': 0.940825}",GLEETYCSMKIK,-
12,Q7XQ85,RHEA:21744;-,"{'RHEA:21744': 0.997593, '-': 0.985379}",MAYQGIDLLSTKAAGDDHGENSSYFDGWKAYDTNPFDLRHNRGGVI...,RHEA:21744
13,A0A0P0WIY3,RHEA:21744;-,"{'RHEA:21744': 0.990111, '-': 0.7777}",MVGRMLSSPEPTLSTMAMSAAHGEDSPYFAGWRAYDEDPYDPITNP...,RHEA:21744
14,Q5W6F9,RHEA:21744;-,"{'RHEA:21744': 0.989836, '-': 0.7777}",MGVKLLADGCAGASSSPALSRVATSAAHGEGSPYFAGWKAYDEDPY...,RHEA:21744
15,A0A0P0UZP7,RHEA:21744;-,"{'RHEA:21744': 0.979187, '-': 0.7777}",MIMSGFHIGIYTSICLYIPLPHLEPWIISSHTPKNLNLLDCLLYCS...,RHEA:21744
16,Q9SNN8,RHEA:22104;-;RHEA:21744,"{'RHEA:22104': 0.980896, '-': 0.980291, 'RHEA:...",MRRSGNGGAAKKKKKRSASAASERRPRADGGMRIVVPLQGVVQGRG...,RHEA:21744
17,L0TBY6,-;RHEA:20049,"{'-': 0.99999, 'RHEA:20049': 0.989305}",MTHREELLPPMKWDAWGDPAAAKPLSDGVRSLLKQVVGLADSEQPE...,-
18,M5AWY0,RHEA:34415;-;RHEA:56596,"{'RHEA:34415': 0.999452, '-': 0.7777, 'RHEA:56...",MQAGFFHTPYNLPTRTARQMFDWSLKLAQVCDEAGFADFMIGEHST...,RHEA:34415
20,Q88FI7,RHEA:18049;RHEA:23352;RHEA:10212;RHEA:12268,"{'RHEA:18049': 0.99144, 'RHEA:23352': 0.988573...",MNQESISQSIAIVHPITLSHGRNAEVWDTDGKRYIDFVGGIGVLNL...,RHEA:12601;RHEA:10212
21,A5CAL1,RHEA:10992;RHEA:17905;RHEA:18657;RHEA:57408;RH...,"{'RHEA:10992': 0.998241, 'RHEA:17905': 0.99824...",MESIGVLLTCPMNPYLEQELDKRFKLFRFWDFPSANDLFREHSNSI...,RHEA:16369;RHEA:18657;RHEA:18229;RHEA:10992


In [247]:
ix = 16
str_query = make_make_query(uniprot_id=res_ecrecer.iloc[ix].uniprot_id, seq=res_ecrecer.iloc[ix].seq, rxn_id=res_ecrecer.iloc[ix].RXNRECer, rxn_bank=rxn, prompt=prompt5)

print(str_query)



You are a biochemical expert. Your task is to analyze a protein and determine which reaction(s) it is most likely to catalyze based on the provided data in JSON format.

Guidline:
1. **Literature Evidence Triangulation** ★最高优先级
   a. UniProt/Sequence-based search:
ightarrow 	ext{ReactID} 	ext{ THEN confidence} += 0.4 $
   b. Reaction-centric search:
     $$ 	ext{Match(Equation|ChEBI|SMILES) → PMIDs → Enzyme } \cap 	ext{Input} $$
   c. EC number validation:
     $$ 	ext{Validate EC}_{obs} = rac{	ext{PMID}_{	ext{EC-match}}}{	ext{PMID}_{	ext{EC-total}}} \geq 0.6 $$

2. **Structural Catalytic Competence**
   - SMILES拓扑指纹匹配:
     $$ 	ext{Tanimoto} \geq 0.85 	ext{ with Catalophore model} $$
   - 电荷互补性验证:
     $$ \Delta E_{	ext{elec}} = \sum rac{q_i q_j}{4\pi\epsilon r_{ij}} \leq -15 	ext{ kJ/mol} $$

3. **Quantum Transition State Analysis**
   - 催化效率计算:
     $$ \ln(k_{	ext{cat}}/K_m) \propto -\Delta G^\ddagger_{	ext{calc}} + 	ext{文献修正项} $$
   - 轨道相互作用验证:
     $$ 	ext{HOMO-LUMO gap匹配度} \ge

In [136]:
len(str_query)

0

In [186]:
cli1 = llmchat.Chat(name='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', 
               url='http://172.16.8.75:8123/v1', 
               api_key = 'bio12345'
               )

cli2 = llmchat.Chat(name='DeepSeek-R1-Distill-Llama-70B', 
               url='http://172.16.8.76:8123/v1', 
               api_key = 'bio12345'
                )

cli3 = llmchat.Chat(name='deepseek-r1:70b', 
               url='http://bdcop1.tibhpc.net:11434/v1', 
               api_key = 'bio12345'
)

In [171]:
cli1.chat(message=str_query)

ChatCompletion(id='1959451ab5134958b2e3bf4b3d8677fd', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Okay, so I\'m trying to figure out which reactions the given protein sequence can catalyze. The protein has the UniProt ID Q9SNN8, and its sequence is provided. There are three candidate reactions, but one of them is just a dash, meaning the protein can\'t catalyze that one. So I need to focus on the other two reactions.\n\nFirst, I\'ll look at Reaction 1, which has the ID RHEA:22104. The reaction equation is 2-oxoglutarate converting to 3\'\'-deamino-3\'\'-oxonicotianamine and L-glutamate. The EC number is 2.6.1.80, which I think is a transaminase. Transaminases usually have specific active sites with certain residues like lysine, which is often involved in the catalysis. Looking at the protein sequence, I should check for conserved domains or motifs that match transaminases. Maybe there\'s a PLP (pyridoxal phosphate) binding site s

ChatCompletion(id='1959451ab5134958b2e3bf4b3d8677fd', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Okay, so I\'m trying to figure out which reactions the given protein sequence can catalyze. The protein has the UniProt ID Q9SNN8, and its sequence is provided. There are three candidate reactions, but one of them is just a dash, meaning the protein can\'t catalyze that one. So I need to focus on the other two reactions.\n\nFirst, I\'ll look at Reaction 1, which has the ID RHEA:22104. The reaction equation is 2-oxoglutarate converting to 3\'\'-deamino-3\'\'-oxonicotianamine and L-glutamate. The EC number is 2.6.1.80, which I think is a transaminase. Transaminases usually have specific active sites with certain residues like lysine, which is often involved in the catalysis. Looking at the protein sequence, I should check for conserved domains or motifs that match transaminases. Maybe there\'s a PLP (pyridoxal phosphate) binding site s

In [172]:
cli2.chat(message= str_query)

ChatCompletion(id='c8e4df94e0194ff4a9cb32d317024a23', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Okay, so I\'m trying to figure out which reactions the protein Q9SNN8 can catalyze. Let me start by looking at the information provided.\n\nFirst, the protein sequence is given. I know that certain motifs or domains in the sequence can indicate what kind of reactions the protein might catalyze. I\'ll need to analyze the sequence for any conserved domains or active site residues that are characteristic of specific enzymes.\n\nLooking at the reaction information, there are three candidate reactions. The second one is a dash, which means the protein can\'t catalyze that reaction. So I can ignore that one.\n\nNow, focusing on the first and third reactions. Reaction 1 is RHEA:22104, which involves 2-oxoglutarate and nicotianamine. The enzyme commission number is EC:2.6.1.80. I remember that EC numbers can give clues about the type of reac

ChatCompletion(id='c8e4df94e0194ff4a9cb32d317024a23', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Okay, so I\'m trying to figure out which reactions the protein Q9SNN8 can catalyze. Let me start by looking at the information provided.\n\nFirst, the protein sequence is given. I know that certain motifs or domains in the sequence can indicate what kind of reactions the protein might catalyze. I\'ll need to analyze the sequence for any conserved domains or active site residues that are characteristic of specific enzymes.\n\nLooking at the reaction information, there are three candidate reactions. The second one is a dash, which means the protein can\'t catalyze that reaction. So I can ignore that one.\n\nNow, focusing on the first and third reactions. Reaction 1 is RHEA:22104, which involves 2-oxoglutarate and nicotianamine. The enzyme commission number is EC:2.6.1.80. I remember that EC numbers can give clues about the type of reac

In [223]:
res_cli3 = cli3.chat(message=str_query)

In [217]:
res_cli3

ChatCompletion(id='chatcmpl-327', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='<think>\nOkay, so I\'m trying to figure out which reactions the given protein might catalyze. Let\'s start by looking at the information provided.\n\nThe protein has a UniProt ID of Q9SNN8 and a pretty long amino acid sequence. The task is to analyze this sequence against three candidate reactions. Two of them have valid IDs, RHEA:22104 and RHEA:21744, while the third one is just a dash, meaning it\'s not catalyzed.\n\nFirst, I\'ll look at Reaction 1 (RHEA:22104). The reaction equation involves 2-oxoglutarate and nicotianamine producing 3\'\'-deamino-3\'\'-oxonicotianamine and L-glutamate. The EC number is 2.6.1.80, which I think corresponds to an aminotransferase enzyme. These enzymes usually have specific active site residues like lysine, which is often involved in the catalysis.\n\nLooking at the protein sequence, I notice several conserved domains a

In [227]:
print(res_cli3.choices[0].message.content)

<think>
Okay, so I'm trying to figure out which reactions the given protein might catalyze. Let's start by looking at the information provided.

The protein has a UniProt ID of Q9SNN8 and a pretty long amino acid sequence. The task is to analyze this sequence against three candidate reactions. Two of them have valid IDs, RHEA:22104 and RHEA:21744, while the third one is just a dash, meaning it's not catalyzed.

First, I'll look at Reaction 1 (RHEA:22104). The reaction equation involves 2-oxoglutarate and nicotianamine producing 3''-deamino-3''-oxonicotianamine and L-glutamate. The EC number is 2.6.1.80, which I think corresponds to an aminotransferase enzyme. These enzymes usually have specific active site residues like lysine, which is often involved in the catalysis.

Looking at the protein sequence, I notice several conserved domains and motifs. There's a stretch with "GVRGVLISNPNNPTGSFVPKQTLHDLLEFAAEKNIHLISDEVFAGSTYGSGKFVSVAEVVDDLEDFDKGRVHIIYGLSKDLSLAGFRVGVIYSYNESIVTAAAKIARFSSVSTPT

In [226]:
print(str_query)


You are a biochemical expert. Given a protein sequence and a set of potential catalyzed reactions, please analyze and determine the most likely catalyzed reactions. 
    The information is provided in JSON format.
    Important note: 
    1. If a reaction is represented by a single dash ("-"), it means the protein cannot catalyze a reaction. 
    Please consider this in your analysis. If all candidate reactions are marked with a dash, or if your analysis suggests that none of the reactions with valid IDs are likely catalyzed by the protein, then include the dash candidate reaction in the output to indicate that the protein is not catalytically active for any provided reaction.
    2. Proteins may have multiple active sites. If applicable, select more than one reaction.
    3. Provide a brief explanation for each selection, citing relevant sequence features (e.g., conserved domains, active site residues, structural motifs) and reaction details.

Please give out the finale output in jso