# AAAIM Evaluation Test - ChEBI

This notebook tests both single model evaluation and batch evaluation of multiple models.

In [25]:
# Standard-library and third-party imports used throughout the notebook.
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging
import warnings
# Silence every Python warning for the rest of the session. Because this runs
# before the project imports below, warnings raised by those imports are
# hidden as well.
warnings.filterwarnings('ignore')

# Add the project root to the Python path
# The root is taken to be the parent of the current working directory, so the
# `core` / `utils` imports below only resolve when the kernel is started from a
# direct sub-folder of the project.
# NOTE(review): presumably that sub-folder is `tests/` — confirm.
project_root = Path().absolute().parent
sys.path.insert(0, str(project_root))

# Import AAAIM functions
from core import annotate_model, curate_model
from utils.evaluation import (
    evaluate_single_model,
    evaluate_models_in_folder,
    print_evaluation_results,
    compare_results,
    process_saved_llm_responses
)

# Configure logging
# Root logger: INFO and above, formatted as "<timestamp> - <level> - <message>".
# NOTE(review): some recorded outputs further down use a different line format
# ("INFO ..."), so another handler may be configured elsewhere — confirm.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger for messages emitted from this notebook.
logger = logging.getLogger(__name__)

## Configuration

Set up paths and parameters for evaluation.

In [2]:
# LLM configuration
# Model identifier passed as `llm_model` to the annotate / curate / evaluate
# calls in the cells below. The commented line is an alternative identifier.
# llm_model = "meta-llama/llama-3.3-70b-instruct:free"  # or "gpt-4o-mini"
llm_model = "Llama-3.3-70B-Instruct"

# Evaluation parameters
max_entities_per_model = 10  # Limit entities per model for testing
# NOTE: in the cells visible in this notebook, `num_models_to_test` is only
# referenced from commented-out code.
num_models_to_test = 5  # Number of models to test in batch evaluation

# Entity and database configuration
# Passed as `entity_type` / `database` to every annotation and evaluation call.
entity_type = "chemical"
database = "chebi"

# Relative path, so results land under the kernel's working directory.
output_dir = "./results/"  # Output directory for results

## Annotating a new model with no or few existing annotations

In [5]:
# Model file used for the annotation demo in the next cell.
test_model_file = "190_few_anno.xml"
# Report up front whether the file is reachable from the working directory,
# so a wrong path is spotted before the annotation call is made.
print(
    f"✓ Test model found: {test_model_file}"
    if os.path.exists(test_model_file)
    else f"✗ Test model not found: {test_model_file}"
)

✓ Test model found: 190_few_anno.xml


In [6]:
# Test with a single model
# Annotate `test_model_file` with the configured LLM, capping the number of
# entities at `max_entities_per_model`. The call returns a
# (recommendations table, metrics) pair, shown in the next two cells.
recommendations_df, metrics = annotate_model(
    model_file=test_model_file,
    llm_model=llm_model,
    max_entities=max_entities_per_model,
    entity_type=entity_type,
    database=database
)

INFO Starting annotation for model: 190_few_anno.xml
INFO Using LLM model: Llama-3.3-70B-Instruct
INFO Using method: direct for database search
INFO Entity type: chemical, Database: chebi
INFO >>>Step 1: Getting species from model...<<<
INFO Found 11 species in model
INFO Found 2 entities with existing annotations
INFO Selected 10 entities for annotation
INFO >>>Step 2: Extracting model context...<<<
INFO Extracted context for model: Rodriguez-Caso2006_Polyamine_Metabolism
INFO >>>Step 3: Querying LLM (Llama-3.3-70B-Instruct)...<<<
INFO HTTP Request: POST https://api.llama.com/compat/v1/chat/completions "HTTP/1.1 200 OK"
INFO LLM response received in 3.65s
INFO Parsed synonyms for 10 entities
INFO >>>Step 4: Searching chebi database...<<<
INFO Database search completed in 1.49s
INFO >>>Step 5: Generating recommendation table...<<<
INFO Annotation completed in 5.29s
INFO Generated 32 recommendations


In [7]:
# Display the recommendation table returned by the annotate_model call above.
recommendations_df

Unnamed: 0,file,type,id,display_name,annotation,annotation_label,match_score,existing,update_annotation
0,190_few_anno.xml,chemical,SAM,S-adenosyl-L-methionine,CHEBI:15414,S-adenosyl-L-methionine,1.0,1,keep
1,190_few_anno.xml,chemical,SAM,S-adenosyl-L-methionine,CHEBI:33442,(S)-S-adenosyl-L-methionine,0.333333,0,ignore
2,190_few_anno.xml,chemical,SAM,S-adenosyl-L-methionine,CHEBI:67040,S-adenosyl-L-methioninate,0.666667,0,ignore
3,190_few_anno.xml,chemical,SAM,S-adenosyl-L-methionine,CHEBI:59789,S-adenosyl-L-methionine zwitterion,0.333333,0,ignore
4,190_few_anno.xml,chemical,A,S-adenosylmethioninamine,CHEBI:15625,S-adenosylmethioninamine,1.0,1,keep
5,190_few_anno.xml,chemical,P,Putrescine,CHEBI:17148,putrescine,1.0,0,add
6,190_few_anno.xml,chemical,P,Putrescine,CHEBI:326268,"1,4-butanediammonium",0.333333,0,ignore
7,190_few_anno.xml,chemical,S,Spermine,CHEBI:15746,spermine,0.333333,0,ignore
8,190_few_anno.xml,chemical,S,Spermine,CHEBI:45725,spermine(4+),0.666667,0,ignore
9,190_few_anno.xml,chemical,S,Spermine,CHEBI:45718,"3,3,3-tetramine",0.333333,0,ignore


In [8]:
# Display the metrics returned alongside the annotation recommendations.
metrics

{'total_entities': 11,
 'entities_with_predictions': 10,
 'annotation_rate': 0.9090909090909091,
 'total_predictions': 32,
 'matches': 2,
 'accuracy': 1.0,
 'total_time': 5.28789496421814,
 'llm_time': 3.648242950439453,
 'search_time': 1.4888739585876465}

## Curate a model with existing annotations

Evaluate a single model that already has annotations.
Only the species with existing annotations are considered.

In [9]:
# Model file used for the curation and single-model evaluation cells below.
test_model_file = "test_models/BIOMD0000000190.xml"
# Report up front whether the file is reachable from the working directory,
# so a wrong path is spotted before the curation call is made.
print(
    f"✓ Test model found: {test_model_file}"
    if os.path.exists(test_model_file)
    else f"✗ Test model not found: {test_model_file}"
)

✓ Test model found: test_models/BIOMD0000000190.xml


In [10]:
# Test with a single model
# Curate `test_model_file` with the same configuration as the annotation demo.
# This rebinds `recommendations_df` and `metrics`, replacing the values from
# the annotate_model cell.
# NOTE(review): the recorded log for this call shows a literal "{llm_model}"
# in the "Step 3" line, whereas the annotate_model log shows the model name —
# presumably a missing f-string prefix inside curate_model; verify in `core`.
recommendations_df, metrics = curate_model(
    model_file=test_model_file,
    llm_model=llm_model,
    max_entities=max_entities_per_model,
    entity_type=entity_type,
    database=database
)

INFO Starting curation for model: test_models/BIOMD0000000190.xml
INFO Using LLM model: Llama-3.3-70B-Instruct
INFO Using method: direct for database search
INFO Entity type: chemical, Database: chebi
INFO >>>Step 1: Finding existing annotations...<<<
INFO Found 11 entities with existing annotations
INFO Selected 10 entities for curation
INFO >>>Step 2: Extracting model context...<<<
INFO Extracted context for model: Rodriguez-Caso2006_Polyamine_Metabolism
INFO >>>Step 3: Querying LLM ({llm_model})...<<<
INFO HTTP Request: POST https://api.llama.com/compat/v1/chat/completions "HTTP/1.1 200 OK"
INFO LLM response received in 3.40s
INFO Parsed synonyms for 10 entities
INFO >>>Step 4: Searching chebi database...<<<
INFO Database search completed in 0.68s
INFO >>>Step 5: Generating recommendation table...<<<
INFO Curation completed in 4.20s
INFO Generated 33 recommendations


In [11]:
# Display the metrics returned by the curate_model call above.
metrics

{'total_entities': 11,
 'entities_with_predictions': 11,
 'annotation_rate': 1.0,
 'total_predictions': 33,
 'matches': 11,
 'accuracy': 1.0,
 'total_time': 4.200316905975342,
 'llm_time': 3.3968307971954346,
 'search_time': 0.6792628765106201}

In [12]:
# Display the recommendation table returned by the curate_model call above.
recommendations_df

Unnamed: 0,file,type,id,display_name,annotation,annotation_label,match_score,existing,update_annotation
0,BIOMD0000000190.xml,chemical,SAM,S-adenosyl-L-methionine,CHEBI:15414,S-adenosyl-L-methionine,1.0,1,keep
1,BIOMD0000000190.xml,chemical,SAM,S-adenosyl-L-methionine,CHEBI:33442,(S)-S-adenosyl-L-methionine,0.333333,0,ignore
2,BIOMD0000000190.xml,chemical,SAM,S-adenosyl-L-methionine,CHEBI:67040,S-adenosyl-L-methioninate,0.666667,0,ignore
3,BIOMD0000000190.xml,chemical,SAM,S-adenosyl-L-methionine,CHEBI:59789,S-adenosyl-L-methionine zwitterion,0.333333,0,ignore
4,BIOMD0000000190.xml,chemical,A,S-adenosylmethioninamine,CHEBI:15625,S-adenosylmethioninamine,1.0,1,keep
5,BIOMD0000000190.xml,chemical,P,Putrescine,CHEBI:17148,putrescine,1.0,1,keep
6,BIOMD0000000190.xml,chemical,P,Putrescine,CHEBI:326268,"1,4-butanediammonium",0.333333,0,ignore
7,BIOMD0000000190.xml,chemical,S,Spermine,CHEBI:15746,spermine,0.666667,1,keep
8,BIOMD0000000190.xml,chemical,S,Spermine,CHEBI:45725,spermine(4+),0.666667,0,ignore
9,BIOMD0000000190.xml,chemical,S,Spermine,CHEBI:83547,N(4)-aminopropylspermidine,0.333333,0,ignore


## Test 1: Single Model Evaluation

Evaluation of a single model with existing annotations.

In [11]:
# Test using utils evaluation function
# Evaluate the same `test_model_file` through utils.evaluation instead of the
# core API. With save_llm_results=True the recorded output reports a text file
# of LLM results written under the results folder.
result_df = evaluate_single_model(
    model_file=test_model_file,
    llm_model=llm_model,
    max_entities=max_entities_per_model,
    entity_type=entity_type,
    database=database,
    save_llm_results=True,
    output_dir=output_dir
)

2025-05-27 12:36:28,329 - INFO - Evaluating model: BIOMD0000000190.xml
2025-05-27 12:36:28,341 - INFO - Evaluating 10 entities in BIOMD0000000190.xml
2025-05-27 12:36:30,121 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


LLM results saved to: results/llama-3.3-70b-instruct/BIOMD0000000190_llm_results.txt


In [12]:
# Display the per-species evaluation table returned by evaluate_single_model.
result_df

Unnamed: 0,model,species_id,display_name,synonyms_LLM,reason,exist_annotation_chebi,exist_annotation_name,predictions,predictions_names,match_score,recall_formula,precision_formula,recall_chebi,precision_chebi,accuracy,total_time,llm_time,query_time
0,BIOMD0000000190.xml,SAM,S-adenosylmethionine,"[S-adenosylmethionine, AdoMet, SAMe]",The species were identified based on their dis...,[CHEBI:15414],S-adenosyl-L-methionine,"[CHEBI:15414, CHEBI:33442, CHEBI:67040]","[S-adenosyl-L-methionine, (S)-S-adenosyl-L-met...","[1.0, 0.3333333333333333, 0.6666666666666666]",1.0,1.0,1.0,0.333,1,18.389899,17.696673,0.693226
1,BIOMD0000000190.xml,A,decarboxylated S-adenosylmethionine,"[decarboxylated S-adenosylmethionine, S-adenos...",The species were identified based on their dis...,[CHEBI:15625],S-adenosylmethioninamine,[CHEBI:15625],[S-adenosylmethioninamine],[1.0],1.0,1.0,1.0,1.0,1,18.389899,17.696673,0.693226
2,BIOMD0000000190.xml,P,putrescine,"[putrescine, 1,4-diaminobutane, butane-1,4-dia...",The species were identified based on their dis...,[CHEBI:17148],putrescine,"[CHEBI:17148, CHEBI:326268]","[putrescine, 1,4-butanediammonium]","[1.0, 0.3333333333333333]",1.0,1.0,1.0,0.5,1,18.389899,17.696673,0.693226
3,BIOMD0000000190.xml,S,spermine,"[spermine, N,N'-bis(3-aminopropyl)propane-1,3-...",The species were identified based on their dis...,[CHEBI:15746],spermine,"[CHEBI:15746, CHEBI:45725, CHEBI:45718]","[spermine, spermine(4+), 3,3,3-tetramine]","[0.3333333333333333, 0.3333333333333333, 0.333...",1.0,0.5,1.0,0.333,1,18.389899,17.696673,0.693226
4,BIOMD0000000190.xml,D,spermidine,"[spermidine, N-(3-aminopropyl)propane-1,3-diam...",The species were identified based on their dis...,[CHEBI:16610],spermidine,"[CHEBI:16610, CHEBI:57834, CHEBI:16841]","[spermidine, spermidine(3+), bis(3-aminopropyl...","[0.3333333333333333, 0.3333333333333333, 0.333...",1.0,0.5,1.0,0.333,1,18.389899,17.696673,0.693226
5,BIOMD0000000190.xml,aS,N1-acetylspermine,"[N1-acetylspermine, N-acetylspermine, AcSpm]",The species were identified based on their dis...,[CHEBI:17312],N(1)-acetylspermine,"[CHEBI:17312, CHEBI:58101]","[N(1)-acetylspermine, N(1)-acetylsperminium(3+)]","[0.3333333333333333, 0.3333333333333333]",1.0,1.0,1.0,0.5,1,18.389899,17.696673,0.693226
6,BIOMD0000000190.xml,aD,N1-acetylspermidine,"[N1-acetylspermidine, N-acetylspermidine, AcSpd]",The species were identified based on their dis...,[CHEBI:22204],acetylspermidine,"[CHEBI:17927, CHEBI:58324]","[N(1)-acetylspermidine, N(1)-acetylspermidiniu...","[0.3333333333333333, 0.3333333333333333]",1.0,1.0,0.0,0.0,1,18.389899,17.696673,0.693226
7,BIOMD0000000190.xml,Met,methionine,"[methionine, L-methionine, 2-amino-4-(methylth...",The species were identified based on their dis...,[CHEBI:16643],L-methionine,"[CHEBI:16643, CHEBI:16811, CHEBI:64558, CHEBI:...","[L-methionine, methionine, methionine zwitteri...","[0.6666666666666666, 0.6666666666666666, 0.333...",1.0,0.5,1.0,0.143,1,18.389899,17.696673,0.693226
8,BIOMD0000000190.xml,ORN,ornithine,"[ornithine, L-ornithine, 2,5-diaminopentanoic ...",The species were identified based on their dis...,[CHEBI:15729],L-ornithine,"[CHEBI:18257, CHEBI:44667, CHEBI:15729, CHEBI:...","[ornithine, L-ornithinium(2+), L-ornithine, L-...","[0.6666666666666666, 0.3333333333333333, 0.333...",1.0,1.0,1.0,0.25,1,18.389899,17.696673,0.693226
9,BIOMD0000000190.xml,AcCoA,acetyl-CoA,"[acetyl-CoA, acetyl coenzyme A, CoA-SH]",The species were identified based on their dis...,[CHEBI:15351],acetyl-CoA,"[CHEBI:15351, CHEBI:57288, CHEBI:15346]","[acetyl-CoA, acetyl-CoA(4-), coenzyme A]","[0.6666666666666666, 0.3333333333333333, 0.333...",1.0,0.5,1.0,0.333,1,18.389899,17.696673,0.693226


## Test 2: Batch Model Evaluation

Test the evaluation of multiple models in a directory.

In [17]:
# Folder holding the SBML models for the batch evaluation. Switch to the
# commented alternative to run against the small local test set instead.
model_dir = "/Users/luna/Desktop/CRBM/AMAS_proj/Models/BioModels"
# model_dir = "test_models"
# Check that the model directory exists and collect its XML files.
# isdir() (rather than exists()) also rejects a path that points at a regular
# file, on which os.listdir() would raise.
if os.path.isdir(model_dir):
    # os.listdir() returns entries in arbitrary order; sort for a stable listing.
    model_files = sorted(f for f in os.listdir(model_dir) if f.endswith('.xml'))
    print(f"✓ Model directory found: {model_dir}")
    print(f"  - Found {len(model_files)} XML files")
else:
    # Keep `model_files` defined so later cells never hit a NameError, and
    # report the problem the same way the single-model checks do.
    model_files = []
    print(f"✗ Model directory not found: {model_dir}")

✓ Model directory found: /Users/luna/Desktop/CRBM/AMAS_proj/Models/BioModels
  - Found 1063 XML files


In [19]:
# Run batch evaluation
# Evaluate the models found in `model_dir` using the "direct" search method
# and top_k = 1. LLM responses are saved (save_llm_results=True) and the
# results table is written to `output_file` inside `output_dir`.
# NOTE(review): start_at=1 is presumably the 1-based index of the first model
# to process (the recorded log starts at "Evaluating 1/1063") — confirm.
batch_results_df = evaluate_models_in_folder(
    model_dir=model_dir,
    llm_model=llm_model,
    entity_type=entity_type,
    database=database,
    method="direct",
    top_k = 1,
    save_llm_results=True,
    output_dir=output_dir,
    output_file="biomd_chebi_direct_llama-3_top1_prompt_adjusted.csv",
    start_at=1
)

LLM results will be saved to: ./results/Llama-3.3-70B-instruct-Meta/chemical/20250710_0031
Evaluating 1/1063: BIOMD0000000001.xml
Evaluating 2/1063: BIOMD0000000002.xml
LLM results saved to: results/Llama-3.3-70B-instruct-Meta/chemical/20250710_0031/BIOMD0000000002.txt
Evaluating 3/1063: BIOMD0000000003.xml
Evaluating 4/1063: BIOMD0000000004.xml
Evaluating 5/1063: BIOMD0000000005.xml
Evaluating 6/1063: BIOMD0000000006.xml
Evaluating 7/1063: BIOMD0000000007.xml
Evaluating 8/1063: BIOMD0000000008.xml
Evaluating 9/1063: BIOMD0000000009.xml
Evaluating 10/1063: BIOMD0000000010.xml
Evaluating 11/1063: BIOMD0000000011.xml
Evaluating 12/1063: BIOMD0000000012.xml
Evaluating 13/1063: BIOMD0000000013.xml
LLM results saved to: results/Llama-3.3-70B-instruct-Meta/chemical/20250710_0031/BIOMD0000000013.txt
Evaluating 14/1063: BIOMD0000000014.xml
Evaluating 15/1063: BIOMD0000000015.xml
LLM results saved to: results/Llama-3.3-70B-instruct-Meta/chemical/20250710_0031/BIOMD0000000015.txt
Evaluating 16/1

In [6]:
# CSV written by the batch run inside `output_dir`; the same name is reused
# at the end of the cell to print the summary statistics.
batch_output_file = "batch_evaluation_results.csv"


def _print_samples(heading, column):
    """Print `heading`, then the first three (species_id, `column`) pairs when the column exists."""
    print(heading)
    if column in batch_results_df.columns:
        preview = batch_results_df[['species_id', column]].head(3)
        for _, sample in preview.iterrows():
            print(f"    {sample['species_id']}: {sample[column]}")


# Run the batch evaluation on `model_dir`; no num_models / max_entities limit
# is passed to the call.
batch_results_df = evaluate_models_in_folder(
    model_dir=model_dir,
    llm_model=llm_model,
    entity_type=entity_type,
    database=database,
    save_llm_results=True,
    output_dir=output_dir,
    output_file=batch_output_file,
    start_at=1,
)

if batch_results_df.empty:
    print("✗ Batch evaluation failed: No results generated")
else:
    print("✓ Batch evaluation successful")
    model_count = batch_results_df['model'].nunique()
    print(f"  - Evaluated {model_count} models")
    row_count = len(batch_results_df)
    print(f"  - Generated {row_count} total result rows")
    mean_accuracy = batch_results_df['accuracy'].mean()
    print(f"  - Average accuracy: {mean_accuracy:.1%}")

    # Formula-level metrics are reported only when present; the ChEBI-level
    # columns are read unconditionally.
    print("\n  Updated metrics summary:")
    if 'recall_formula' in batch_results_df.columns:
        formula_recall = batch_results_df['recall_formula'].mean()
        print(f"  - Average recall (formula): {formula_recall:.3f}")
        formula_precision = batch_results_df['precision_formula'].mean()
        print(f"  - Average precision (formula): {formula_precision:.3f}")
    chebi_recall = batch_results_df['recall_chebi'].mean()
    print(f"  - Average recall (ChEBI): {chebi_recall:.3f}")
    chebi_precision = batch_results_df['precision_chebi'].mean()
    print(f"  - Average precision (ChEBI): {chebi_precision:.3f}")

    # First three rows of the LLM synonyms and of the match scores.
    _print_samples("\n  Sample LLM results:", 'synonyms_LLM')
    _print_samples("\n  Sample match scores:", 'match_score')

    # Aggregate statistics computed from the CSV the batch run just wrote.
    print("\n  Summary statistics:")
    print_evaluation_results(os.path.join(output_dir, batch_output_file))
            

2025-05-25 16:31:32,475 - INFO - Evaluating 3 models starting from index 1
2025-05-25 16:31:32,476 - INFO - Evaluating 1/3: BIOMD0000000190.xml
2025-05-25 16:31:32,476 - INFO - Evaluating model: BIOMD0000000190.xml
2025-05-25 16:31:32,494 - INFO - Evaluating 10 entities in BIOMD0000000190.xml



2.1 Running batch evaluation
--------------------------------------------------


2025-05-25 16:31:33,726 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-25 16:31:47,641 - INFO - Saved intermediate results to: results/batch_evaluation_results.csv_1.csv
2025-05-25 16:31:47,642 - INFO - Evaluating 2/3: BIOMD0000000508.xml
2025-05-25 16:31:47,642 - INFO - Evaluating model: BIOMD0000000508.xml
2025-05-25 16:31:47,646 - INFO - Evaluating 5 entities in BIOMD0000000508.xml


LLM results saved to: results/llama-3.3-70b-instruct/BIOMD0000000190_llm_results.txt


2025-05-25 16:31:50,066 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-25 16:31:52,557 - INFO - Saved intermediate results to: results/batch_evaluation_results.csv_2.csv
2025-05-25 16:31:52,557 - INFO - Evaluating 3/3: BIOMD0000000634.xml
2025-05-25 16:31:52,557 - INFO - Evaluating model: BIOMD0000000634.xml
2025-05-25 16:31:52,574 - INFO - Evaluating 5 entities in BIOMD0000000634.xml


LLM results saved to: results/llama-3.3-70b-instruct/BIOMD0000000508_llm_results.txt


2025-05-25 16:31:54,555 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-25 16:32:03,496 - INFO - Saved intermediate results to: results/batch_evaluation_results.csv_3.csv
2025-05-25 16:32:03,498 - INFO - Saved final results to: results/batch_evaluation_results.csv


LLM results saved to: results/llama-3.3-70b-instruct/BIOMD0000000634_llm_results.txt
✓ Batch evaluation successful
  - Evaluated 3 models
  - Generated 20 total result rows
  - Average accuracy: 80.0%

  Updated metrics summary:
  - Average recall (formula): 0.800
  - Average precision (formula): 0.750
  - Average recall (ChEBI): 0.800
  - Average precision (ChEBI): 0.424

  Sample LLM results:
    SAM: ['S-adenosylmethionine', 'AdoMet', 'SAMe']
    A: ['S-adenosylmethioninamine', 'decarboxylated S-adenosylmethionine', 'dcAdoMet']
    P: ['putrescine', '1,4-diaminobutane', 'butane-1,4-diamine']

  Sample match scores:
    SAM: [1.0, 0.3333333333333333, 0.6666666666666666]
    A: [1.0]
    P: [1.0, 0.3333333333333333]

  Summary statistics:
Number of models assessed: 3
Number of models with predictions: 3
Average accuracy (per model): 0.77
Ave. total time (per model): 10.20
Ave. total time (per element, per model): 1.53
Ave. LLM time (per model): 9.76
Ave. LLM time (per element, per mod

## Test 3: Evaluating previous LLM synonyms

In [4]:
# Process the LLM responses previously saved in `response_folder`
# (presumably without querying the LLM again — confirm in utils.evaluation),
# using the "direct" search method and top_k = 3, and write the table to
# `output_file` inside `output_dir`.
# NOTE(review): the response folder is named "Llama-3.3-70B-instruct-Meta"
# while both CSV names contain "llama-4" — confirm which model produced these
# responses.
# NOTE(review): the paths are absolute and machine-specific.
results_df = process_saved_llm_responses(response_folder = '/Users/luna/Desktop/CRBM/AMAS_proj/AAAIM/tests/results/Llama-3.3-70B-instruct-Meta/chemical_prompt_adjusted', 
                               model_dir = '/Users/luna/Desktop/CRBM/AMAS_proj/Models/BioModels', 
                               prev_results_csv = 'results/biomd_chebi_rag_llama-4_top3_prompt_adjusted.csv', 
                               method = "direct",
                               top_k = 3,
                               output_dir = './results/', 
                               output_file = 'biomd_chebi_direct_llama-4_top3_prompt_adjusted.csv',
                               verbose = False)

Processing 1/340: BIOMD0000000291.txt
Processing 2/340: BIOMD0000000246.txt
Processing 3/340: BIOMD0000000508.txt
Processing 4/340: BIOMD0000000087.txt
Processing 5/340: BIOMD0000000736.txt
Processing 6/340: BIOMD0000000050.txt
Processing 7/340: BIOMD0000000044.txt
Processing 8/340: BIOMD0000000859.txt
Processing 9/340: BIOMD0000000642.txt
Processing 10/340: BIOMD0000000124.txt
Processing 11/340: BIOMD0000000497.txt
Processing 12/340: BIOMD0000000326.txt
Processing 13/340: BIOMD0000000468.txt
Processing 14/340: BIOMD0000000469.txt
Processing 15/340: BIOMD0000001005.txt
Processing 16/340: BIOMD0000000327.txt
Processing 17/340: BIOMD0000000496.txt
Processing 18/340: BIOMD0000000482.txt
Processing 19/340: BIOMD0000000119.txt
Processing 20/340: BIOMD0000000858.txt
Processing 21/340: BIOMD0000000045.txt
Processing 22/340: BIOMD0000000737.txt
Processing 23/340: BIOMD0000000051.txt
Processing 24/340: BIOMD0000000086.txt
Processing 25/340: BIOMD0000000509.txt
Processing 26/340: BIOMD0000000521

In [10]:
# Process the saved LLM responses from the `chemical` response folder with the
# "direct" search method. No top_k is passed, so the function's own default
# applies.
# NOTE: this rebinds `results_df`, replacing the table produced by the
# previous process_saved_llm_responses cell.
# NOTE(review): the paths are absolute and machine-specific.
results_df = process_saved_llm_responses(response_folder = '/Users/luna/Desktop/CRBM/AMAS_proj/AAAIM/tests/results/Llama-3.3-70B-instruct-Meta/chemical', 
                               model_dir = '/Users/luna/Desktop/CRBM/AMAS_proj/Models/BioModels', 
                               prev_results_csv = 'results/biomd_chebi_rag_meta-llama_default.csv', 
                               method = "direct",
                               output_dir = './results/', 
                               output_file = 'biomd_chebi_direct_meta-llama_nosymbols.csv',
                               verbose = False)

Processing 1/340: BIOMD0000000291.txt
Processing 2/340: BIOMD0000000246.txt
Processing 3/340: BIOMD0000000508.txt
Processing 4/340: BIOMD0000000087.txt
Processing 5/340: BIOMD0000000736.txt
Processing 6/340: BIOMD0000000050.txt
Processing 7/340: BIOMD0000000044.txt
Processing 8/340: BIOMD0000000859.txt
Processing 9/340: BIOMD0000000642.txt
Processing 10/340: BIOMD0000000124.txt
Processing 11/340: BIOMD0000000497.txt
Processing 12/340: BIOMD0000000326.txt
Processing 13/340: BIOMD0000000468.txt
Processing 14/340: BIOMD0000000469.txt
Processing 15/340: BIOMD0000001005.txt
Processing 16/340: BIOMD0000000327.txt
Processing 17/340: BIOMD0000000496.txt
Processing 18/340: BIOMD0000000482.txt
Processing 19/340: BIOMD0000000119.txt
Processing 20/340: BIOMD0000000858.txt
Processing 21/340: BIOMD0000000045.txt
Processing 22/340: BIOMD0000000737.txt
Processing 23/340: BIOMD0000000051.txt
Processing 24/340: BIOMD0000000086.txt
Processing 25/340: BIOMD0000000509.txt
Processing 26/340: BIOMD0000000521

# Statistics

In [24]:
# Print the aggregate evaluation statistics for a saved results CSV
# (path is relative to the kernel's working directory).
print_evaluation_results("results/biomd_chebi_rag_llama-3_top10_prompt_adjusted.csv")

Filtered results to 4922 entries that exist in reference: /Users/luna/Desktop/CRBM/AMAS_proj/Results/biomd_species_accuracy_AMAS.csv
Number of models assessed: 311
Number of models with predictions: 310
Number of annotations evaluated: 4922
Average accuracy (per model): 0.90
Ave. recall (formula): 0.90
Ave. precision (formula): 0.22
Ave. recall (exact): 0.84
Ave. precision (exact): 0.09
Average accuracy (per species): 0.70
Ave. recall (formula, per species): 0.70
Ave. precision (formula, per species): 0.19
Ave. recall (exact, per species): 0.64
Ave. precision (exact, per species): 0.07
Ave. total time (per model): 4.28
Ave. total time (per element, per model): 0.27
Ave. LLM time (per model): 3.89
Ave. LLM time (per element, per model): 0.25
Average number of predictions per species: 9.47


In [None]:
# Print the aggregate evaluation statistics for a results CSV in `output_dir`.
# NOTE(review): this cell has no execution count, so the output recorded
# beneath it may be stale — re-run to confirm.
print_evaluation_results(os.path.join(output_dir, "biomd_chebi_direct_llama_plain_nosymbols.csv"))

Number of models assessed: 315
Number of models with predictions: 309
Average accuracy (per model): 0.82
Ave. total time (per model): 17.51
Ave. total time (per element, per model): 0.57
Ave. recall (formula): 0.82
Ave. precision (formula): 0.74
Ave. recall (exact): 0.76
Ave. precision (exact): 0.35
Ave. LLM time (per model): 16.42
Ave. LLM time (per element, per model): 0.54
Average number of predictions per species: 1.14


In [11]:
# Print the aggregate evaluation statistics for the meta-llama "nosymbols" run.
# NOTE(review): the following cell reads the same file name under "results/"
# but its recorded output differs from this one (different model counts), so
# at least one of the two outputs is stale — re-run both.
print_evaluation_results(os.path.join(output_dir, "biomd_chebi_direct_meta-llama_nosymbols.csv"))

Number of models assessed: 340
Number of models with predictions: 324
Average accuracy (per model): 0.70
Ave. total time (per model): 5.60
Ave. total time (per element, per model): 0.18
Ave. recall (formula): 0.69
Ave. precision (formula): 0.61
Ave. recall (exact): 0.00
Ave. precision (exact): 0.00
Ave. LLM time (per model): 4.27
Ave. LLM time (per element, per model): 0.13
Average number of predictions per species: 1.26


In [4]:
# Print the aggregate evaluation statistics for the meta-llama "nosymbols" run,
# this time with the path spelled out instead of joined from `output_dir`.
# NOTE(review): the preceding cell reads the same file name but shows a
# different recorded output — at least one of the two is stale; re-run both.
print_evaluation_results("results/biomd_chebi_direct_meta-llama_nosymbols.csv")

Filtered results to 9650 entries that exist in reference: /Users/luna/Desktop/CRBM/AMAS_proj/Results/biomd_species_accuracy_AMAS.csv
Number of models assessed: 315
Number of models with predictions: 309
Number of annotations evaluated: 9650
Average accuracy (per model): 0.82
Ave. recall (formula): 0.82
Ave. precision (formula): 0.71
Ave. recall (exact): 0.00
Ave. precision (exact): 0.00
Average accuracy (per species): 0.33
Ave. recall (formula, per species): 0.33
Ave. precision (formula, per species): 0.29
Ave. recall (exact, per species): 0.00
Ave. precision (exact, per species): 0.00
Ave. total time (per model): 5.88
Ave. total time (per element, per model): 0.19
Ave. LLM time (per model): 4.47
Ave. LLM time (per element, per model): 0.15
Average number of predictions per species: 1.36


In [44]:
# Print the aggregate evaluation statistics for the CSV written by the
# process_saved_llm_responses run that used top_k = 3.
print_evaluation_results("results/biomd_chebi_direct_llama-4_top3_prompt_adjusted.csv")

Filtered results to 3620 entries that exist in reference: /Users/luna/Desktop/CRBM/AMAS_proj/Results/biomd_species_accuracy_AMAS.csv
Number of models assessed: 313
Number of models with predictions: 307
Number of annotations evaluated: 3620
Average accuracy (per model): 0.84
Ave. recall (formula): 0.84
Ave. precision (formula): 0.77
Ave. recall (exact): 0.75
Ave. precision (exact): 0.37
Average accuracy (per species): 0.82
Ave. recall (formula, per species): 0.82
Ave. precision (formula, per species): 0.75
Ave. recall (exact, per species): 0.67
Ave. precision (exact, per species): 0.33
Ave. total time (per model): 10.90
Ave. total time (per element, per model): 0.94
Ave. LLM time (per model): 9.94
Ave. LLM time (per element, per model): 0.86
Average number of predictions per species: 2.05


# Test functions

In [1]:
import libsbml
# SBML file inspected by the ad-hoc checks in this section.
model_file = "/Users/luna/Desktop/CRBM/AMAS_proj/Models/BioModels/BIOMD0000000071.xml"
# Parse the file into an SBMLDocument and pull out its Model object.
reader = libsbml.SBMLReader()
document = reader.readSBML(model_file)
model = document.getModel()
# getModel() yields None when the document holds no model (for example when
# the file is missing or unreadable). Fail here with the offending path rather
# than with an AttributeError on None in a later cell.
if model is None:
    raise ValueError(f"Could not load an SBML model from: {model_file}")

In [3]:
# Look up the species with id "Nb" in the loaded SBML model and show its name.
model.getSpecies("Nb").getName()

'3-PGA 2-PGA PEP'

In [42]:
# NOTE: this rebinds `model_file` (set to BIOMD0000000071 in an earlier cell)
# to BIOMD0000000113; any later cell that reuses `model_file` sees this path.
model_file="/Users/luna/Desktop/CRBM/AMAS_proj/Models/BioModels/BIOMD0000000113.xml"
from utils.evaluation import find_species_with_formulas
# extract species that have chebi annotation
# The second argument lists the qualifiers to consider. The recorded output is
# a dict keyed by species id whose values are lists of id strings.
existing_annotations = find_species_with_formulas(model_file, ['is','isVersionOf','hasPart','hasProperty'])
existing_annotations

{'Z': ['29108'], 'Y': ['29108']}

In [43]:
# get the prompt for this model
# Build the prompt text for `model_file`, restricted to the species ids found
# in `existing_annotations`, and print it for inspection.
from core.model_info import format_prompt
prompt = format_prompt(model_file, existing_annotations.keys())
print(prompt)

Now annotate these:
Species to annotate: Z, Y
Model: "Dupont1992_Ca_dpt_protein_phospho"
// Display Names:
{'Z': '', 'Y': ''}
// Reactions:
R0: => Z
R1: => Z
R2: Z => Y
R3: Y => Z
Rf: Y => Z
R_eff: Z => 
// Notes:
Model reproduces Fig 4 of the paper. For fraction of phosphorylated protein, W_star, the model reproduces panel b in the same figure. Model successfully tested on MathSBML and Jarnac.
for more information.

Return up to 3 standardized names or common synonyms for each species, ranked by likelihood.
Use the below format, do not include any other text except the synonyms, and give short reasons for all species after 'Reason:' by the end.

SpeciesA: "name1", "name2", …
SpeciesB:  …
Reason: …
        


In [7]:
# Test with a single model
# Bind the model path explicitly. The recorded output of this cell is for
# BIOMD0000000071, but an earlier cell in this section rebinds `model_file`
# to BIOMD0000000113, so on a fresh top-to-bottom run this cell would
# otherwise annotate the wrong model and the 'Nb' lookup that follows would
# not match the recorded output.
model_file = "/Users/luna/Desktop/CRBM/AMAS_proj/Models/BioModels/BIOMD0000000071.xml"
# No `max_entities` is passed here, unlike the earlier annotate_model call.
recommendations_df, metrics = annotate_model(
    model_file=model_file,
    llm_model=llm_model,
    entity_type=entity_type,
    database=database
)
# Display the full recommendation table.
recommendations_df

INFO Starting annotation for model: /Users/luna/Desktop/CRBM/AMAS_proj/Models/BioModels/BIOMD0000000071.xml
INFO Using LLM model: Llama-3.3-70B-Instruct
INFO Using method: direct for database search
INFO Entity type: chemical, Database: chebi
INFO >>>Step 1: Getting species from model...<<<
INFO Found 27 species in model
INFO Found 27 entities with existing annotations
INFO Annotate all 27 entities
INFO >>>Step 2: Extracting model context...<<<
INFO Extracted context for model: Bakker2001_Glycolysis
INFO >>>Step 3: Querying LLM (Llama-3.3-70B-Instruct)...<<<
INFO HTTP Request: POST https://api.llama.com/compat/v1/chat/completions "HTTP/1.1 200 OK"
INFO LLM response received in 5.13s
INFO Parsed synonyms for 26 entities
INFO >>>Step 4: Searching chebi database...<<<
INFO Database search completed in 2.38s
INFO >>>Step 5: Generating recommendation table...<<<
INFO Annotation completed in 7.68s
INFO Generated 81 recommendations


Unnamed: 0,file,type,id,display_name,annotation,annotation_label,match_score,existing,update_annotation
0,BIOMD0000000071.xml,chemical,GlcI,Glucose,CHEBI:4167,D-glucopyranose,0.666667,0,add
1,BIOMD0000000071.xml,chemical,GlcI,Glucose,CHEBI:42758,aldehydo-D-glucose,0.666667,0,ignore
2,BIOMD0000000071.xml,chemical,GlcI,Glucose,CHEBI:17234,glucose,0.333333,1,keep
3,BIOMD0000000071.xml,chemical,GlcI,Glucose,CHEBI:17634,D-glucose,0.333333,0,ignore
4,BIOMD0000000071.xml,chemical,Pg,Phosphates in Glycosome,CHEBI:18367,phosphate(3-),0.666667,0,add
...,...,...,...,...,...,...,...,...,...
76,BIOMD0000000071.xml,chemical,PGAg,3-PGA g.,CHEBI:17050,3-phosphoglyceric acid,0.666667,0,ignore
77,BIOMD0000000071.xml,chemical,PGAg,3-PGA g.,CHEBI:57998,3-phosphoglycerate(3-),0.333333,0,ignore
78,BIOMD0000000071.xml,chemical,PEPc,PEP c.,CHEBI:18021,phosphoenolpyruvate,0.666667,1,keep
79,BIOMD0000000071.xml,chemical,PEPc,PEP c.,CHEBI:44897,phosphoenolpyruvic acid,1.000000,0,ignore


In [11]:
# Show only the recommendation rows for the species with id 'Nb'.
recommendations_df[recommendations_df['id']=='Nb']

Unnamed: 0,file,type,id,display_name,annotation,annotation_label,match_score,existing,update_annotation
38,BIOMD0000000071.xml,chemical,Nb,3-PGA 2-PGA PEP,CHEBI:132960,3-phosphoglycerate,0.333333,0,ignore
39,BIOMD0000000071.xml,chemical,Nb,3-PGA 2-PGA PEP,CHEBI:17794,3-phospho-D-glyceric acid,0.333333,1,keep
40,BIOMD0000000071.xml,chemical,Nb,3-PGA 2-PGA PEP,CHEBI:17050,3-phosphoglyceric acid,0.666667,0,ignore
41,BIOMD0000000071.xml,chemical,Nb,3-PGA 2-PGA PEP,CHEBI:57998,3-phosphoglycerate(3-),0.333333,0,ignore
