# AAAIM Evaluation Test - ChEBI

This notebook tests both single model evaluation and batch evaluation of multiple models.

In [9]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging

# Put the project root (the parent of the current working directory) at the
# front of the import path so the project-local `core` and `utils` packages
# can be imported below.
# The guard keeps this cell idempotent: re-running it no longer stacks a
# duplicate entry at the head of sys.path on every execution.
project_root = Path().absolute().parent
if sys.path[:1] != [str(project_root)]:
    sys.path.insert(0, str(project_root))

# Import AAAIM functions
from core import annotate_model, curate_model
from utils.evaluation import (
    evaluate_single_model,
    evaluate_models_in_folder,
    print_evaluation_results,
    compare_results,
    process_saved_llm_responses
)

# Logging setup: INFO and above, each record rendered as
# "<timestamp> - <level name> - <message>"; `logger` is this notebook's logger.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

## Configuration

Set up paths and parameters for evaluation.

In [7]:
# --- Which LLM to query -----------------------------------------------------
# Alternatives that have been used here:
#   "meta-llama/llama-3.3-70b-instruct:free", "gpt-4o-mini"
llm_model = "Llama-3.3-70B-Instruct"

# --- What to annotate and against which database ----------------------------
entity_type, database = "chemical", "chebi"

# --- How much work to do while testing --------------------------------------
# Cap on entities per model, and number of models for the batch evaluation.
max_entities_per_model, num_models_to_test = 10, 5

# --- Where results are written ----------------------------------------------
output_dir = "./results/"

## Annotating a new model with no or few existing annotations

In [4]:
test_model_file = "190_few_anno.xml"
# Report whether the test model file is present before trying to annotate it
model_is_present = os.path.exists(test_model_file)
marker, verdict = ("✓", "found") if model_is_present else ("✗", "not found")
print(f"{marker} Test model {verdict}: {test_model_file}")

✓ Test model found: 190_few_anno.xml


In [5]:
# Annotate a single model: collect the keyword arguments first, then unpack
# the (recommendations table, metrics) pair that annotate_model returns.
annotate_kwargs = dict(
    model_file=test_model_file,
    llm_model=llm_model,
    max_entities=max_entities_per_model,
    entity_type=entity_type,
    database=database,
)
recommendations_df, metrics = annotate_model(**annotate_kwargs)

2025-05-27 16:36:58,372 - INFO - Starting annotation for model: 190_few_anno.xml
2025-05-27 16:36:58,373 - INFO - Using LLM model: meta-llama/llama-3.3-70b-instruct:free
2025-05-27 16:36:58,373 - INFO - Entity type: chemical, Database: chebi
2025-05-27 16:36:58,374 - INFO - Step 1: Getting all species from model...
2025-05-27 16:36:58,381 - INFO - Found 11 species in model
2025-05-27 16:36:58,386 - INFO - Found 2 entities with existing annotations
2025-05-27 16:36:58,386 - INFO - Selected 10 entities for annotation
2025-05-27 16:36:58,386 - INFO - Step 4: Extracting model context...
2025-05-27 16:36:58,430 - INFO - Extracted context for model: Model_1
2025-05-27 16:36:58,430 - INFO - Step 5: Formatting LLM prompt...
2025-05-27 16:36:58,451 - INFO - Step 6: Querying LLM (meta-llama/llama-3.3-70b-instruct:free)...
2025-05-27 16:36:59,944 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-27 16:37:12,594 - INFO - LLM response received in 14

In [6]:
# Display the recommendations table assigned by the annotate_model cell
recommendations_df

Unnamed: 0,file,type,id,display_name,annotation,annotation_label,match_score,existing,update_annotation
0,190_few_anno.xml,chemical,SAM,S-adenosyl-L-methionine,CHEBI:15414,S-adenosyl-L-methionine,1.0,1,keep
1,190_few_anno.xml,chemical,SAM,S-adenosyl-L-methionine,CHEBI:33442,(S)-S-adenosyl-L-methionine,0.333333,0,ignore
2,190_few_anno.xml,chemical,SAM,S-adenosyl-L-methionine,CHEBI:67040,S-adenosyl-L-methioninate,0.666667,0,ignore
3,190_few_anno.xml,chemical,A,S-adenosylmethioninamine,CHEBI:15625,S-adenosylmethioninamine,0.666667,1,keep
4,190_few_anno.xml,chemical,P,Putrescine,CHEBI:17148,putrescine,1.0,0,add
5,190_few_anno.xml,chemical,P,Putrescine,CHEBI:326268,"1,4-butanediammonium",0.333333,0,ignore
6,190_few_anno.xml,chemical,S,Spermine,CHEBI:15746,spermine,0.333333,0,ignore
7,190_few_anno.xml,chemical,S,Spermine,CHEBI:45725,spermine(4+),0.333333,0,ignore
8,190_few_anno.xml,chemical,S,Spermine,CHEBI:45718,"3,3,3-tetramine",0.333333,0,ignore
9,190_few_anno.xml,chemical,D,Spermidine,CHEBI:16610,spermidine,0.333333,0,ignore


In [7]:
# Display the metrics returned alongside the recommendations by annotate_model
metrics

{'total_entities': 11,
 'entities_with_predictions': 10,
 'annotation_rate': 0.9090909090909091,
 'total_predictions': 29,
 'matches': 2,
 'accuracy': 1.0,
 'total_time': 15.440874338150024,
 'llm_time': 14.142799139022827,
 'search_time': 1.213942050933838}

## Curate a model with existing annotations

Evaluate a single model that already has annotations.
Only the species with existing annotations are considered.

In [7]:
test_model_file = "test_models/BIOMD0000000190.xml"
# Pick the status line that matches whether the test model file exists
status_line = {
    True: f"✓ Test model found: {test_model_file}",
    False: f"✗ Test model not found: {test_model_file}",
}[os.path.exists(test_model_file)]
print(status_line)

✓ Test model found: test_models/BIOMD0000000190.xml


In [10]:
# Curate a single model, then split the returned pair into the
# recommendations table and the metrics.
curation_output = curate_model(
    model_file=test_model_file,
    llm_model=llm_model,
    max_entities=max_entities_per_model,
    entity_type=entity_type,
    database=database,
)
recommendations_df, metrics = curation_output

2025-05-27 13:48:30,140 - INFO - Starting curation for model: test_models/BIOMD0000000190.xml
2025-05-27 13:48:30,141 - INFO - Using LLM model: meta-llama/llama-3.3-70b-instruct:free
2025-05-27 13:48:30,141 - INFO - Entity type: chemical, Database: chebi
2025-05-27 13:48:30,142 - INFO - Step 1: Finding existing annotations...
2025-05-27 13:48:30,154 - INFO - Found 11 entities with existing annotations
2025-05-27 13:48:30,155 - INFO - Selected 10 entities for evaluation
2025-05-27 13:48:30,155 - INFO - Step 3: Extracting model context...
2025-05-27 13:48:30,205 - INFO - Extracted context for model: Model_1
2025-05-27 13:48:30,205 - INFO - Step 4: Formatting LLM prompt...
2025-05-27 13:48:30,240 - INFO - Step 5: Querying LLM (meta-llama/llama-3.3-70b-instruct:free)...
2025-05-27 13:48:31,393 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-27 13:48:44,868 - INFO - LLM response received in 14.63s
2025-05-27 13:48:44,869 - INFO - Step 6: P

In [11]:
# Display the metrics assigned by the curate_model cell
metrics

{'total_entities': 11,
 'entities_with_predictions': 10,
 'annotation_rate': 0.9090909090909091,
 'total_predictions': 28,
 'matches': 9,
 'accuracy': 0.9,
 'total_time': 15.448332786560059,
 'llm_time': 14.626698017120361,
 'search_time': 0.7132561206817627}

In [12]:
# Display the recommendations table assigned by the curate_model cell
recommendations_df

Unnamed: 0,file,type,id,display_name,annotation,annotation_label,match_score,existing,update_annotation
0,BIOMD0000000190.xml,chemical,SAM,S-adenosyl-L-methionine,CHEBI:15414,S-adenosyl-L-methionine,1.0,1,keep
1,BIOMD0000000190.xml,chemical,SAM,S-adenosyl-L-methionine,CHEBI:33442,(S)-S-adenosyl-L-methionine,0.333333,0,ignore
2,BIOMD0000000190.xml,chemical,SAM,S-adenosyl-L-methionine,CHEBI:67040,S-adenosyl-L-methioninate,0.666667,0,ignore
3,BIOMD0000000190.xml,chemical,A,S-adenosylmethioninamine,CHEBI:15625,S-adenosylmethioninamine,1.0,1,keep
4,BIOMD0000000190.xml,chemical,P,Putrescine,CHEBI:17148,putrescine,1.0,1,keep
5,BIOMD0000000190.xml,chemical,P,Putrescine,CHEBI:326268,"1,4-butanediammonium",0.333333,0,ignore
6,BIOMD0000000190.xml,chemical,S,Spermine,CHEBI:15746,spermine,0.333333,1,keep
7,BIOMD0000000190.xml,chemical,S,Spermine,CHEBI:45725,spermine(4+),0.333333,0,ignore
8,BIOMD0000000190.xml,chemical,D,Spermidine,CHEBI:16610,spermidine,0.666667,1,keep
9,BIOMD0000000190.xml,chemical,D,Spermidine,CHEBI:57834,spermidine(3+),0.333333,0,ignore


## Test 1: Single Model Evaluation

Evaluation of a single model with existing annotations.

In [11]:
# Evaluate one model through utils.evaluation, saving the LLM results under
# `output_dir`. The settings are gathered in a dict and expanded into the call.
single_model_settings = {
    "model_file": test_model_file,
    "llm_model": llm_model,
    "max_entities": max_entities_per_model,
    "entity_type": entity_type,
    "database": database,
    "save_llm_results": True,
    "output_dir": output_dir,
}
result_df = evaluate_single_model(**single_model_settings)

2025-05-27 12:36:28,329 - INFO - Evaluating model: BIOMD0000000190.xml
2025-05-27 12:36:28,341 - INFO - Evaluating 10 entities in BIOMD0000000190.xml
2025-05-27 12:36:30,121 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


LLM results saved to: results/llama-3.3-70b-instruct/BIOMD0000000190_llm_results.txt


In [12]:
# Display the per-species result table assigned by the evaluate_single_model cell
result_df

Unnamed: 0,model,species_id,display_name,synonyms_LLM,reason,exist_annotation_chebi,exist_annotation_name,predictions,predictions_names,match_score,recall_formula,precision_formula,recall_chebi,precision_chebi,accuracy,total_time,llm_time,query_time
0,BIOMD0000000190.xml,SAM,S-adenosylmethionine,"[S-adenosylmethionine, AdoMet, SAMe]",The species were identified based on their dis...,[CHEBI:15414],S-adenosyl-L-methionine,"[CHEBI:15414, CHEBI:33442, CHEBI:67040]","[S-adenosyl-L-methionine, (S)-S-adenosyl-L-met...","[1.0, 0.3333333333333333, 0.6666666666666666]",1.0,1.0,1.0,0.333,1,18.389899,17.696673,0.693226
1,BIOMD0000000190.xml,A,decarboxylated S-adenosylmethionine,"[decarboxylated S-adenosylmethionine, S-adenos...",The species were identified based on their dis...,[CHEBI:15625],S-adenosylmethioninamine,[CHEBI:15625],[S-adenosylmethioninamine],[1.0],1.0,1.0,1.0,1.0,1,18.389899,17.696673,0.693226
2,BIOMD0000000190.xml,P,putrescine,"[putrescine, 1,4-diaminobutane, butane-1,4-dia...",The species were identified based on their dis...,[CHEBI:17148],putrescine,"[CHEBI:17148, CHEBI:326268]","[putrescine, 1,4-butanediammonium]","[1.0, 0.3333333333333333]",1.0,1.0,1.0,0.5,1,18.389899,17.696673,0.693226
3,BIOMD0000000190.xml,S,spermine,"[spermine, N,N'-bis(3-aminopropyl)propane-1,3-...",The species were identified based on their dis...,[CHEBI:15746],spermine,"[CHEBI:15746, CHEBI:45725, CHEBI:45718]","[spermine, spermine(4+), 3,3,3-tetramine]","[0.3333333333333333, 0.3333333333333333, 0.333...",1.0,0.5,1.0,0.333,1,18.389899,17.696673,0.693226
4,BIOMD0000000190.xml,D,spermidine,"[spermidine, N-(3-aminopropyl)propane-1,3-diam...",The species were identified based on their dis...,[CHEBI:16610],spermidine,"[CHEBI:16610, CHEBI:57834, CHEBI:16841]","[spermidine, spermidine(3+), bis(3-aminopropyl...","[0.3333333333333333, 0.3333333333333333, 0.333...",1.0,0.5,1.0,0.333,1,18.389899,17.696673,0.693226
5,BIOMD0000000190.xml,aS,N1-acetylspermine,"[N1-acetylspermine, N-acetylspermine, AcSpm]",The species were identified based on their dis...,[CHEBI:17312],N(1)-acetylspermine,"[CHEBI:17312, CHEBI:58101]","[N(1)-acetylspermine, N(1)-acetylsperminium(3+)]","[0.3333333333333333, 0.3333333333333333]",1.0,1.0,1.0,0.5,1,18.389899,17.696673,0.693226
6,BIOMD0000000190.xml,aD,N1-acetylspermidine,"[N1-acetylspermidine, N-acetylspermidine, AcSpd]",The species were identified based on their dis...,[CHEBI:22204],acetylspermidine,"[CHEBI:17927, CHEBI:58324]","[N(1)-acetylspermidine, N(1)-acetylspermidiniu...","[0.3333333333333333, 0.3333333333333333]",1.0,1.0,0.0,0.0,1,18.389899,17.696673,0.693226
7,BIOMD0000000190.xml,Met,methionine,"[methionine, L-methionine, 2-amino-4-(methylth...",The species were identified based on their dis...,[CHEBI:16643],L-methionine,"[CHEBI:16643, CHEBI:16811, CHEBI:64558, CHEBI:...","[L-methionine, methionine, methionine zwitteri...","[0.6666666666666666, 0.6666666666666666, 0.333...",1.0,0.5,1.0,0.143,1,18.389899,17.696673,0.693226
8,BIOMD0000000190.xml,ORN,ornithine,"[ornithine, L-ornithine, 2,5-diaminopentanoic ...",The species were identified based on their dis...,[CHEBI:15729],L-ornithine,"[CHEBI:18257, CHEBI:44667, CHEBI:15729, CHEBI:...","[ornithine, L-ornithinium(2+), L-ornithine, L-...","[0.6666666666666666, 0.3333333333333333, 0.333...",1.0,1.0,1.0,0.25,1,18.389899,17.696673,0.693226
9,BIOMD0000000190.xml,AcCoA,acetyl-CoA,"[acetyl-CoA, acetyl coenzyme A, CoA-SH]",The species were identified based on their dis...,[CHEBI:15351],acetyl-CoA,"[CHEBI:15351, CHEBI:57288, CHEBI:15346]","[acetyl-CoA, acetyl-CoA(4-), coenzyme A]","[0.6666666666666666, 0.3333333333333333, 0.333...",1.0,0.5,1.0,0.333,1,18.389899,17.696673,0.693226


## Test 2: Batch Model Evaluation

Test the evaluation of multiple models in a directory.

In [13]:
# Directory holding the SBML models for the batch evaluation. Point this at a
# larger local BioModels checkout for a full run.
model_dir = "test_models"

# List the XML models in the directory. `model_files` is always defined (empty
# when the directory is missing) so later cells can rely on it, and a missing
# directory is reported instead of passing silently.
if os.path.isdir(model_dir):
    # Sorted so that "first N" refers to a stable, reproducible selection.
    model_files = sorted(f for f in os.listdir(model_dir) if f.endswith('.xml'))
    print(f"✓ Model directory found: {model_dir}")
    print(f"  - Found {len(model_files)} XML files")
    print(f"  - Will test first {min(num_models_to_test, len(model_files))} models")
else:
    model_files = []
    print(f"✗ Model directory not found: {model_dir}")

✓ Model directory found: test_models
  - Found 3 XML files
  - Will test first 3 models


In [6]:
# Run the batch evaluation over the models in `model_dir` and summarise it.
# The results file name is defined once so that the summary printed at the end
# reads the same CSV that the evaluation was asked to write.
batch_output_file = "batch_evaluation_results.csv"

batch_results_df = evaluate_models_in_folder(
    model_dir=model_dir,
    # num_models=min(num_models_to_test, len(model_files)),  # currently disabled
    llm_model=llm_model,
    # max_entities=max_entities_per_model,  # currently disabled
    entity_type=entity_type,
    database=database,
    save_llm_results=True,
    output_dir=output_dir,
    output_file=batch_output_file,
    start_at=1
)

if not batch_results_df.empty:
    print("✓ Batch evaluation successful")
    print(f"  - Evaluated {batch_results_df['model'].nunique()} models")
    print(f"  - Generated {len(batch_results_df)} total result rows")
    print(f"  - Average accuracy: {batch_results_df['accuracy'].mean():.1%}")

    # Averages of the recall/precision columns; the formula-based columns are
    # only reported when they are present in the results.
    print("\n  Updated metrics summary:")
    if 'recall_formula' in batch_results_df.columns:
        print(f"  - Average recall (formula): {batch_results_df['recall_formula'].mean():.3f}")
        print(f"  - Average precision (formula): {batch_results_df['precision_formula'].mean():.3f}")
    print(f"  - Average recall (ChEBI): {batch_results_df['recall_chebi'].mean():.3f}")
    print(f"  - Average precision (ChEBI): {batch_results_df['precision_chebi'].mean():.3f}")

    # The first three result rows are used for both samples below.
    sample_rows = batch_results_df.head(3)

    # Sample of the synonyms proposed by the LLM
    print("\n  Sample LLM results:")
    if 'synonyms_LLM' in batch_results_df.columns:
        for species_id, synonyms in zip(sample_rows['species_id'], sample_rows['synonyms_LLM']):
            print(f"    {species_id}: {synonyms}")

    # Sample of the match scores (shown instead of predictions_hits)
    print("\n  Sample match scores:")
    if 'match_score' in batch_results_df.columns:
        for species_id, scores in zip(sample_rows['species_id'], sample_rows['match_score']):
            print(f"    {species_id}: {scores}")

    # Summary statistics computed from the CSV written by the run above
    print("\n  Summary statistics:")
    print_evaluation_results(os.path.join(output_dir, batch_output_file))

else:
    print("✗ Batch evaluation failed: No results generated")
            

2025-05-25 16:31:32,475 - INFO - Evaluating 3 models starting from index 1
2025-05-25 16:31:32,476 - INFO - Evaluating 1/3: BIOMD0000000190.xml
2025-05-25 16:31:32,476 - INFO - Evaluating model: BIOMD0000000190.xml
2025-05-25 16:31:32,494 - INFO - Evaluating 10 entities in BIOMD0000000190.xml



2.1 Running batch evaluation
--------------------------------------------------


2025-05-25 16:31:33,726 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-25 16:31:47,641 - INFO - Saved intermediate results to: results/batch_evaluation_results.csv_1.csv
2025-05-25 16:31:47,642 - INFO - Evaluating 2/3: BIOMD0000000508.xml
2025-05-25 16:31:47,642 - INFO - Evaluating model: BIOMD0000000508.xml
2025-05-25 16:31:47,646 - INFO - Evaluating 5 entities in BIOMD0000000508.xml


LLM results saved to: results/llama-3.3-70b-instruct/BIOMD0000000190_llm_results.txt


2025-05-25 16:31:50,066 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-25 16:31:52,557 - INFO - Saved intermediate results to: results/batch_evaluation_results.csv_2.csv
2025-05-25 16:31:52,557 - INFO - Evaluating 3/3: BIOMD0000000634.xml
2025-05-25 16:31:52,557 - INFO - Evaluating model: BIOMD0000000634.xml
2025-05-25 16:31:52,574 - INFO - Evaluating 5 entities in BIOMD0000000634.xml


LLM results saved to: results/llama-3.3-70b-instruct/BIOMD0000000508_llm_results.txt


2025-05-25 16:31:54,555 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-25 16:32:03,496 - INFO - Saved intermediate results to: results/batch_evaluation_results.csv_3.csv
2025-05-25 16:32:03,498 - INFO - Saved final results to: results/batch_evaluation_results.csv


LLM results saved to: results/llama-3.3-70b-instruct/BIOMD0000000634_llm_results.txt
✓ Batch evaluation successful
  - Evaluated 3 models
  - Generated 20 total result rows
  - Average accuracy: 80.0%

  Updated metrics summary:
  - Average recall (formula): 0.800
  - Average precision (formula): 0.750
  - Average recall (ChEBI): 0.800
  - Average precision (ChEBI): 0.424

  Sample LLM results:
    SAM: ['S-adenosylmethionine', 'AdoMet', 'SAMe']
    A: ['S-adenosylmethioninamine', 'decarboxylated S-adenosylmethionine', 'dcAdoMet']
    P: ['putrescine', '1,4-diaminobutane', 'butane-1,4-diamine']

  Sample match scores:
    SAM: [1.0, 0.3333333333333333, 0.6666666666666666]
    A: [1.0]
    P: [1.0, 0.3333333333333333]

  Summary statistics:
Number of models assessed: 3
Number of models with predictions: 3
Average accuracy (per model): 0.77
Ave. total time (per model): 10.20
Ave. total time (per element, per model): 1.53
Ave. LLM time (per model): 9.76
Ave. LLM time (per element, per mod

## Test 3: Evaluating previous LLM synonyms

In [10]:
# Re-evaluate previously saved LLM responses.
# The two input locations default to the original author's local paths; set
# AAAIM_LLM_RESPONSE_DIR / AAAIM_BIOMODELS_DIR in the environment to run this
# cell on another machine without editing the notebook.
saved_response_folder = os.environ.get(
    "AAAIM_LLM_RESPONSE_DIR",
    '/Users/luna/Desktop/CRBM/AMAS_proj/AAAIM/tests/results/Llama-3.3-70B-instruct-Meta/chemical',
)
biomodels_dir = os.environ.get(
    "AAAIM_BIOMODELS_DIR",
    '/Users/luna/Desktop/CRBM/AMAS_proj/Models/BioModels',
)

results_df = process_saved_llm_responses(
    response_folder=saved_response_folder,
    model_dir=biomodels_dir,
    prev_results_csv='results/biomd_chebi_rag_meta-llama_default.csv',
    method="direct",
    output_dir=output_dir,  # same "./results/" directory set in the configuration cell
    output_file='biomd_chebi_direct_meta-llama_nosymbols.csv',
    verbose=False,
)

Processing 1/340: BIOMD0000000291.txt
Processing 2/340: BIOMD0000000246.txt
Processing 3/340: BIOMD0000000508.txt
Processing 4/340: BIOMD0000000087.txt
Processing 5/340: BIOMD0000000736.txt
Processing 6/340: BIOMD0000000050.txt
Processing 7/340: BIOMD0000000044.txt
Processing 8/340: BIOMD0000000859.txt
Processing 9/340: BIOMD0000000642.txt
Processing 10/340: BIOMD0000000124.txt
Processing 11/340: BIOMD0000000497.txt
Processing 12/340: BIOMD0000000326.txt
Processing 13/340: BIOMD0000000468.txt
Processing 14/340: BIOMD0000000469.txt
Processing 15/340: BIOMD0000001005.txt
Processing 16/340: BIOMD0000000327.txt
Processing 17/340: BIOMD0000000496.txt
Processing 18/340: BIOMD0000000482.txt
Processing 19/340: BIOMD0000000119.txt
Processing 20/340: BIOMD0000000858.txt
Processing 21/340: BIOMD0000000045.txt
Processing 22/340: BIOMD0000000737.txt
Processing 23/340: BIOMD0000000051.txt
Processing 24/340: BIOMD0000000086.txt
Processing 25/340: BIOMD0000000509.txt
Processing 26/340: BIOMD0000000521

# Statistics

In [None]:
# Summary statistics for the "llama_plain" direct-method results CSV
llama_plain_results_csv = os.path.join(output_dir, "biomd_chebi_direct_llama_plain_nosymbols.csv")
print_evaluation_results(llama_plain_results_csv)

Number of models assessed: 315
Number of models with predictions: 309
Average accuracy (per model): 0.82
Ave. total time (per model): 17.51
Ave. total time (per element, per model): 0.57
Ave. recall (formula): 0.82
Ave. precision (formula): 0.74
Ave. recall (exact): 0.76
Ave. precision (exact): 0.35
Ave. LLM time (per model): 16.42
Ave. LLM time (per element, per model): 0.54
Average number of predictions per species: 1.14


In [11]:
# Summary statistics for the "meta-llama" direct-method results CSV
meta_llama_results_csv = os.path.join(output_dir, "biomd_chebi_direct_meta-llama_nosymbols.csv")
print_evaluation_results(meta_llama_results_csv)

Number of models assessed: 340
Number of models with predictions: 324
Average accuracy (per model): 0.70
Ave. total time (per model): 5.60
Ave. total time (per element, per model): 0.18
Ave. recall (formula): 0.69
Ave. precision (formula): 0.61
Ave. recall (exact): 0.00
Ave. precision (exact): 0.00
Ave. LLM time (per model): 4.27
Ave. LLM time (per element, per model): 0.13
Average number of predictions per species: 1.26
