# AAAIM Evaluation Test

This notebook tests both single model evaluation and batch evaluation of multiple models.

In [1]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging

# Make the repository root (the parent of the current working directory) importable
project_root = Path.cwd().parent
sys.path.insert(0, os.fspath(project_root))

# Import AAAIM functions
from core import annotate_model, curate_model
from core.database_search import force_clear_chromadb, get_species_recommendations_rag
from utils.evaluation import (
    evaluate_single_model,
    evaluate_models_in_folder,
    print_evaluation_results,
    compare_results,
    process_saved_llm_responses
)

# Logging: INFO level, each record rendered as "<time> - <level> - <message>"
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# LLM used by the cells below (alternatives kept for quick switching)
llm_model = "meta-llama/llama-3.3-70b-instruct:free"
# llm_model = "meta-llama/llama-3.3-70b-instruct"
# llm_model = "gpt-4.1-nano"

# Evaluation parameters:
#   max_entities_per_model - limit entities per model for testing
#   num_models_to_test     - number of models to test in batch evaluation
max_entities_per_model, num_models_to_test = 10, 5

# Entity and database configuration
entity_type, database = "chemical", "chebi"

# Output directory for results
output_dir = "./results/"

### Test function

In [2]:
# Test data: each chemical species mapped to the synonyms used for the lookup.
synonyms_dict = {
    "glucose": ["glucose", "dextrose", "D-glucose"],
    "caffeine": ["caffeine", "1,3,7-trimethylxanthine"],
    "aspirin": ["aspirin", "acetylsalicylic acid", "ASA"]
}
# The species to query are the dictionary keys, in insertion order.
species_ids = list(synonyms_dict)

print("Testing RAG-based entity linking...")
print("=" * 50)

# (label, attribute) pairs printed for every recommendation, in display order.
# The leading newline on the first label separates consecutive species.
report_fields = (
    ("\nSpecies", "id"),
    ("Synonyms", "synonyms"),
    ("Candidates", "candidates"),
    ("Names", "candidate_names"),
    ("Match scores (similarity)", "match_score"),
)

try:
    # Query the RAG search for the top 3 candidates per species
    rag_recommendations = get_species_recommendations_rag(
        species_ids=species_ids,
        synonyms_dict=synonyms_dict,
        model_type="default",
        top_k=3
    )

    for recommendation in rag_recommendations:
        for label, attribute in report_fields:
            print(f"{label}: {getattr(recommendation, attribute)}")

except Exception as e:
    # Best-effort smoke test: report the failure and let the notebook continue
    print(f"RAG search failed: {e}")

Testing RAG-based entity linking...
INFO Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
INFO Using sentence transformer all-MiniLM-L6-v2 model
INFO Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO Using RAG embeddings from collection 'chebi_default_numonly' with default model


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Species: glucose
Synonyms: ['glucose', 'dextrose', 'D-glucose']
Candidates: ['4167', '42758', '17234']
Names: ['glucose', 'glucose', 'glucose']
Match scores (similarity): [1.0, 1.0, 1.0]

Species: caffeine
Synonyms: ['caffeine', '1,3,7-trimethylxanthine']
Candidates: ['27732', '17395', '18123']
Names: ['caffeine', 'caffeate', 'caffearine']
Match scores (similarity): [1.0, 0.745, 0.687]

Species: aspirin
Synonyms: ['aspirin', 'acetylsalicylic acid', 'ASA']
Candidates: ['15365', '182973', '233566']
Names: ['aspirin', 'no-aspirin', 'aspirin softam']
Match scores (similarity): [1.0, 0.856, 0.716]


## Annotating a new model with no or few existing annotations

In [2]:
test_model_file = "190_few_anno.xml"
# Report whether the test model is present on disk
model_found = os.path.exists(test_model_file)
print(f"{'✓ Test model found' if model_found else '✗ Test model not found'}: {test_model_file}")

✓ Test model found: 190_few_anno.xml


In [3]:
# Annotate a single model: gather the call options first, then run the annotation
annotate_options = dict(
    model_file=test_model_file,
    llm_model=llm_model,
    method="rag",
    max_entities=max_entities_per_model,
    entity_type=entity_type,
    database=database,
)
recommendations_df, metrics = annotate_model(**annotate_options)

INFO Starting annotation for model: 190_few_anno.xml
INFO Using LLM model: meta-llama/llama-3.3-70b-instruct:free
INFO Using method: rag for database search
INFO Entity type: chemical, Database: chebi
INFO Step 1: Getting all species from model...
INFO Found 11 species in model
INFO Found 2 entities with existing annotations
INFO Selected 10 entities for annotation
INFO Step 4: Extracting model context...
INFO Extracted context for model: Rodriguez-Caso2006_Polyamine_Metabolism
INFO Step 5: Formatting LLM prompt...
INFO Step 6: Querying LLM (meta-llama/llama-3.3-70b-instruct:free)...
INFO HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO LLM response received in 5.69s
INFO Step 7: Parsing LLM response...
INFO Parsed synonyms for 11 entities
INFO Step 8: Searching chebi database...
INFO Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
INFO Using sentence transformer all-MiniLM-L6-v2 mod

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO Database search completed in 6.39s
INFO Step 9: Generating recommendation table...
INFO Annotation completed in 12.17s
INFO Generated 50 recommendations


In [4]:
# Display the recommendation table currently bound to recommendations_df
recommendations_df

Unnamed: 0,file,type,id,display_name,annotation,annotation_label,match_score,existing,update_annotation
0,190_few_anno.xml,chemical,SAM,S-adenosyl-L-methionine,15414,s-adenosylmethionine,1.0,1,keep
1,190_few_anno.xml,chemical,SAM,S-adenosyl-L-methionine,67040,s-adenosylmethionine,1.0,0,ignore
2,190_few_anno.xml,chemical,SAM,S-adenosyl-L-methionine,33442,s-adenosylmethionine,1.0,0,ignore
3,190_few_anno.xml,chemical,SAM,S-adenosyl-L-methionine,59789,s-adenosylmethioninamine,1.0,0,ignore
4,190_few_anno.xml,chemical,SAM,S-adenosyl-L-methionine,142094,s-adenosylmethioninaminium,0.935,0,ignore
5,190_few_anno.xml,chemical,A,S-adenosylmethioninamine,15625,decarboxylated s-adenosylmethionine,1.0,1,keep
6,190_few_anno.xml,chemical,A,S-adenosylmethioninamine,67040,decarboxylated s-adenosylmethionine(2+),0.926,0,ignore
7,190_few_anno.xml,chemical,A,S-adenosylmethioninamine,33442,s-adenosylmethionine decarboxylase inhibitor,0.926,0,ignore
8,190_few_anno.xml,chemical,A,S-adenosylmethioninamine,15414,s-adenosylmethionine,0.926,0,ignore
9,190_few_anno.xml,chemical,A,S-adenosylmethioninamine,57443,s-adenosylmethionine,0.907,0,ignore


In [8]:
# Display the metrics object currently bound to `metrics`
metrics

{'total_entities': 11,
 'entities_with_predictions': 10,
 'annotation_rate': 0.9090909090909091,
 'total_predictions': 95,
 'matches': 2,
 'accuracy': 1.0,
 'total_time': 15.505661964416504,
 'llm_time': 8.263680934906006,
 'search_time': 7.165221929550171}

## Curate a model with existing annotations

Curate a single model that already has annotations.
Only the species with existing annotations are considered.

In [4]:
test_model_file = "test_models/BIOMD0000000190.xml"
# Report whether the test model is present on disk
model_found = os.path.exists(test_model_file)
print(f"{'✓ Test model found' if model_found else '✗ Test model not found'}: {test_model_file}")

✓ Test model found: test_models/BIOMD0000000190.xml


In [8]:
# Curate a single model: gather the call options first, then run the curation
curate_options = dict(
    model_file=test_model_file,
    llm_model=llm_model,
    method="rag",
    max_entities=max_entities_per_model,
    entity_type=entity_type,
    database=database,
)
recommendations_df, metrics = curate_model(**curate_options)

2025-05-30 15:44:24,060 - INFO - Starting curation for model: test_models/BIOMD0000000190.xml
2025-05-30 15:44:24,061 - INFO - Using LLM model: meta-llama/llama-3.3-70b-instruct:free
2025-05-30 15:44:24,061 - INFO - Using method: rag for database search
2025-05-30 15:44:24,062 - INFO - Entity type: chemical, Database: chebi
2025-05-30 15:44:24,062 - INFO - Step 1: Finding existing annotations...
2025-05-30 15:44:24,078 - INFO - Found 11 entities with existing annotations
2025-05-30 15:44:24,078 - INFO - Selected 10 entities for evaluation
2025-05-30 15:44:24,078 - INFO - Step 3: Extracting model context...
2025-05-30 15:44:24,126 - INFO - Extracted context for model: Model_1
2025-05-30 15:44:24,126 - INFO - Step 4: Formatting LLM prompt...
2025-05-30 15:44:24,157 - INFO - Step 5: Querying LLM (meta-llama/llama-3.3-70b-instruct:free)...
2025-05-30 15:44:26,019 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 15:44:30,760 - INFO - LLM

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-30 15:44:31,329 - INFO - Database search completed in 0.57s
2025-05-30 15:44:31,330 - INFO - Step 8: Generating recommendation table...
2025-05-30 15:44:31,333 - INFO - Curation completed in 7.27s
2025-05-30 15:44:31,334 - INFO - Generated 50 recommendations


In [10]:
# Display the metrics object currently bound to `metrics`
metrics

{'total_entities': 11,
 'entities_with_predictions': 10,
 'annotation_rate': 0.9090909090909091,
 'total_predictions': 50,
 'matches': 10,
 'accuracy': 1.0,
 'total_time': 7.271723985671997,
 'llm_time': 6.602231740951538,
 'search_time': 0.5678610801696777}

In [9]:
# Display the recommendation table currently bound to recommendations_df
recommendations_df

Unnamed: 0,file,type,id,display_name,annotation,annotation_label,match_score,existing,update_annotation
0,BIOMD0000000190.xml,chemical,SAM,S-adenosyl-L-methionine,CHEBI:15414,s-adenosylmethionine,1.0,1,keep
1,BIOMD0000000190.xml,chemical,SAM,S-adenosyl-L-methionine,CHEBI:67040,s-adenosylmethionine,1.0,0,ignore
2,BIOMD0000000190.xml,chemical,SAM,S-adenosyl-L-methionine,CHEBI:33442,s-adenosylmethionine,1.0,0,ignore
3,BIOMD0000000190.xml,chemical,SAM,S-adenosyl-L-methionine,CHEBI:15625,s-adenosylmethioninamine,0.926394,0,ignore
4,BIOMD0000000190.xml,chemical,SAM,S-adenosyl-L-methionine,CHEBI:57443,s-adenosylmethioninaminium,0.806852,0,ignore
5,BIOMD0000000190.xml,chemical,A,S-adenosylmethioninamine,CHEBI:15625,decarboxylated s-adenosylmethionine,1.0,1,keep
6,BIOMD0000000190.xml,chemical,A,S-adenosylmethioninamine,CHEBI:33442,decarboxylated s-adenosylmethionine(2+),0.926394,0,ignore
7,BIOMD0000000190.xml,chemical,A,S-adenosylmethioninamine,CHEBI:15414,s-adenosylmethionine decarboxylase inhibitor,0.926394,0,ignore
8,BIOMD0000000190.xml,chemical,A,S-adenosylmethioninamine,CHEBI:67040,s-adenosylmethionine,0.926394,0,ignore
9,BIOMD0000000190.xml,chemical,A,S-adenosylmethioninamine,CHEBI:57443,s-adenosylmethionine,0.907145,0,ignore


## Test 1: Single Model Evaluation

Evaluation of a single model with existing annotations.

In [5]:
# Test using utils evaluation function.
# The BioModels folder defaults to the original author's local checkout; set the
# AAAIM_MODEL_DIR environment variable to point at another copy instead of
# editing this cell.
biomodels_dir = os.environ.get(
    "AAAIM_MODEL_DIR",
    "/Users/luna/Desktop/CRBM/AMAS_proj/Models/BioModels",
)
test_model_file = os.path.join(biomodels_dir, "BIOMD0000000013.xml")

result_df = evaluate_single_model(
    model_file=test_model_file,
    llm_model=llm_model,
    method='rag',
    max_entities=max_entities_per_model,
    entity_type=entity_type,
    database=database,
    save_llm_results=False,  # do not persist the LLM output for this single-model run
    verbose=True
)

2025-06-02 22:47:31,942 - INFO - Evaluating model: BIOMD0000000013.xml
2025-06-02 22:47:31,973 - INFO - Evaluating 10 entities in BIOMD0000000013.xml
2025-06-02 22:47:36,047 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 22:47:36,075 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-06-02 22:47:36,342 - INFO - Using sentence transformer all-MiniLM-L6-v2 model
2025-06-02 22:47:37,993 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-06-02 22:47:39,415 - INFO - Using RAG embeddings from collection 'chebi_default' with default model


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
# Display the per-species evaluation table returned by evaluate_single_model
result_df

Unnamed: 0,model,species_id,display_name,synonyms_LLM,reason,exist_annotation_chebi,exist_annotation_name,predictions,predictions_names,match_score,recall_formula,precision_formula,recall_chebi,precision_chebi,accuracy,total_time,llm_time,query_time
0,BIOMD0000000013.xml,x_CO2,carbon dioxide,"[carbon dioxide, CO2, dioxide]",All are common biochemical metabolites involve...,[CHEBI:16526],carbon dioxide,"[CHEBI:16526, CHEBI:29325, CHEBI:48828, CHEBI:...","[carbon dioxide, oxidooxomethyl, cobalt(2+), p...","[1.0, 0.81, 0.803, 0.777, 0.676]",1.0,0.333,1.0,0.2,1,9.40949,3.988773,5.420717
1,BIOMD0000000013.xml,ATP_ch,ATP,"[ATP, adenosine triphosphate, adenosine tripho...",All are common biochemical metabolites involve...,[CHEBI:15422],ATP,"[CHEBI:15422, CHEBI:30616, CHEBI:62910, CHEBI:...","[ATP, ATP(4-), ethyl-ATP, P(1),P(3)-bis(5'-ade...","[1.0, 1.0, 0.865, 0.848, 0.737]",1.0,0.333,1.0,0.2,1,9.40949,3.988773,5.420717
2,BIOMD0000000013.xml,x_NADPH_ch,NADPH,"[NADPH, nicotinamide adenine dinucleotide phos...",All are common biochemical metabolites involve...,[CHEBI:16474],NADPH,"[CHEBI:57783, CHEBI:16474, CHEBI:16908, CHEBI:...","[NADPH(4-), NADPH, NADH, NADP, NADP(+)]","[1.0, 1.0, 0.909, 0.888, 0.881]",1.0,0.5,1.0,0.2,1,9.40949,3.988773,5.420717
3,BIOMD0000000013.xml,GAP_ch,glyceraldehyde-3-phosphate,"[glyceraldehyde-3-phosphate, G3P, triose phosp...",All are common biochemical metabolites involve...,[CHEBI:17138],glyceraldehyde 3-phosphate,"[CHEBI:17138, CHEBI:29052, CHEBI:58027, CHEBI:...","[glyceraldehyde 3-phosphate, D-glyceraldehyde ...","[1.0, 0.992, 0.992, 0.744, 0.711]",1.0,0.333,1.0,0.2,1,9.40949,3.988773,5.420717
4,BIOMD0000000013.xml,Pi_ch,inorganic phosphate,"[inorganic phosphate, Pi, phosphate]",All are common biochemical metabolites involve...,[CHEBI:18367],phosphate(3-),"[CHEBI:24838, CHEBI:28874, CHEBI:35780, CHEBI:...","[inorganic phosphate, phosphatidylinositol, ph...","[1.0, 1.0, 1.0, 1.0, 1.0]",0.0,0.0,0.0,0.0,0,9.40949,3.988773,5.420717
5,BIOMD0000000013.xml,DHAP_ch,dihydroxyacetone phosphate,"[dihydroxyacetone phosphate, DHAP, triose phos...",All are common biochemical metabolites involve...,[CHEBI:16108],dihydroxyacetone phosphate,"[CHEBI:16108, CHEBI:57642, CHEBI:15362, CHEBI:...","[dihydroxyacetone phosphate, glycerone phospha...","[1.0, 1.0, 0.801, 0.801, 0.744]",1.0,0.333,1.0,0.2,1,9.40949,3.988773,5.420717
6,BIOMD0000000013.xml,FBP_ch,"fructose-1,6-bisphosphate","[fructose-1,6-bisphosphate, FBP, fructose bisp...",All are common biochemical metabolites involve...,[CHEBI:28013],"beta-D-fructofuranose 1,6-bisphosphate","[CHEBI:16905, CHEBI:78682, CHEBI:49299, CHEBI:...","[keto-D-fructose 1,6-bisphosphate, D-fructose ...","[0.883, 0.883, 0.883, 0.836, 0.679]",1.0,1.0,0.0,0.0,1,9.40949,3.988773,5.420717
7,BIOMD0000000013.xml,F6P_ch,fructose-6-phosphate,"[fructose-6-phosphate, F6P, fructose phosphate]",All are common biochemical metabolites involve...,[CHEBI:57634],beta-D-fructofuranose 6-phosphate(2-),"[CHEBI:15946, CHEBI:16084, CHEBI:88003, CHEBI:...","[keto-D-fructose 6-phosphate, beta-D-fructofur...","[1.0, 1.0, 0.987, 0.899, 0.877]",1.0,0.5,0.0,0.0,1,9.40949,3.988773,5.420717
8,BIOMD0000000013.xml,E4P_ch,erythrose-4-phosphate,"[erythrose-4-phosphate, E4P, tetrose phosphate]",All are common biochemical metabolites involve...,[CHEBI:48153],D-erythrose 4-phosphate,"[CHEBI:4116, CHEBI:23956, CHEBI:16897, CHEBI:4...","[D-erythrulose 4-phosphate, erythrose phosphat...","[0.837, 0.825, 0.784, 0.784, 0.655]",1.0,1.0,1.0,0.2,1,9.40949,3.988773,5.420717
9,BIOMD0000000013.xml,X5P_ch,xylulose-5-phosphate,"[xylulose-5-phosphate, X5P, pentose phosphate]",All are common biochemical metabolites involve...,[CHEBI:16332],D-xylulose 5-phosphate,"[CHEBI:84055, CHEBI:27354, CHEBI:16593, CHEBI:...","[pentose phosphate, xylulose 5-phosphate, L-xy...","[1.0, 0.988, 0.863, 0.863, 0.824]",1.0,1.0,1.0,0.2,1,9.40949,3.988773,5.420717


## Test 2: Batch Model Evaluation

Test the evaluation of multiple models in a directory.

In [3]:
# Folder holding the BioModels XML files. Defaults to the original author's
# local path; set the AAAIM_MODEL_DIR environment variable to override it.
model_dir = os.environ.get(
    "AAAIM_MODEL_DIR",
    "/Users/luna/Desktop/CRBM/AMAS_proj/Models/BioModels",
)
# model_dir = "test_models"

# Check that the model directory exists and count its XML files.
# isdir (rather than exists) keeps os.listdir from being called on a plain file.
if os.path.isdir(model_dir):
    # Sorted so the listing is deterministic across platforms and runs
    model_files = sorted(f for f in os.listdir(model_dir) if f.endswith('.xml'))
    print(f"✓ Model directory found: {model_dir}")
    print(f"  - Found {len(model_files)} XML files")
    # print(f"  - Will test first {min(num_models_to_test, len(model_files))} models")
else:
    # Keep model_files defined and report the problem instead of staying silent
    model_files = []
    print(f"✗ Model directory not found: {model_dir}")

✓ Model directory found: /Users/luna/Desktop/CRBM/AMAS_proj/Models/BioModels
  - Found 1063 XML files


In [None]:
# Run batch evaluation over the models in model_dir
batch_results_df = evaluate_models_in_folder(
    model_dir=model_dir,
    # num_models=min(num_models_to_test, len(model_files)),
    llm_model=llm_model,
    method="rag",
    # max_entities=max_entities_per_model,
    entity_type=entity_type,
    database=database,
    save_llm_results=True,
    output_dir=output_dir,
    output_file="biomd_chebi_rag_llama-extended_default.csv",
    start_at=1,
    verbose=False
)

# Summarise the run. A missing (None) or empty result is reported as a failure
# rather than raising an AttributeError on `.empty`.
if batch_results_df is None or batch_results_df.empty:
    print("✗ Batch evaluation failed: No results generated")
else:
    print("✓ Batch evaluation successful")
    print(f"  - Evaluated {batch_results_df['model'].nunique()} models")
    print(f"  - Generated {len(batch_results_df)} total result rows")
    print(f"  - Average accuracy: {batch_results_df['accuracy'].mean():.1%}")

    # Show updated metrics; each metric is printed only when its column is
    # present, so all four are guarded the same way.
    print("\n  Updated metrics summary:")
    metric_columns = (
        ("recall (formula)", "recall_formula"),
        ("precision (formula)", "precision_formula"),
        ("recall (ChEBI)", "recall_chebi"),
        ("precision (ChEBI)", "precision_chebi"),
    )
    for label, column in metric_columns:
        if column in batch_results_df.columns:
            print(f"  - Average {label}: {batch_results_df[column].mean():.3f}")

    # Show the first three rows of the LLM synonyms and of the match scores
    preview = batch_results_df.head(3)
    sample_sections = (
        ("Sample LLM results", "synonyms_LLM"),
        ("Sample match scores", "match_score"),
    )
    for title, column in sample_sections:
        print(f"\n  {title}:")
        if column in preview.columns:
            for species_id, value in zip(preview['species_id'], preview[column]):
                print(f"    {species_id}: {value}")

## Test 3: Evaluating previously saved LLM synonyms

In [3]:
# Re-process saved LLM responses using the "direct" search method.
# The two absolute locations default to the original author's machine; override
# them with the AAAIM_LLM_RESPONSE_DIR / AAAIM_MODEL_DIR environment variables
# instead of editing this cell.
saved_response_dir = os.environ.get(
    "AAAIM_LLM_RESPONSE_DIR",
    '/Users/luna/Desktop/CRBM/AMAS_proj/AAAIM/tests/results/llama-3.3-70b-instruct',
)
saved_model_dir = os.environ.get(
    "AAAIM_MODEL_DIR",
    '/Users/luna/Desktop/CRBM/AMAS_proj/Models/BioModels',
)

results_df = process_saved_llm_responses(
    response_folder=saved_response_dir,
    model_dir=saved_model_dir,
    prev_results_csv='results/biomd_chebi_rag_llama_default.csv',
    method="direct",
    output_dir='./results/',
    output_file='biomd_chebi_direct_llama_nosymbols.csv',
    verbose=False,
)

Processing 1/343: BIOMD0000000291.txt
Processing 2/343: BIOMD0000000246.txt
Processing 3/343: BIOMD0000000508.txt
Processing 4/343: BIOMD0000000087.txt
Processing 5/343: BIOMD0000000736.txt
Processing 6/343: BIOMD0000000050.txt
Processing 7/343: BIOMD0000000044.txt
Processing 8/343: BIOMD0000000859.txt
Processing 9/343: BIOMD0000000642.txt
Processing 10/343: BIOMD0000000124.txt
Processing 11/343: BIOMD0000000497.txt
Processing 12/343: BIOMD0000000326.txt
Processing 13/343: BIOMD0000000468.txt
Processing 14/343: BIOMD0000000469.txt
Processing 15/343: BIOMD0000001005.txt
Processing 16/343: BIOMD0000000327.txt
Processing 17/343: BIOMD0000000496.txt
Processing 18/343: BIOMD0000000482.txt
Processing 19/343: BIOMD0000000119.txt
Processing 20/343: BIOMD0000000858.txt
Processing 21/343: BIOMD0000000045.txt
Processing 22/343: BIOMD0000000737.txt
Processing 23/343: BIOMD0000000051.txt
Processing 24/343: BIOMD0000000086.txt
Processing 25/343: BIOMD0000000509.txt
Processing 26/343: BIOMD0000000521

## Statistics

In [5]:
# Print summary statistics for the results CSV produced with method="direct"
print_evaluation_results('results/biomd_chebi_direct_llama_nosymbols.csv') 

Number of models assessed: 315
Number of models with predictions: 307
Average accuracy (per model): 0.82
Ave. total time (per model): 17.73
Ave. total time (per element, per model): 0.58
Ave. LLM time (per model): 16.42
Ave. LLM time (per element, per model): 0.54
Average number of predictions per species: 1.55


In [4]:
# Print summary statistics for the earlier RAG results CSV (llama run, per the file name)
print_evaluation_results('results/biomd_chebi_rag_llama_default.csv')

Number of models assessed: 340
Number of models with predictions: 338
Average accuracy (per model): 0.78
Ave. total time (per model): 24.09
Ave. total time (per element, per model): 0.76
Ave. LLM time (per model): 23.44
Ave. LLM time (per element, per model): 0.74
Average number of predictions per species: 4.64


In [6]:
# Print summary statistics for the RAG results CSV of the gpt-4.1-nano run (per the file name)
print_evaluation_results('results/biomd_chebi_rag_gpt-4.1-nano_default.csv')

Number of models assessed: 340
Number of models with predictions: 334
Average accuracy (per model): 0.74
Ave. total time (per model): 6.19
Ave. total time (per element, per model): 0.20
Ave. LLM time (per model): 5.60
Ave. LLM time (per element, per model): 0.18
Average number of predictions per species: 4.81


In [4]:
# Result files to compare, passed positionally in this order
result_csvs = [
    'results/biomd_chebi_direct_llama_nosymbols.csv',
    'results/biomd_chebi_rag_llama_default.csv',
    'results/biomd_chebi_rag_gpt-4.1-nano_default.csv',
]
compare_results(*result_csvs)

COMPARISON OF RESULTS (filtered to common models/species)

RESULTS FOR: results/biomd_chebi_direct_llama_nosymbols.csv
Number of models assessed: 315
Number of models with predictions: 307
Average accuracy (per model): 0.82
Ave. total time (per model): 17.73
Ave. total time (per element, per model): 0.58
Ave. LLM time (per model): 16.42
Ave. LLM time (per element, per model): 0.54
Average number of predictions per species: 1.55

RESULTS FOR: results/biomd_chebi_rag_llama_default.csv
Number of models assessed: 315
Number of models with predictions: 313
Average accuracy (per model): 0.86
Ave. total time (per model): 25.20
Ave. total time (per element, per model): 0.82
Ave. LLM time (per model): 24.52
Ave. LLM time (per element, per model): 0.80
Average number of predictions per species: 4.71

RESULTS FOR: results/biomd_chebi_rag_gpt-4.1-nano_default.csv
Number of models assessed: 315
Number of models with predictions: 309
Average accuracy (per model): 0.82
Ave. total time (per model): 6.

In [9]:
# Write gpt_df_filtered to CSV without the index column.
# NOTE(review): gpt_df_filtered is not assigned in any cell visible in this
# notebook, so this cell presumably relies on leftover kernel state and would
# raise NameError after Restart & Run All — confirm where the frame was built
# and restore that cell.
gpt_df_filtered.to_csv('results/biomd_chebi_rag_gpt-4.1-nano_default_filtered.csv', index=False)

In [7]:
# Models present in the current batch results but absent from the earlier results file
prev_df = pd.read_csv('results/biomd_chebi_rag_llama_default.csv')
prev_models = set(prev_df['model'].unique())
new_models = set(batch_results_df['model'].unique()).difference(prev_models)
new_models

{'BIOMD0000000019.xml',
 'BIOMD0000000049.xml',
 'BIOMD0000000072.xml',
 'BIOMD0000000164.xml',
 'BIOMD0000000174.xml',
 'BIOMD0000000201.xml',
 'BIOMD0000000264.xml',
 'BIOMD0000000338.xml',
 'BIOMD0000000339.xml',
 'BIOMD0000000340.xml',
 'BIOMD0000000344.xml',
 'BIOMD0000000399.xml',
 'BIOMD0000000452.xml',
 'BIOMD0000000453.xml',
 'BIOMD0000000466.xml',
 'BIOMD0000000521.xml',
 'BIOMD0000000617.xml',
 'BIOMD0000000637.xml',
 'BIOMD0000000638.xml',
 'BIOMD0000000699.xml',
 'BIOMD0000000701.xml',
 'BIOMD0000000702.xml',
 'BIOMD0000000826.xml',
 'BIOMD0000000927.xml',
 'BIOMD0000000932.xml'}